├── .gitignore ├── .pylintrc ├── LICENSE ├── PG ├── 1-REINFORCE │ ├── config.py │ ├── memory.py │ ├── model.py │ └── train.py ├── 2-Actor-Critic │ ├── config.py │ ├── model.py │ └── train.py ├── 3-Advantage-Actor-Critic │ ├── config.py │ ├── model.py │ └── train.py ├── 4-GAE │ ├── config.py │ ├── memory.py │ ├── model.py │ └── train.py ├── 5-TNPG │ ├── config.py │ ├── memory.py │ ├── model.py │ └── train.py ├── 6-TRPO │ ├── config.py │ ├── memory.py │ ├── model.py │ └── train.py └── 7-PPO │ ├── config.py │ ├── memory.py │ ├── model.py │ └── train.py ├── POMDP ├── 0-DQN │ ├── config.py │ ├── memory.py │ ├── model.py │ └── train.py ├── 1-DRQN │ ├── config.py │ ├── memory.py │ ├── model.py │ └── train.py ├── 2-DRQN-Stack │ ├── config.py │ ├── memory.py │ ├── model.py │ └── train.py ├── 3-DRQN-Store-State │ ├── config.py │ ├── memory.py │ ├── model.py │ └── train.py └── 4-R2D2-Single │ ├── config.py │ ├── memory.py │ ├── model.py │ └── train.py ├── README.md ├── distributional ├── 1-QR-DQN │ ├── config.py │ ├── memory.py │ ├── model.py │ └── train.py └── 2-IQN │ ├── config.py │ ├── memory.py │ ├── model.py │ └── train.py ├── parallel ├── 1-Async-Q-Learning │ ├── config.py │ ├── memory.py │ ├── model.py │ ├── shared_adam.py │ ├── train.py │ └── worker.py ├── 2-A3C │ ├── config.py │ ├── memory.py │ ├── model.py │ ├── shared_adam.py │ ├── train.py │ └── worker.py ├── 3-ACER │ ├── config.py │ ├── memory.py │ ├── model.py │ ├── shared_adam.py │ ├── train.py │ └── worker.py └── 5-ApeX │ ├── config.py │ ├── memory.py │ ├── model.py │ ├── train.py │ └── worker.py └── rainbow ├── 1-dqn ├── config.py ├── memory.py ├── model.py └── train.py ├── 2-DoubleDQN ├── README-KR.md ├── config.py ├── memory.py ├── model.py └── train.py ├── 3-DuelDQN ├── README-KR.md ├── Screenshot2018-11-1519-f88e4bf3-f581-4b24-a07e-af467a4bba64.14.23.png ├── config.py ├── memory.py ├── model.py └── train.py ├── 4-multistep ├── config.py ├── memory.py ├── model.py └── train.py ├── 5-per ├── README-KR.md ├── Screenshot2018-11-1514-a431e580-fd9d-4a07-afd1-5f80e0042c23.45.16.png ├── config.py ├── memory.py ├── model.py └── train.py ├── 6-Nosiy_net ├── README-KR.md ├── Screenshot2018-11-1616-fd936286-4e40-4962-99ff-1ddd3b7deeb8.36.21.png ├── config.py ├── memory.py ├── model.py └── train.py ├── 7-distributional_c51 ├── config.py ├── memory.py ├── model.py └── train.py ├── 8-Not_Distributional ├── README-KR.md ├── config.py ├── memory.py ├── model.py └── train.py └── 9-Rainbow ├── README-KR.md ├── config.py ├── memory.py ├── model.py └── train.py /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/macos,python 3 | # Edit at https://www.gitignore.io/?templates=macos,python 4 | 5 | ### macOS ### 6 | # General 7 | .DS_Store 8 | .AppleDouble 9 | .LSOverride 10 | 11 | # Icon must end with two \r 12 | Icon 13 | 14 | # Thumbnails 15 | ._* 16 | 17 | # Files that might appear in the root of a volume 18 | .DocumentRevisions-V100 19 | .fseventsd 20 | .Spotlight-V100 21 | .TemporaryItems 22 | .Trashes 23 | .VolumeIcon.icns 24 | .com.apple.timemachine.donotpresent 25 | 26 | # Directories potentially created on remote AFP share 27 | .AppleDB 28 | .AppleDesktop 29 | Network Trash Folder 30 | Temporary Items 31 | .apdisk 32 | 33 | ### Python ### 34 | # Byte-compiled / optimized / DLL files 35 | __pycache__/ 36 | *.py[cod] 37 | *$py.class 38 | logs/ 39 | 40 | # C extensions 41 | *.so 42 | 43 | # Distribution / packaging 44 | .Python 45 | build/ 46 | 
develop-eggs/ 47 | dist/ 48 | downloads/ 49 | eggs/ 50 | .eggs/ 51 | lib/ 52 | lib64/ 53 | parts/ 54 | sdist/ 55 | var/ 56 | wheels/ 57 | *.egg-info/ 58 | .installed.cfg 59 | *.egg 60 | MANIFEST 61 | 62 | # PyInstaller 63 | # Usually these files are written by a python script from a template 64 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 65 | *.manifest 66 | *.spec 67 | 68 | # Installer logs 69 | pip-log.txt 70 | pip-delete-this-directory.txt 71 | 72 | # Unit test / coverage reports 73 | htmlcov/ 74 | .tox/ 75 | .nox/ 76 | .coverage 77 | .coverage.* 78 | .cache 79 | nosetests.xml 80 | coverage.xml 81 | *.cover 82 | .hypothesis/ 83 | .pytest_cache/ 84 | 85 | # Translations 86 | *.mo 87 | *.pot 88 | 89 | # Django stuff: 90 | *.log 91 | local_settings.py 92 | db.sqlite3 93 | 94 | # Flask stuff: 95 | instance/ 96 | .webassets-cache 97 | 98 | # Scrapy stuff: 99 | .scrapy 100 | 101 | # Sphinx documentation 102 | docs/_build/ 103 | 104 | # PyBuilder 105 | target/ 106 | 107 | # Jupyter Notebook 108 | .ipynb_checkpoints 109 | 110 | # IPython 111 | profile_default/ 112 | ipython_config.py 113 | 114 | # pyenv 115 | .python-version 116 | 117 | # celery beat schedule file 118 | celerybeat-schedule 119 | 120 | # SageMath parsed files 121 | *.sage.py 122 | 123 | # Environments 124 | .env 125 | .venv 126 | env/ 127 | venv/ 128 | ENV/ 129 | env.bak/ 130 | venv.bak/ 131 | 132 | # Spyder project settings 133 | .spyderproject 134 | .spyproject 135 | 136 | # Rope project settings 137 | .ropeproject 138 | 139 | # mkdocs documentation 140 | /site 141 | 142 | # mypy 143 | .mypy_cache/ 144 | .dmypy.json 145 | dmypy.json 146 | 147 | # Pyre type checker 148 | .pyre/ 149 | 150 | ### Python Patch ### 151 | .venv/ 152 | 153 | ### Python.VirtualEnv Stack ### 154 | # Virtualenv 155 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ 156 | [Bb]in 157 | [Ii]nclude 158 | [Ll]ib 159 | [Ll]ib64 160 | [Ll]ocal 161 | [Ss]cripts 162 | pyvenv.cfg 163 | pip-selfcheck.json 164 | 165 | # End of https://www.gitignore.io/api/macos,python 166 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Cheol Kang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /PG/1-REINFORCE/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | lr = 0.001 6 | goal_score = 200 7 | log_interval = 10 8 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 9 | -------------------------------------------------------------------------------- /PG/1-REINFORCE/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple, deque 3 | 4 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 5 | 6 | class Memory(object): 7 | def __init__(self): 8 | self.memory = deque() 9 | 10 | def push(self, state, next_state, action, reward, mask): 11 | self.memory.append(Transition(state, next_state, action, reward, mask)) 12 | 13 | def sample(self): 14 | memory = self.memory 15 | return Transition(*zip(*memory)) 16 | 17 | def __len__(self): 18 | return len(self.memory) 19 | -------------------------------------------------------------------------------- /PG/1-REINFORCE/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | from config import gamma 7 | class QNet(nn.Module): 8 | def __init__(self, num_inputs, num_outputs): 9 | super(QNet, self).__init__() 10 | self.num_inputs = num_inputs 11 | self.num_outputs = num_outputs 12 | 13 | self.fc_1 = nn.Linear(num_inputs, 128) 14 | self.fc_2 = nn.Linear(128, num_outputs) 15 | 16 | for m in self.modules(): 17 | if isinstance(m, nn.Linear): 18 | nn.init.xavier_uniform(m.weight) 19 | 20 | def forward(self, input): 21 | x = F.relu(self.fc_1(input)) 22 | policy = F.softmax(self.fc_2(x)) 23 | return policy 24 | 25 | @classmethod 26 | def train_model(cls, net, transitions, optimizer): 27 | states, actions, rewards, masks = transitions.state, transitions.action, transitions.reward, transitions.mask 28 | 29 | states = torch.stack(states) 30 | actions = torch.stack(actions) 31 | rewards = torch.Tensor(rewards) 32 | masks = torch.Tensor(masks) 33 | 34 | returns = torch.zeros_like(rewards) 35 | 36 | running_return = 0 37 | for t in reversed(range(len(rewards))): 38 | running_return = rewards[t] + gamma * running_return * masks[t] 39 | returns[t] = running_return 40 | 41 | policies = net(states) 42 | policies = policies.view(-1, net.num_outputs) 43 | 44 | log_policies = (torch.log(policies) * actions.detach()).sum(dim=1) 45 | 46 | loss = (-log_policies * returns).sum() 47 | 48 | optimizer.zero_grad() 49 | loss.backward() 50 | optimizer.step() 51 | 52 | return loss 53 | 54 | def get_action(self, input): 55 | policy = self.forward(input) 56 | policy = policy[0].data.numpy() 57 | 58 | action = np.random.choice(self.num_outputs, 1, p=policy)[0] 59 | return action 60 | -------------------------------------------------------------------------------- /PG/1-REINFORCE/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import QNet 11 | from tensorboardX import SummaryWriter 12 | 13 | from memory import Memory 14 | from config import env_name, goal_score, 
log_interval, device, lr, gamma 15 | 16 | 17 | def main(): 18 | env = gym.make(env_name) 19 | env.seed(500) 20 | torch.manual_seed(500) 21 | 22 | num_inputs = env.observation_space.shape[0] 23 | num_actions = env.action_space.n 24 | print('state size:', num_inputs) 25 | print('action size:', num_actions) 26 | 27 | net = QNet(num_inputs, num_actions) 28 | 29 | optimizer = optim.Adam(net.parameters(), lr=lr) 30 | writer = SummaryWriter('logs') 31 | 32 | net.to(device) 33 | net.train() 34 | running_score = 0 35 | steps = 0 36 | loss = 0 37 | 38 | for e in range(3000): 39 | done = False 40 | memory = Memory() 41 | 42 | score = 0 43 | state = env.reset() 44 | state = torch.Tensor(state).to(device) 45 | state = state.unsqueeze(0) 46 | 47 | while not done: 48 | steps += 1 49 | 50 | action = net.get_action(state) 51 | next_state, reward, done, _ = env.step(action) 52 | 53 | next_state = torch.Tensor(next_state) 54 | next_state = next_state.unsqueeze(0) 55 | 56 | mask = 0 if done else 1 57 | reward = reward if not done or score == 499 else -1 58 | 59 | action_one_hot = torch.zeros(2) 60 | action_one_hot[action] = 1 61 | memory.push(state, next_state, action_one_hot, reward, mask) 62 | 63 | score += reward 64 | state = next_state 65 | 66 | loss = QNet.train_model(net, memory.sample(), optimizer) 67 | 68 | 69 | score = score if score == 500.0 else score + 1 70 | running_score = 0.99 * running_score + 0.01 * score 71 | if e % log_interval == 0: 72 | print('{} episode | score: {:.2f}'.format( 73 | e, running_score)) 74 | writer.add_scalar('log/score', float(running_score), e) 75 | writer.add_scalar('log/loss', float(loss), e) 76 | 77 | if running_score > goal_score: 78 | break 79 | 80 | 81 | if __name__=="__main__": 82 | main() 83 | -------------------------------------------------------------------------------- /PG/2-Actor-Critic/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | lr = 0.0001 6 | goal_score = 200 7 | log_interval = 10 8 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 9 | -------------------------------------------------------------------------------- /PG/2-Actor-Critic/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | from config import gamma 7 | class QNet(nn.Module): 8 | def __init__(self, num_inputs, num_outputs): 9 | super(QNet, self).__init__() 10 | self.num_inputs = num_inputs 11 | self.num_outputs = num_outputs 12 | 13 | self.fc = nn.Linear(num_inputs, 128) 14 | self.fc_actor = nn.Linear(128, num_outputs) 15 | self.fc_critic = nn.Linear(128, num_outputs) 16 | 17 | for m in self.modules(): 18 | if isinstance(m, nn.Linear): 19 | nn.init.xavier_uniform(m.weight) 20 | 21 | def forward(self, input): 22 | x = F.relu(self.fc(input)) 23 | policy = F.softmax(self.fc_actor(x)) 24 | q_value = self.fc_critic(x) 25 | return policy, q_value 26 | 27 | @classmethod 28 | def train_model(cls, net, optimizer, transition): 29 | state, next_state, action, reward, mask = transition 30 | 31 | policy, q_value = net(state) 32 | policy, q_value = policy.view(-1, net.num_outputs), q_value.view(-1, net.num_outputs) 33 | _, next_q_value = net(next_state) 34 | next_q_value = next_q_value.view(-1, net.num_outputs) 35 | next_action = net.get_action(next_state) 36 | 37 | 38 | target = reward + mask * gamma * 
next_q_value[0][next_action] 39 | 40 | log_policy = torch.log(policy[0])[action] 41 | loss_policy = - log_policy * q_value[0][action].item() 42 | loss_value = F.mse_loss(q_value[0][action], target.detach()) 43 | 44 | loss = loss_policy + loss_value 45 | optimizer.zero_grad() 46 | loss.backward() 47 | optimizer.step() 48 | 49 | return loss 50 | 51 | def get_action(self, input): 52 | policy, _ = self.forward(input) 53 | policy = policy[0].data.numpy() 54 | 55 | action = np.random.choice(self.num_outputs, 1, p=policy)[0] 56 | return action 57 | -------------------------------------------------------------------------------- /PG/2-Actor-Critic/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import QNet 11 | from tensorboardX import SummaryWriter 12 | 13 | from config import env_name, goal_score, log_interval, device, lr 14 | 15 | 16 | def main(): 17 | env = gym.make(env_name) 18 | env.seed(500) 19 | torch.manual_seed(500) 20 | 21 | num_inputs = env.observation_space.shape[0] 22 | num_actions = env.action_space.n 23 | print('state size:', num_inputs) 24 | print('action size:', num_actions) 25 | 26 | net = QNet(num_inputs, num_actions) 27 | 28 | optimizer = optim.Adam(net.parameters(), lr=lr) 29 | writer = SummaryWriter('logs') 30 | 31 | net.to(device) 32 | net.train() 33 | running_score = 0 34 | steps = 0 35 | loss = 0 36 | 37 | for e in range(3000): 38 | done = False 39 | 40 | score = 0 41 | state = env.reset() 42 | state = torch.Tensor(state).to(device) 43 | state = state.unsqueeze(0) 44 | 45 | while not done: 46 | steps += 1 47 | 48 | action = net.get_action(state) 49 | next_state, reward, done, _ = env.step(action) 50 | 51 | next_state = torch.Tensor(next_state) 52 | next_state = next_state.unsqueeze(0) 53 | 54 | mask = 0 if done else 1 55 | reward = reward if not done or score == 499 else -1 56 | transition = [state, next_state, action, reward, mask] 57 | 58 | score += reward 59 | state = next_state 60 | 61 | loss = QNet.train_model(net, optimizer, transition) 62 | 63 | score = score if score == 500.0 else score + 1 64 | running_score = 0.99 * running_score + 0.01 * score 65 | if e % log_interval == 0: 66 | print('{} episode | score: {:.2f}'.format( 67 | e, running_score)) 68 | writer.add_scalar('log/score', float(running_score), e) 69 | writer.add_scalar('log/loss', float(loss), e) 70 | 71 | if running_score > goal_score: 72 | break 73 | 74 | 75 | if __name__=="__main__": 76 | main() 77 | -------------------------------------------------------------------------------- /PG/3-Advantage-Actor-Critic/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | lr = 0.001 6 | goal_score = 200 7 | log_interval = 10 8 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 9 | -------------------------------------------------------------------------------- /PG/3-Advantage-Actor-Critic/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | from config import gamma 7 | class QNet(nn.Module): 8 | def __init__(self, num_inputs, num_outputs): 9 | super(QNet, self).__init__() 10 | self.num_inputs = num_inputs 11 | 
self.num_outputs = num_outputs 12 | 13 | self.fc = nn.Linear(num_inputs, 128) 14 | self.fc_actor = nn.Linear(128, num_outputs) 15 | self.fc_critic = nn.Linear(128, 1) 16 | 17 | for m in self.modules(): 18 | if isinstance(m, nn.Linear): 19 | nn.init.xavier_uniform(m.weight) 20 | 21 | def forward(self, input): 22 | x = F.relu(self.fc(input)) 23 | policy = F.softmax(self.fc_actor(x)) 24 | value = self.fc_critic(x) 25 | return policy, value 26 | 27 | @classmethod 28 | def train_model(cls, net, optimizer, transition): 29 | state, next_state, action, reward, mask = transition 30 | 31 | policy, value = net(state) 32 | policy, value = policy.view(-1, net.num_outputs), value.view(-1) 33 | _, next_value = net(next_state) 34 | next_value = next_value.view(-1) 35 | 36 | target = reward + mask * gamma * next_value[0] 37 | td_error = target - value[0] 38 | 39 | log_policy = torch.log(policy[0])[action] 40 | loss_policy = - log_policy * td_error.item() 41 | loss_value = F.mse_loss(value[0], target.detach()) 42 | entropy = torch.log(policy[0]) * policy[0] 43 | 44 | loss = loss_policy + loss_value - 0.1 * entropy.sum() 45 | optimizer.zero_grad() 46 | loss.backward() 47 | optimizer.step() 48 | 49 | return loss 50 | 51 | def get_action(self, input): 52 | policy, _ = self.forward(input) 53 | policy = policy[0].data.numpy() 54 | 55 | action = np.random.choice(self.num_outputs, 1, p=policy)[0] 56 | return action 57 | -------------------------------------------------------------------------------- /PG/3-Advantage-Actor-Critic/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import QNet 11 | from tensorboardX import SummaryWriter 12 | 13 | from config import env_name, goal_score, log_interval, device, lr 14 | 15 | 16 | def main(): 17 | env = gym.make(env_name) 18 | env.seed(500) 19 | torch.manual_seed(500) 20 | 21 | num_inputs = env.observation_space.shape[0] 22 | num_actions = env.action_space.n 23 | print('state size:', num_inputs) 24 | print('action size:', num_actions) 25 | 26 | net = QNet(num_inputs, num_actions) 27 | 28 | optimizer = optim.Adam(net.parameters(), lr=lr) 29 | writer = SummaryWriter('logs') 30 | 31 | net.to(device) 32 | net.train() 33 | running_score = 0 34 | steps = 0 35 | loss = 0 36 | 37 | for e in range(3000): 38 | done = False 39 | 40 | score = 0 41 | state = env.reset() 42 | state = torch.Tensor(state).to(device) 43 | state = state.unsqueeze(0) 44 | 45 | while not done: 46 | steps += 1 47 | 48 | action = net.get_action(state) 49 | next_state, reward, done, _ = env.step(action) 50 | 51 | next_state = torch.Tensor(next_state) 52 | next_state = next_state.unsqueeze(0) 53 | 54 | mask = 0 if done else 1 55 | reward = reward if not done or score == 499 else -1 56 | transition = [state, next_state, action, reward, mask] 57 | 58 | score += reward 59 | state = next_state 60 | 61 | loss = QNet.train_model(net, optimizer, transition) 62 | 63 | score = score if score == 500.0 else score + 1 64 | running_score = 0.99 * running_score + 0.01 * score 65 | if e % log_interval == 0: 66 | print('{} episode | score: {:.2f}'.format( 67 | e, running_score)) 68 | writer.add_scalar('log/score', float(running_score), e) 69 | writer.add_scalar('log/loss', float(loss), e) 70 | 71 | if running_score > goal_score: 72 | break 73 | 74 | 75 | if __name__=="__main__": 76 | main() 77 | 
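Reference note (a hedged summary of the two train_model methods above, in the code's own terms, writing m_t for the terminal mask): 2-Actor-Critic trains a per-action critic Q(s, a) toward the bootstrapped target r_t + gamma * m_t * Q(s_{t+1}, a_{t+1}), where a_{t+1} is freshly sampled from the current policy, and weights the log-probability of the taken action by the detached Q(s_t, a_t). 3-Advantage-Actor-Critic replaces Q with a scalar state value V(s) and uses the one-step TD error as the advantage:

\[
\delta_t = r_t + \gamma\, m_t\, V(s_{t+1}) - V(s_t), \qquad
\mathcal{L} = -\log \pi_\theta(a_t \mid s_t)\,\delta_t
  + \big(V(s_t) - \operatorname{sg}[\, r_t + \gamma\, m_t\, V(s_{t+1}) \,]\big)^2
  - 0.1 \sum_a \pi_\theta(a \mid s_t) \log \pi_\theta(a \mid s_t),
\]

where sg[.] marks the detached target; delta_t itself is detached (via .item()) before scaling the log-policy term, so the policy-gradient part does not backpropagate through the critic estimate.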
-------------------------------------------------------------------------------- /PG/4-GAE/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | lambda_gae = 0.96 6 | lr = 0.0001 7 | goal_score = 200 8 | log_interval = 10 9 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 10 | 11 | ciritic_coefficient = 0.5 12 | entropy_coefficient = 0.01 -------------------------------------------------------------------------------- /PG/4-GAE/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple, deque 3 | 4 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 5 | 6 | class Memory(object): 7 | def __init__(self): 8 | self.memory = deque() 9 | 10 | def push(self, state, next_state, action, reward, mask): 11 | self.memory.append(Transition(state, next_state, action, reward, mask)) 12 | 13 | def sample(self): 14 | memory = self.memory 15 | return Transition(*zip(*memory)) 16 | 17 | def __len__(self): 18 | return len(self.memory) 19 | -------------------------------------------------------------------------------- /PG/4-GAE/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | from config import gamma, lambda_gae, ciritic_coefficient, entropy_coefficient 7 | 8 | from collections import namedtuple 9 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask', 'value', 'return_value', 'advantage')) 10 | 11 | class GAE(nn.Module): 12 | def __init__(self, num_inputs, num_outputs): 13 | super(GAE, self).__init__() 14 | self.num_inputs = num_inputs 15 | self.num_outputs = num_outputs 16 | 17 | self.fc = nn.Linear(num_inputs, 128) 18 | self.fc_actor = nn.Linear(128, num_outputs) 19 | self.fc_critic = nn.Linear(128, 1) 20 | 21 | for m in self.modules(): 22 | if isinstance(m, nn.Linear): 23 | nn.init.xavier_uniform(m.weight) 24 | 25 | def forward(self, input): 26 | x = F.relu(self.fc(input)) 27 | policy = F.softmax(self.fc_actor(x)) 28 | value = self.fc_critic(x) 29 | return policy, value 30 | 31 | @classmethod 32 | def get_gae(self, values, rewards, masks): 33 | returns = torch.zeros_like(rewards) 34 | advantages = torch.zeros_like(rewards) 35 | 36 | running_return = 0 37 | previous_value = 0 38 | running_advantage = 0 39 | 40 | for t in reversed(range(len(rewards))): 41 | running_return = rewards[t] + gamma * running_return * masks[t] 42 | running_tderror = rewards[t] + gamma * previous_value * masks[t] - values.data[t] 43 | running_advantage = running_tderror + (gamma * lambda_gae) * running_advantage * masks[t] 44 | 45 | returns[t] = running_return 46 | previous_value = values.data[t] 47 | advantages[t] = running_advantage 48 | 49 | return returns, advantages 50 | 51 | @classmethod 52 | def train_model(cls, net, transitions, optimizer): 53 | states, actions, rewards, masks = transitions.state, transitions.action, transitions.reward, transitions.mask 54 | 55 | states = torch.stack(states) 56 | actions = torch.stack(actions) 57 | rewards = torch.Tensor(rewards) 58 | masks = torch.Tensor(masks) 59 | 60 | policies, values = net(states) 61 | policies = policies.view(-1, net.num_outputs) 62 | values = values.view(-1) 63 | 64 | returns, advantages = 
net.get_gae(values.view(-1).detach(), rewards, masks) 65 | 66 | log_policies = (torch.log(policies) * actions.detach()).sum(dim=1) 67 | actor_loss = -(log_policies * advantages).sum() 68 | critic_loss = (returns.detach() - values).pow(2).sum() 69 | 70 | entropy = (torch.log(policies) * policies).sum(1).sum() 71 | 72 | loss = actor_loss + ciritic_coefficient * critic_loss - entropy_coefficient * entropy 73 | optimizer.zero_grad() 74 | loss.backward() 75 | optimizer.step() 76 | 77 | return loss 78 | 79 | def get_action(self, input): 80 | policy, _ = self.forward(input) 81 | policy = policy[0].data.numpy() 82 | 83 | action = np.random.choice(self.num_outputs, 1, p=policy)[0] 84 | return action 85 | -------------------------------------------------------------------------------- /PG/4-GAE/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import GAE 11 | from tensorboardX import SummaryWriter 12 | 13 | from memory import Memory 14 | from config import env_name, goal_score, log_interval, device, lr, gamma 15 | 16 | 17 | def main(): 18 | env = gym.make(env_name) 19 | env.seed(500) 20 | torch.manual_seed(500) 21 | 22 | num_inputs = env.observation_space.shape[0] 23 | num_actions = env.action_space.n 24 | print('state size:', num_inputs) 25 | print('action size:', num_actions) 26 | 27 | net = GAE(num_inputs, num_actions) 28 | 29 | optimizer = optim.Adam(net.parameters(), lr=lr) 30 | writer = SummaryWriter('logs') 31 | 32 | net.to(device) 33 | net.train() 34 | running_score = 0 35 | steps = 0 36 | loss = 0 37 | 38 | for e in range(30000): 39 | done = False 40 | memory = Memory() 41 | 42 | score = 0 43 | state = env.reset() 44 | state = torch.Tensor(state).to(device) 45 | state = state.unsqueeze(0) 46 | 47 | while not done: 48 | steps += 1 49 | 50 | action = net.get_action(state) 51 | next_state, reward, done, _ = env.step(action) 52 | 53 | next_state = torch.Tensor(next_state) 54 | next_state = next_state.unsqueeze(0) 55 | 56 | mask = 0 if done else 1 57 | reward = reward if not done or score == 499 else -1 58 | 59 | action_one_hot = torch.zeros(2) 60 | action_one_hot[action] = 1 61 | memory.push(state, next_state, action_one_hot, reward, mask) 62 | 63 | score += reward 64 | state = next_state 65 | 66 | loss = GAE.train_model(net, memory.sample(), optimizer) 67 | 68 | score = score if score == 500.0 else score + 1 69 | running_score = 0.99 * running_score + 0.01 * score 70 | if e % log_interval == 0: 71 | print('{} episode | score: {:.2f}'.format( 72 | e, running_score)) 73 | writer.add_scalar('log/score', float(running_score), e) 74 | writer.add_scalar('log/loss', float(loss), e) 75 | 76 | if running_score > goal_score: 77 | break 78 | 79 | 80 | if __name__=="__main__": 81 | main() 82 | -------------------------------------------------------------------------------- /PG/5-TNPG/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | lr = 0.001 6 | goal_score = 200 7 | log_interval = 10 8 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 9 | -------------------------------------------------------------------------------- /PG/5-TNPG/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | 
from collections import namedtuple, deque 3 | 4 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 5 | 6 | class Memory(object): 7 | def __init__(self): 8 | self.memory = deque() 9 | 10 | def push(self, state, next_state, action, reward, mask): 11 | self.memory.append(Transition(state, next_state, action, reward, mask)) 12 | 13 | def sample(self): 14 | memory = self.memory 15 | return Transition(*zip(*memory)) 16 | 17 | def __len__(self): 18 | return len(self.memory) 19 | -------------------------------------------------------------------------------- /PG/5-TNPG/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import TNPG 11 | from tensorboardX import SummaryWriter 12 | 13 | from memory import Memory 14 | from config import env_name, goal_score, log_interval, device, lr, gamma 15 | 16 | 17 | def main(): 18 | env = gym.make(env_name) 19 | env.seed(500) 20 | torch.manual_seed(500) 21 | 22 | num_inputs = env.observation_space.shape[0] 23 | num_actions = env.action_space.n 24 | print('state size:', num_inputs) 25 | print('action size:', num_actions) 26 | 27 | net = TNPG(num_inputs, num_actions) 28 | writer = SummaryWriter('logs') 29 | 30 | net.to(device) 31 | net.train() 32 | running_score = 0 33 | steps = 0 34 | loss = 0 35 | for e in range(30000): 36 | done = False 37 | memory = Memory() 38 | 39 | score = 0 40 | state = env.reset() 41 | state = torch.Tensor(state).to(device) 42 | state = state.unsqueeze(0) 43 | 44 | while not done: 45 | steps += 1 46 | 47 | action = net.get_action(state) 48 | next_state, reward, done, _ = env.step(action) 49 | 50 | next_state = torch.Tensor(next_state) 51 | next_state = next_state.unsqueeze(0) 52 | 53 | mask = 0 if done else 1 54 | reward = reward if not done or score == 499 else -1 55 | 56 | action_one_hot = torch.zeros(2) 57 | action_one_hot[action] = 1 58 | memory.push(state, next_state, action_one_hot, reward, mask) 59 | 60 | score += reward 61 | state = next_state 62 | 63 | loss = TNPG.train_model(net, memory.sample()) 64 | 65 | score = score if score == 500.0 else score + 1 66 | running_score = 0.99 * running_score + 0.01 * score 67 | if e % log_interval == 0: 68 | print('{} episode | score: {:.2f}'.format( 69 | e, running_score)) 70 | writer.add_scalar('log/score', float(running_score), e) 71 | writer.add_scalar('log/loss', float(loss), e) 72 | 73 | if running_score > goal_score: 74 | break 75 | 76 | 77 | if __name__=="__main__": 78 | main() 79 | -------------------------------------------------------------------------------- /PG/6-TRPO/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | goal_score = 200 6 | log_interval = 10 7 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 8 | 9 | max_kl = 0.01 10 | -------------------------------------------------------------------------------- /PG/6-TRPO/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple, deque 3 | 4 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 5 | 6 | class Memory(object): 7 | def __init__(self): 8 | self.memory = deque() 9 | 10 | def push(self, state, 
next_state, action, reward, mask): 11 | self.memory.append(Transition(state, next_state, action, reward, mask)) 12 | 13 | def sample(self): 14 | memory = self.memory 15 | return Transition(*zip(*memory)) 16 | 17 | def __len__(self): 18 | return len(self.memory) 19 | -------------------------------------------------------------------------------- /PG/6-TRPO/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import TRPO 11 | from tensorboardX import SummaryWriter 12 | 13 | from memory import Memory 14 | from config import env_name, goal_score, log_interval, device, gamma 15 | 16 | 17 | def main(): 18 | env = gym.make(env_name) 19 | env.seed(500) 20 | torch.manual_seed(500) 21 | 22 | num_inputs = env.observation_space.shape[0] 23 | num_actions = env.action_space.n 24 | print('state size:', num_inputs) 25 | print('action size:', num_actions) 26 | 27 | net = TRPO(num_inputs, num_actions) 28 | writer = SummaryWriter('logs') 29 | 30 | net.to(device) 31 | net.train() 32 | running_score = 0 33 | steps = 0 34 | loss = 0 35 | for e in range(30000): 36 | done = False 37 | memory = Memory() 38 | 39 | score = 0 40 | state = env.reset() 41 | state = torch.Tensor(state).to(device) 42 | state = state.unsqueeze(0) 43 | 44 | while not done: 45 | steps += 1 46 | 47 | action = net.get_action(state) 48 | next_state, reward, done, _ = env.step(action) 49 | 50 | next_state = torch.Tensor(next_state) 51 | next_state = next_state.unsqueeze(0) 52 | 53 | mask = 0 if done else 1 54 | reward = reward if not done or score == 499 else -1 55 | 56 | action_one_hot = torch.zeros(2) 57 | action_one_hot[action] = 1 58 | memory.push(state, next_state, action_one_hot, reward, mask) 59 | 60 | score += reward 61 | state = next_state 62 | 63 | loss = TRPO.train_model(net, memory.sample()) 64 | 65 | score = score if score == 500.0 else score + 1 66 | running_score = 0.99 * running_score + 0.01 * score 67 | if e % log_interval == 0: 68 | print('{} episode | score: {:.2f}'.format( 69 | e, running_score)) 70 | writer.add_scalar('log/score', float(running_score), e) 71 | writer.add_scalar('log/loss', float(loss), e) 72 | 73 | if running_score > goal_score: 74 | break 75 | 76 | 77 | if __name__=="__main__": 78 | main() 79 | -------------------------------------------------------------------------------- /PG/7-PPO/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | lr = 0.001 6 | goal_score = 200 7 | log_interval = 10 8 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 9 | 10 | lambda_gae = 0.96 11 | epsilon_clip = 0.2 12 | ciritic_coefficient = 0.5 13 | entropy_coefficient = 0.01 14 | batch_size = 8 15 | epoch_k = 10 16 | -------------------------------------------------------------------------------- /PG/7-PPO/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple, deque 3 | from config import batch_size 4 | 5 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 6 | 7 | class Memory(object): 8 | def __init__(self): 9 | self.memory = deque() 10 | 11 | def push(self, state, next_state, action, reward, mask): 12 | self.memory.append(Transition(state, 
next_state, action, reward, mask)) 13 | 14 | def sample(self): 15 | memory = self.memory 16 | return Transition(*zip(*memory)) 17 | 18 | def __len__(self): 19 | return len(self.memory) 20 | 21 | class BatchMaker(): 22 | def __init__(self, states, actions, returns, advantages, old_policies): 23 | self.states = states 24 | self.actions = actions 25 | self.returns = returns 26 | self.advantages = advantages 27 | self.old_policies = old_policies 28 | 29 | def sample(self): 30 | sample_indexes = random.sample(range(len(self.states)), batch_size) 31 | states_sample = self.states[sample_indexes] 32 | actions_sample = self.actions[sample_indexes] 33 | retruns_sample = self.returns[sample_indexes] 34 | advantages_sample = self.advantages[sample_indexes] 35 | old_policies_sample = self.old_policies[sample_indexes] 36 | 37 | return states_sample, actions_sample, retruns_sample, advantages_sample, old_policies_sample 38 | 39 | -------------------------------------------------------------------------------- /PG/7-PPO/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | import random 6 | 7 | from memory import BatchMaker 8 | from config import gamma, lambda_gae, epsilon_clip, ciritic_coefficient, entropy_coefficient, epoch_k, batch_size 9 | 10 | import warnings 11 | 12 | 13 | class PPO(nn.Module): 14 | def __init__(self, num_inputs, num_outputs): 15 | super(PPO, self).__init__() 16 | self.t = 0 17 | self.num_inputs = num_inputs 18 | self.num_outputs = num_outputs 19 | 20 | self.fc = nn.Linear(num_inputs, 128) 21 | self.fc_actor = nn.Linear(128, num_outputs) 22 | self.fc_critic = nn.Linear(128, 1) 23 | 24 | for m in self.modules(): 25 | if isinstance(m, nn.Linear): 26 | nn.init.xavier_uniform(m.weight) 27 | 28 | def forward(self, input): 29 | x = torch.relu(self.fc(input)) 30 | policy = F.softmax(self.fc_actor(x), dim=-1) 31 | value = self.fc_critic(x) 32 | return policy, value 33 | 34 | @classmethod 35 | def get_gae(self, values, rewards, masks): 36 | returns = torch.zeros_like(rewards) 37 | advantages = torch.zeros_like(rewards) 38 | 39 | running_return = 0 40 | previous_value = 0 41 | running_advantage = 0 42 | 43 | for t in reversed(range(len(rewards))): 44 | running_return = rewards[t] + gamma * running_return * masks[t] 45 | running_tderror = rewards[t] + gamma * previous_value * masks[t] - values.data[t] 46 | running_advantage = running_tderror + (gamma * lambda_gae) * running_advantage * masks[t] 47 | 48 | returns[t] = running_return 49 | previous_value = values.data[t] 50 | advantages[t] = running_advantage 51 | 52 | return returns, advantages 53 | 54 | @classmethod 55 | def train_model(cls, net, transitions, optimizer): 56 | states, actions, rewards, masks = transitions.state, transitions.action, transitions.reward, transitions.mask 57 | 58 | states = torch.stack(states) 59 | actions = torch.stack(actions) 60 | rewards = torch.Tensor(rewards) 61 | masks = torch.Tensor(masks) 62 | 63 | old_policies, old_values = net(states) 64 | old_policies = old_policies.view(-1, net.num_outputs).detach() 65 | returns, advantages = net.get_gae(old_values.view(-1).detach(), rewards, masks) 66 | 67 | batch_maker = BatchMaker(states, actions, returns, advantages, old_policies) 68 | for _ in range(epoch_k): 69 | for _ in range(len(states) // batch_size): 70 | states_sample, actions_sample, returns_sample, advantages_sample, old_policies_sample = batch_maker.sample() 71 | 
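# The block below applies the PPO clipped-surrogate update to the sampled minibatch:
#   ratio_t = pi_theta(a_t|s_t) / pi_old(a_t|s_t), recovered with the stored one-hot
#             actions as ((policies / old_policies_sample) * actions_sample).sum(dim=1)
#   actor_loss  = -sum_t min(ratio_t * A_t, clip(ratio_t, 1 - epsilon_clip, 1 + epsilon_clip) * A_t)
#   critic_loss = sum_t (R_t - V(s_t))^2
#   loss = actor_loss + ciritic_coefficient * critic_loss
#          - entropy_coefficient * mean_t sum_a pi(a|s_t) log pi(a|s_t)
# A_t (advantages_sample) and R_t (returns_sample) come from get_gae() over the whole
# rollout, and the update is repeated epoch_k times over freshly sampled minibatches.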
72 | policies, values = net(states_sample) 73 | values = values.view(-1) 74 | policies = policies.view(-1, net.num_outputs) 75 | 76 | ratios = ((policies / old_policies_sample) * actions_sample.detach()).sum(dim=1) 77 | 78 | 79 | clipped_ratios = torch.clamp(ratios, min=1.0-epsilon_clip, max=1.0+epsilon_clip) 80 | 81 | actor_loss = -torch.min(ratios * advantages_sample, 82 | clipped_ratios * advantages_sample).sum() 83 | 84 | critic_loss = (returns_sample.detach() - values).pow(2).sum() 85 | 86 | policy_entropy = (torch.log(policies) * policies).sum(1, keepdim=True).mean() 87 | 88 | loss = actor_loss + ciritic_coefficient * critic_loss - entropy_coefficient * policy_entropy 89 | 90 | optimizer.zero_grad() 91 | loss.backward() 92 | optimizer.step() 93 | 94 | return loss 95 | 96 | def get_action(self, input): 97 | policy, _ = self.forward(input) 98 | 99 | policy = policy[0].data.numpy() 100 | action = np.random.choice(self.num_outputs, 1, p=policy)[0] 101 | 102 | return action 103 | -------------------------------------------------------------------------------- /PG/7-PPO/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import PPO 11 | from tensorboardX import SummaryWriter 12 | 13 | from memory import Memory 14 | from config import env_name, goal_score, log_interval, device, gamma, lr 15 | 16 | 17 | def main(): 18 | env = gym.make(env_name) 19 | env.seed(500) 20 | torch.manual_seed(500) 21 | 22 | num_inputs = env.observation_space.shape[0] 23 | num_actions = env.action_space.n 24 | print('state size:', num_inputs) 25 | print('action size:', num_actions) 26 | 27 | net = PPO(num_inputs, num_actions) 28 | 29 | optimizer = optim.Adam(net.parameters(), lr=lr) 30 | writer = SummaryWriter('logs') 31 | 32 | net.to(device) 33 | net.train() 34 | running_score = 0 35 | steps = 0 36 | loss = 0 37 | 38 | for e in range(30000): 39 | done = False 40 | memory = Memory() 41 | 42 | score = 0 43 | state = env.reset() 44 | state = torch.Tensor(state).to(device) 45 | state = state.unsqueeze(0) 46 | 47 | while not done: 48 | steps += 1 49 | 50 | action = net.get_action(state) 51 | next_state, reward, done, _ = env.step(action) 52 | 53 | next_state = torch.Tensor(next_state) 54 | next_state = next_state.unsqueeze(0) 55 | 56 | mask = 0 if done else 1 57 | reward = reward if not done or score == 499 else -1 58 | 59 | action_one_hot = torch.zeros(2) 60 | action_one_hot[action] = 1 61 | memory.push(state, next_state, action_one_hot, reward, mask) 62 | 63 | score += reward 64 | state = next_state 65 | 66 | loss = PPO.train_model(net, memory.sample(), optimizer) 67 | 68 | score = score if score == 500.0 else score + 1 69 | if running_score == 0: 70 | running_score = score 71 | running_score = 0.99 * running_score + 0.01 * score 72 | if e % log_interval == 0: 73 | print('{} episode | score: {:.2f}'.format( 74 | e, running_score)) 75 | writer.add_scalar('log/score', float(running_score), e) 76 | writer.add_scalar('log/loss', float(loss), e) 77 | 78 | if running_score > goal_score: 79 | break 80 | 81 | 82 | if __name__=="__main__": 83 | main() 84 | -------------------------------------------------------------------------------- /POMDP/0-DQN/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 
0.99 5 | batch_size = 32 6 | lr = 0.0001 7 | initial_exploration = 1000 8 | goal_score = 200 9 | log_interval = 10 10 | update_target = 100 11 | replay_memory_capacity = 1000 12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | 14 | sequence_length = 4 -------------------------------------------------------------------------------- /POMDP/0-DQN/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple, deque 3 | import torch 4 | 5 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 6 | 7 | 8 | class Memory(object): 9 | def __init__(self, capacity): 10 | self.memory = deque(maxlen=capacity) 11 | self.capacity = capacity 12 | 13 | def push(self, state, next_state, action, reward, mask): 14 | self.memory.append(Transition(torch.stack(list(state)), torch.stack(list(next_state)), action, reward, mask)) 15 | 16 | def sample(self, batch_size): 17 | transitions = random.sample(self.memory, batch_size) 18 | batch = Transition(*zip(*transitions)) 19 | return batch 20 | 21 | def __len__(self): 22 | return len(self.memory) 23 | -------------------------------------------------------------------------------- /POMDP/0-DQN/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from config import gamma, sequence_length 6 | class QNet(nn.Module): 7 | def __init__(self, num_inputs, num_outputs): 8 | super(QNet, self).__init__() 9 | self.num_inputs = num_inputs 10 | self.num_outputs = num_outputs 11 | 12 | self.fc1 = nn.Linear(num_inputs * sequence_length, 128) 13 | self.fc2 = nn.Linear(128, num_outputs) 14 | 15 | for m in self.modules(): 16 | if isinstance(m, nn.Linear): 17 | nn.init.xavier_uniform(m.weight) 18 | 19 | def forward(self, x): 20 | x = x.view(-1, self.num_inputs * sequence_length) 21 | x = F.relu(self.fc1(x)) 22 | qvalue = self.fc2(x) 23 | return qvalue 24 | 25 | @classmethod 26 | def train_model(cls, online_net, target_net, optimizer, batch): 27 | states = torch.stack(batch.state) 28 | next_states = torch.stack(batch.next_state) 29 | actions = torch.Tensor(batch.action).float() 30 | rewards = torch.Tensor(batch.reward) 31 | masks = torch.Tensor(batch.mask) 32 | 33 | pred = online_net(states) 34 | next_pred = target_net(next_states) 35 | 36 | pred = torch.sum(pred.mul(actions), dim=1) 37 | 38 | target = rewards + masks * gamma * next_pred.max(1)[0] 39 | 40 | loss = F.mse_loss(pred, target.detach()) 41 | optimizer.zero_grad() 42 | loss.backward() 43 | optimizer.step() 44 | 45 | return loss 46 | 47 | def get_action(self, input): 48 | qvalue = self.forward(input) 49 | _, action = torch.max(qvalue, 1) 50 | return action.numpy()[0] 51 | -------------------------------------------------------------------------------- /POMDP/0-DQN/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import QNet 11 | from memory import Memory 12 | from tensorboardX import SummaryWriter 13 | 14 | from config import env_name, initial_exploration, batch_size, update_target, goal_score, log_interval, device, replay_memory_capacity, lr, sequence_length 15 | from collections import deque 16 | 17 | def 
get_action(state_series, target_net, epsilon, env): 18 | if np.random.rand() <= epsilon or len(state_series) < sequence_length: 19 | return env.action_space.sample() 20 | else: 21 | return target_net.get_action(torch.stack(list(state_series))) 22 | 23 | def update_target_model(online_net, target_net): 24 | # Target <- Net 25 | target_net.load_state_dict(online_net.state_dict()) 26 | 27 | def state_to_partial_observability(state): 28 | state = state[[0, 2]] 29 | return state 30 | 31 | def main(): 32 | env = gym.make(env_name) 33 | env.seed(500) 34 | torch.manual_seed(500) 35 | 36 | num_inputs = 2 37 | num_actions = env.action_space.n 38 | print('state size:', num_inputs) 39 | print('action size:', num_actions) 40 | 41 | online_net = QNet(num_inputs, num_actions) 42 | target_net = QNet(num_inputs, num_actions) 43 | update_target_model(online_net, target_net) 44 | 45 | optimizer = optim.Adam(online_net.parameters(), lr=lr) 46 | writer = SummaryWriter('logs') 47 | 48 | online_net.to(device) 49 | target_net.to(device) 50 | online_net.train() 51 | target_net.train() 52 | memory = Memory(replay_memory_capacity) 53 | running_score = 0 54 | epsilon = 1.0 55 | steps = 0 56 | loss = 0 57 | 58 | for e in range(30000): 59 | done = False 60 | 61 | state_series = deque(maxlen=sequence_length) 62 | next_state_series = deque(maxlen=sequence_length) 63 | score = 0 64 | state = env.reset() 65 | 66 | state = state_to_partial_observability(state) 67 | state = torch.Tensor(state).to(device) 68 | 69 | next_state_series.append(state) 70 | while not done: 71 | steps += 1 72 | state_series.append(state) 73 | action = get_action(state_series, target_net, epsilon, env) 74 | next_state, reward, done, _ = env.step(action) 75 | 76 | next_state = state_to_partial_observability(next_state) 77 | next_state = torch.Tensor(next_state) 78 | 79 | mask = 0 if done else 1 80 | reward = reward if not done or score == 499 else -1 81 | action_one_hot = np.zeros(2) 82 | action_one_hot[action] = 1 83 | if len(state_series) >= sequence_length: 84 | memory.push(state_series, next_state_series, action_one_hot, reward, mask) 85 | 86 | score += reward 87 | state = next_state 88 | 89 | if steps > initial_exploration: 90 | epsilon -= 0.000005 91 | epsilon = max(epsilon, 0.1) 92 | 93 | batch = memory.sample(batch_size) 94 | loss = QNet.train_model(online_net, target_net, optimizer, batch) 95 | 96 | if steps % update_target == 0: 97 | update_target_model(online_net, target_net) 98 | 99 | score = score if score == 500.0 else score + 1 100 | if running_score == 0: 101 | running_score = score 102 | else: 103 | running_score = 0.99 * running_score + 0.01 * score 104 | if e % log_interval == 0: 105 | print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format( 106 | e, running_score, epsilon)) 107 | writer.add_scalar('log/score', float(running_score), e) 108 | writer.add_scalar('log/loss', float(loss), e) 109 | 110 | if running_score > goal_score: 111 | break 112 | 113 | 114 | if __name__=="__main__": 115 | main() 116 | -------------------------------------------------------------------------------- /POMDP/1-DRQN/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | batch_size = 32 6 | lr = 0.001 7 | initial_exploration = 1000 8 | goal_score = 200 9 | log_interval = 10 10 | update_target = 100 11 | replay_memory_capacity = 100 12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | 14 | sequence_length = 8 15 | 
burn_in_length = 4 -------------------------------------------------------------------------------- /POMDP/1-DRQN/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple, deque 3 | from config import sequence_length 4 | import numpy as np 5 | import torch 6 | 7 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 8 | 9 | class Memory(object): 10 | def __init__(self, capacity): 11 | self.memory = deque(maxlen=capacity) 12 | self.local_memory = [] 13 | self.capacity = capacity 14 | 15 | def push(self, state, next_state, action, reward, mask): 16 | self.local_memory.append(Transition(state, next_state, action, reward, mask)) 17 | if mask == 0: 18 | while len(self.local_memory) < sequence_length: 19 | self.local_memory.insert(0, Transition( 20 | torch.Tensor([0, 0]), 21 | torch.Tensor([0, 0]), 22 | 0, 23 | 0, 24 | 0, 25 | )) 26 | self.memory.append(self.local_memory) 27 | self.local_memory = [] 28 | 29 | def sample(self, batch_size): 30 | batch_state, batch_next_state, batch_action, batch_reward, batch_mask = [], [], [], [], [] 31 | p = np.array([len(episode) for episode in self.memory]) 32 | p = p / p.sum() 33 | 34 | batch_indexes = np.random.choice(np.arange(len(self.memory)), batch_size, p=p) 35 | 36 | for batch_idx in batch_indexes: 37 | episode = self.memory[batch_idx] 38 | 39 | start = random.randint(0, len(episode) - sequence_length) 40 | transitions = episode[start:start + sequence_length] 41 | batch = Transition(*zip(*transitions)) 42 | 43 | batch_state.append(torch.stack(list(batch.state))) 44 | batch_next_state.append(torch.stack(list(batch.next_state))) 45 | batch_action.append(torch.Tensor(list(batch.action))) 46 | batch_reward.append(torch.Tensor(list(batch.reward))) 47 | batch_mask.append(torch.Tensor(list(batch.mask))) 48 | 49 | return Transition(batch_state, batch_next_state, batch_action, batch_reward, batch_mask) 50 | 51 | def __len__(self): 52 | return len(self.memory) -------------------------------------------------------------------------------- /POMDP/1-DRQN/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from config import gamma, device, batch_size, sequence_length, burn_in_length 6 | 7 | class DRQN(nn.Module): 8 | def __init__(self, num_inputs, num_outputs): 9 | super(DRQN, self).__init__() 10 | self.num_inputs = num_inputs 11 | self.num_outputs = num_outputs 12 | 13 | self.lstm = nn.LSTM(input_size=num_inputs, hidden_size=128, batch_first=True) 14 | self.fc1 = nn.Linear(128, 256) 15 | self.fc2 = nn.Linear(256, num_outputs) 16 | 17 | for m in self.modules(): 18 | if isinstance(m, nn.Linear): 19 | nn.init.xavier_uniform(m.weight) 20 | 21 | def forward(self, x, hidden=None): 22 | # x [batch_size, sequence_length, num_inputs] 23 | 24 | if hidden is not None: 25 | out, hidden = self.lstm(x, hidden) 26 | else: 27 | out, hidden = self.lstm(x) 28 | out = F.relu(self.fc1(out)) 29 | qvalue = self.fc2(out) 30 | 31 | return qvalue, hidden 32 | 33 | 34 | @classmethod 35 | def train_model(cls, online_net, target_net, optimizer, batch): 36 | def slice_burn_in(item): 37 | return item[:, burn_in_length:, :] 38 | states = torch.stack(batch.state).view(batch_size, sequence_length, online_net.num_inputs) 39 | next_states = torch.stack(batch.next_state).view(batch_size, sequence_length, online_net.num_inputs) 40 | actions = 
torch.stack(batch.action).view(batch_size, sequence_length, -1).long() 41 | rewards = torch.stack(batch.reward).view(batch_size, sequence_length, -1) 42 | masks = torch.stack(batch.mask).view(batch_size, sequence_length, -1) 43 | 44 | pred, _ = online_net(states) 45 | next_pred, _ = target_net(next_states) 46 | 47 | pred = slice_burn_in(pred) 48 | next_pred = slice_burn_in(next_pred) 49 | actions = slice_burn_in(actions) 50 | rewards = slice_burn_in(rewards) 51 | masks = slice_burn_in(masks) 52 | 53 | pred = pred.gather(2, actions) 54 | 55 | target = rewards + masks * gamma * next_pred.max(2, keepdim=True)[0] 56 | 57 | loss = F.mse_loss(pred, target.detach()) 58 | optimizer.zero_grad() 59 | loss.backward() 60 | optimizer.step() 61 | 62 | return loss 63 | 64 | def get_action(self, state, hidden): 65 | state = state.unsqueeze(0).unsqueeze(0) 66 | 67 | qvalue, hidden = self.forward(state, hidden) 68 | 69 | _, action = torch.max(qvalue, 2) 70 | 71 | return action.numpy()[0][0], hidden 72 | -------------------------------------------------------------------------------- /POMDP/1-DRQN/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import DRQN 11 | from memory import Memory 12 | from tensorboardX import SummaryWriter 13 | 14 | from config import env_name, initial_exploration, batch_size, update_target, goal_score, log_interval, device, replay_memory_capacity, lr, sequence_length 15 | 16 | from collections import deque 17 | 18 | def get_action(state, target_net, epsilon, env, hidden): 19 | action, hidden = target_net.get_action(state, hidden) 20 | 21 | if np.random.rand() <= epsilon: 22 | return env.action_space.sample(), hidden 23 | else: 24 | return action, hidden 25 | 26 | def update_target_model(online_net, target_net): 27 | # Target <- Net 28 | target_net.load_state_dict(online_net.state_dict()) 29 | 30 | def state_to_partial_observability(state): 31 | state = state[[0, 2]] 32 | return state 33 | 34 | def main(): 35 | env = gym.make(env_name) 36 | env.seed(500) 37 | torch.manual_seed(500) 38 | 39 | # num_inputs = env.observation_space.shape[0] 40 | num_inputs = 2 41 | num_actions = env.action_space.n 42 | print('state size:', num_inputs) 43 | print('action size:', num_actions) 44 | 45 | online_net = DRQN(num_inputs, num_actions) 46 | target_net = DRQN(num_inputs, num_actions) 47 | update_target_model(online_net, target_net) 48 | 49 | optimizer = optim.Adam(online_net.parameters(), lr=lr) 50 | writer = SummaryWriter('logs') 51 | 52 | online_net.to(device) 53 | target_net.to(device) 54 | online_net.train() 55 | target_net.train() 56 | memory = Memory(replay_memory_capacity) 57 | running_score = 0 58 | epsilon = 1.0 59 | steps = 0 60 | loss = 0 61 | 62 | for e in range(30000): 63 | done = False 64 | 65 | score = 0 66 | state = env.reset() 67 | state = state_to_partial_observability(state) 68 | state = torch.Tensor(state).to(device) 69 | 70 | hidden = None 71 | 72 | while not done: 73 | steps += 1 74 | 75 | action, hidden = get_action(state, target_net, epsilon, env, hidden) 76 | next_state, reward, done, _ = env.step(action) 77 | 78 | next_state = state_to_partial_observability(next_state) 79 | next_state = torch.Tensor(next_state) 80 | 81 | mask = 0 if done else 1 82 | reward = reward if not done or score == 499 else -1 83 | 84 | memory.push(state, next_state, 
action, reward, mask) 85 | 86 | score += reward 87 | state = next_state 88 | 89 | 90 | if steps > initial_exploration and len(memory) > batch_size: 91 | epsilon -= 0.00005 92 | epsilon = max(epsilon, 0.1) 93 | 94 | batch = memory.sample(batch_size) 95 | loss = DRQN.train_model(online_net, target_net, optimizer, batch) 96 | 97 | if steps % update_target == 0: 98 | update_target_model(online_net, target_net) 99 | 100 | score = score if score == 500.0 else score + 1 101 | if running_score == 0: 102 | running_score = score 103 | else: 104 | running_score = 0.99 * running_score + 0.01 * score 105 | if e % log_interval == 0: 106 | print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format( 107 | e, running_score, epsilon)) 108 | writer.add_scalar('log/score', float(running_score), e) 109 | writer.add_scalar('log/loss', float(loss), e) 110 | 111 | if running_score > goal_score: 112 | break 113 | 114 | 115 | if __name__=="__main__": 116 | main() 117 | -------------------------------------------------------------------------------- /POMDP/2-DRQN-Stack/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | batch_size = 32 6 | lr = 0.001 7 | initial_exploration = 1000 8 | goal_score = 200 9 | log_interval = 10 10 | update_target = 100 11 | replay_memory_capacity = 100 12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | 14 | sequence_length = 8 15 | burn_in_length = 4 -------------------------------------------------------------------------------- /POMDP/2-DRQN-Stack/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple, deque 3 | from config import sequence_length 4 | import numpy as np 5 | import torch 6 | 7 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 8 | 9 | class Memory(object): 10 | def __init__(self, capacity): 11 | self.memory = deque(maxlen=capacity) 12 | self.local_memory = [] 13 | self.capacity = capacity 14 | 15 | def push(self, state, next_state, action, reward, mask): 16 | self.local_memory.append(Transition(state, next_state, action, reward, mask)) 17 | if mask == 0: 18 | while len(self.local_memory) < sequence_length: 19 | self.local_memory.insert(0, Transition( 20 | torch.Tensor([0, 0]), 21 | torch.Tensor([0, 0]), 22 | 0, 23 | 0, 24 | 0, 25 | )) 26 | self.memory.append(self.local_memory) 27 | self.local_memory = [] 28 | 29 | def sample(self, batch_size): 30 | batch_state, batch_next_state, batch_action, batch_reward, batch_mask = [], [], [], [], [] 31 | p = np.array([len(episode) for episode in self.memory]) 32 | p = p / p.sum() 33 | 34 | batch_indexes = np.random.choice(np.arange(len(self.memory)), batch_size, p=p) 35 | 36 | for batch_idx in batch_indexes: 37 | episode = self.memory[batch_idx] 38 | 39 | start = random.randint(0, len(episode) - sequence_length) 40 | transitions = episode[start:start + sequence_length] 41 | batch = Transition(*zip(*transitions)) 42 | 43 | batch_state.append(torch.stack(list(batch.state))) 44 | batch_next_state.append(torch.stack(list(batch.next_state))) 45 | batch_action.append(torch.Tensor(list(batch.action))) 46 | batch_reward.append(torch.Tensor(list(batch.reward))) 47 | batch_mask.append(torch.Tensor(list(batch.mask))) 48 | 49 | return Transition(batch_state, batch_next_state, batch_action, batch_reward, batch_mask) 50 | 51 | def __len__(self): 52 | return len(self.memory) 
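A minimal usage sketch of the episodic replay buffer defined above (the same Memory class as in 1-DRQN). The episode data below is made up purely for illustration, and the sketch assumes it is run from this directory so that memory.py and config.py (sequence_length = 8) are importable:

import torch
from memory import Memory

memory = Memory(capacity=100)

# Store a few short fake episodes of 2-dimensional partial observations.
# A transition with mask == 0 closes the episode; Memory then left-pads it
# with zero transitions up to sequence_length before storing it.
for _ in range(5):
    for t in range(3):
        state = torch.rand(2)
        next_state = torch.rand(2)
        action = 0
        reward = 1.0
        mask = 0 if t == 2 else 1
        memory.push(state, next_state, action, reward, mask)

# sample() picks episodes with probability proportional to their length and cuts a
# random window of sequence_length steps from each; every field is a list of tensors.
batch = memory.sample(batch_size=4)
print(torch.stack(batch.state).shape)   # -> torch.Size([4, 8, 2]), i.e. [batch, seq, obs]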
-------------------------------------------------------------------------------- /POMDP/2-DRQN-Stack/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from config import gamma, device, batch_size, sequence_length, burn_in_length 6 | 7 | class DRQN(nn.Module): 8 | def __init__(self, num_inputs, num_outputs): 9 | super(DRQN, self).__init__() 10 | self.num_inputs = num_inputs 11 | self.num_outputs = num_outputs 12 | 13 | self.lstm = nn.LSTM(input_size=num_inputs, hidden_size=128, batch_first=True) 14 | self.fc1 = nn.Linear(128, 256) 15 | self.fc2 = nn.Linear(256, num_outputs) 16 | 17 | for m in self.modules(): 18 | if isinstance(m, nn.Linear): 19 | nn.init.xavier_uniform(m.weight) 20 | 21 | def forward(self, x): 22 | # x [batch_size, sequence_length, num_inputs] 23 | out, hidden = self.lstm(x) 24 | out = F.relu(self.fc1(out)) 25 | qvalue = self.fc2(out) 26 | 27 | return qvalue 28 | 29 | 30 | @classmethod 31 | def train_model(cls, online_net, target_net, optimizer, batch): 32 | def slice_burn_in(item): 33 | return item[:, burn_in_length:, :] 34 | states = torch.stack(batch.state).view(batch_size, sequence_length, online_net.num_inputs) 35 | next_states = torch.stack(batch.next_state).view(batch_size, sequence_length, online_net.num_inputs) 36 | actions = torch.stack(batch.action).view(batch_size, sequence_length, -1).long() 37 | rewards = torch.stack(batch.reward).view(batch_size, sequence_length, -1) 38 | masks = torch.stack(batch.mask).view(batch_size, sequence_length, -1) 39 | 40 | pred = online_net(states) 41 | next_pred = target_net(next_states) 42 | 43 | pred = slice_burn_in(pred) 44 | next_pred = slice_burn_in(next_pred) 45 | actions = slice_burn_in(actions) 46 | rewards = slice_burn_in(rewards) 47 | masks = slice_burn_in(masks) 48 | 49 | pred = pred.gather(2, actions) 50 | 51 | target = rewards + masks * gamma * next_pred.max(2, keepdim=True)[0] 52 | 53 | loss = F.mse_loss(pred, target.detach()) 54 | optimizer.zero_grad() 55 | loss.backward() 56 | optimizer.step() 57 | 58 | return loss 59 | 60 | def get_action(self, state_series): 61 | state_series = torch.stack(list(state_series)) 62 | state_series = state_series.unsqueeze(0) 63 | 64 | qvalue = self.forward(state_series) 65 | 66 | _, action = torch.max(qvalue, 2) 67 | 68 | return action.numpy()[0][-1] 69 | -------------------------------------------------------------------------------- /POMDP/2-DRQN-Stack/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import DRQN 11 | from memory import Memory 12 | from tensorboardX import SummaryWriter 13 | 14 | from config import env_name, initial_exploration, batch_size, update_target, goal_score, log_interval, device, replay_memory_capacity, lr, sequence_length 15 | 16 | from collections import deque 17 | 18 | def get_action(state_series, target_net, epsilon, env): 19 | if np.random.rand() <= epsilon or len(state_series) < sequence_length: 20 | return env.action_space.sample() 21 | else: 22 | return target_net.get_action(state_series) 23 | 24 | def update_target_model(online_net, target_net): 25 | # Target <- Net 26 | target_net.load_state_dict(online_net.state_dict()) 27 | 28 | 29 | def main(): 30 | env = gym.make(env_name) 31 | 
env.seed(500) 32 | torch.manual_seed(500) 33 | 34 | # num_inputs = env.observation_space.shape[0] 35 | num_inputs = 2 36 | num_actions = env.action_space.n 37 | print('state size:', num_inputs) 38 | print('action size:', num_actions) 39 | 40 | online_net = DRQN(num_inputs, num_actions) 41 | target_net = DRQN(num_inputs, num_actions) 42 | update_target_model(online_net, target_net) 43 | 44 | optimizer = optim.Adam(online_net.parameters(), lr=lr) 45 | writer = SummaryWriter('logs') 46 | 47 | online_net.to(device) 48 | target_net.to(device) 49 | online_net.train() 50 | target_net.train() 51 | memory = Memory(replay_memory_capacity) 52 | running_score = 0 53 | epsilon = 1.0 54 | steps = 0 55 | loss = 0 56 | 57 | for e in range(30000): 58 | done = False 59 | 60 | state_series = deque(maxlen=sequence_length) 61 | score = 0 62 | state = env.reset() 63 | state = torch.Tensor(state[[0, 2]]).to(device) 64 | 65 | # state = torch.Tensor(state).to(device) 66 | 67 | while not done: 68 | steps += 1 69 | state_series.append(state) 70 | 71 | action = get_action(state_series, target_net, epsilon, env) 72 | next_state, reward, done, _ = env.step(action) 73 | 74 | next_state = torch.Tensor(next_state[[0, 2]]) 75 | # next_state = torch.Tensor(next_state) 76 | 77 | mask = 0 if done else 1 78 | reward = reward if not done or score == 499 else -1 79 | 80 | memory.push(state, next_state, action, reward, mask) 81 | 82 | score += reward 83 | state = next_state 84 | 85 | 86 | if steps > initial_exploration and len(memory) > batch_size: 87 | epsilon -= 0.00005 88 | epsilon = max(epsilon, 0.1) 89 | 90 | batch = memory.sample(batch_size) 91 | loss = DRQN.train_model(online_net, target_net, optimizer, batch) 92 | 93 | if steps % update_target == 0: 94 | update_target_model(online_net, target_net) 95 | 96 | score = score if score == 500.0 else score + 1 97 | if running_score == 0: 98 | running_score = score 99 | else: 100 | running_score = 0.99 * running_score + 0.01 * score 101 | if e % log_interval == 0: 102 | print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format( 103 | e, running_score, epsilon)) 104 | writer.add_scalar('log/score', float(running_score), e) 105 | writer.add_scalar('log/loss', float(loss), e) 106 | 107 | if running_score > goal_score: 108 | break 109 | 110 | 111 | if __name__=="__main__": 112 | main() 113 | -------------------------------------------------------------------------------- /POMDP/3-DRQN-Store-State/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | batch_size = 32 6 | lr = 0.001 7 | initial_exploration = 1000 8 | goal_score = 200 9 | log_interval = 10 10 | update_target = 100 11 | replay_memory_capacity = 100 12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | 14 | sequence_length = 8 15 | burn_in_length = 4 -------------------------------------------------------------------------------- /POMDP/3-DRQN-Store-State/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple, deque 3 | from config import sequence_length 4 | import numpy as np 5 | import torch 6 | 7 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask', 'rnn_state')) 8 | 9 | class Memory(object): 10 | def __init__(self, capacity): 11 | self.memory = deque(maxlen=capacity) 12 | self.local_memory = [] 13 | self.capacity = capacity 14 | 15 | def push(self, state, 
next_state, action, reward, mask, rnn_state): 16 | self.local_memory.append(Transition(state, next_state, action, reward, mask, torch.stack(rnn_state).view(2, -1))) 17 | if mask == 0: 18 | self.memory.append(self.local_memory) 19 | self.local_memory = [] 20 | 21 | def sample(self, batch_size): 22 | batch_state, batch_next_state, batch_action, batch_reward, batch_mask, batch_rnn_state = [], [], [], [], [], [] 23 | p = np.array([len(episode) for episode in self.memory]) 24 | p = p / p.sum() 25 | 26 | batch_indexes = np.random.choice(np.arange(len(self.memory)), batch_size, p=p) 27 | 28 | for batch_idx in batch_indexes: 29 | episode = self.memory[batch_idx] 30 | 31 | start = random.randint(0, len(episode) - sequence_length) 32 | transitions = episode[start:start + sequence_length] 33 | batch = Transition(*zip(*transitions)) 34 | 35 | batch_state.append(torch.stack(list(batch.state))) 36 | batch_next_state.append(torch.stack(list(batch.next_state))) 37 | batch_action.append(torch.Tensor(list(batch.action))) 38 | batch_reward.append(torch.Tensor(list(batch.reward))) 39 | batch_mask.append(torch.Tensor(list(batch.mask))) 40 | batch_rnn_state.append(torch.stack(list(batch.rnn_state))) 41 | 42 | return Transition(batch_state, batch_next_state, batch_action, batch_reward, batch_mask, batch_rnn_state) 43 | 44 | def __len__(self): 45 | return len(self.memory) -------------------------------------------------------------------------------- /POMDP/3-DRQN-Store-State/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from config import gamma, device, batch_size, sequence_length, burn_in_length 6 | 7 | class DRQN(nn.Module): 8 | def __init__(self, num_inputs, num_outputs): 9 | super(DRQN, self).__init__() 10 | self.num_inputs = num_inputs 11 | self.num_outputs = num_outputs 12 | 13 | self.lstm = nn.LSTM(input_size=num_inputs, hidden_size=16, batch_first=True) 14 | self.fc1 = nn.Linear(16, 128) 15 | self.fc2 = nn.Linear(128, num_outputs) 16 | 17 | for m in self.modules(): 18 | if isinstance(m, nn.Linear): 19 | nn.init.xavier_uniform(m.weight) 20 | 21 | def forward(self, x, hidden=None): 22 | # x [batch_size, sequence_length, num_inputs] 23 | out, hidden = self.lstm(x, hidden) 24 | 25 | out = F.relu(self.fc1(out)) 26 | qvalue = self.fc2(out) 27 | 28 | return qvalue, hidden 29 | 30 | 31 | @classmethod 32 | def train_model(cls, online_net, target_net, optimizer, batch): 33 | def slice_burn_in(item): 34 | return item[:, burn_in_length:, :] 35 | states = torch.stack(batch.state).view(batch_size, sequence_length, online_net.num_inputs) 36 | next_states = torch.stack(batch.next_state).view(batch_size, sequence_length, online_net.num_inputs) 37 | actions = torch.stack(batch.action).view(batch_size, sequence_length, -1).long() 38 | rewards = torch.stack(batch.reward).view(batch_size, sequence_length, -1) 39 | masks = torch.stack(batch.mask).view(batch_size, sequence_length, -1) 40 | rnn_state = torch.stack(batch.rnn_state).view(batch_size, sequence_length, 2, -1) 41 | 42 | 43 | 44 | [h0, c0] = rnn_state[:, 0, :, :].transpose(0, 1) 45 | h0 = h0.unsqueeze(0).detach() 46 | c0 = c0.unsqueeze(0).detach() 47 | 48 | [h1, c1] = rnn_state[:, 1, :, :].transpose(0, 1) 49 | h1 = h1.unsqueeze(0).detach() 50 | c1 = c1.unsqueeze(0).detach() 51 | 52 | pred, _ = online_net(states, (h0, c0)) 53 | next_pred, _ = target_net(next_states, (h1, c1)) 54 | 55 | pred = slice_burn_in(pred) 56 | next_pred = 
slice_burn_in(next_pred) 57 | actions = slice_burn_in(actions) 58 | rewards = slice_burn_in(rewards) 59 | masks = slice_burn_in(masks) 60 | 61 | pred = pred.gather(2, actions) 62 | 63 | target = rewards + masks * gamma * next_pred.max(2, keepdim=True)[0] 64 | 65 | loss = F.mse_loss(pred, target.detach()) 66 | optimizer.zero_grad() 67 | loss.backward() 68 | optimizer.step() 69 | 70 | return loss 71 | 72 | def get_action(self, state, hidden): 73 | state = state.unsqueeze(0).unsqueeze(0) 74 | 75 | qvalue, hidden = self.forward(state, hidden) 76 | 77 | _, action = torch.max(qvalue, 2) 78 | return action.numpy()[0][0], hidden 79 | -------------------------------------------------------------------------------- /POMDP/3-DRQN-Store-State/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import DRQN 11 | from memory import Memory 12 | from tensorboardX import SummaryWriter 13 | 14 | from config import env_name, initial_exploration, batch_size, update_target, goal_score, log_interval, device, replay_memory_capacity, lr, sequence_length 15 | 16 | from collections import deque 17 | 18 | def get_action(state, target_net, epsilon, env, hidden): 19 | action, hidden = target_net.get_action(state, hidden) 20 | 21 | if np.random.rand() <= epsilon: 22 | return env.action_space.sample(), hidden 23 | else: 24 | return action, hidden 25 | 26 | def update_target_model(online_net, target_net): 27 | # Target <- Net 28 | target_net.load_state_dict(online_net.state_dict()) 29 | 30 | def state_to_partial_observability(state): 31 | state = state[[0, 2]] 32 | return state 33 | 34 | 35 | def main(): 36 | env = gym.make(env_name) 37 | env.seed(500) 38 | torch.manual_seed(500) 39 | 40 | # num_inputs = env.observation_space.shape[0] 41 | num_inputs = 2 42 | num_actions = env.action_space.n 43 | print('state size:', num_inputs) 44 | print('action size:', num_actions) 45 | 46 | online_net = DRQN(num_inputs, num_actions) 47 | target_net = DRQN(num_inputs, num_actions) 48 | update_target_model(online_net, target_net) 49 | 50 | optimizer = optim.Adam(online_net.parameters(), lr=lr) 51 | writer = SummaryWriter('logs') 52 | 53 | online_net.to(device) 54 | target_net.to(device) 55 | online_net.train() 56 | target_net.train() 57 | memory = Memory(replay_memory_capacity) 58 | running_score = 0 59 | epsilon = 1.0 60 | steps = 0 61 | loss = 0 62 | 63 | for e in range(30000): 64 | done = False 65 | 66 | score = 0 67 | state = env.reset() 68 | state = state_to_partial_observability(state) 69 | state = torch.Tensor(state).to(device) 70 | 71 | hidden = (torch.Tensor().new_zeros(1, 1, 16), torch.Tensor().new_zeros(1, 1, 16)) 72 | 73 | while not done: 74 | steps += 1 75 | 76 | action, new_hidden = get_action(state, target_net, epsilon, env, hidden) 77 | next_state, reward, done, _ = env.step(action) 78 | 79 | next_state = state_to_partial_observability(next_state) 80 | next_state = torch.Tensor(next_state) 81 | 82 | mask = 0 if done else 1 83 | reward = reward if not done or score == 499 else -1 84 | 85 | memory.push(state, next_state, action, reward, mask, hidden) 86 | hidden = new_hidden 87 | 88 | score += reward 89 | state = next_state 90 | 91 | 92 | if steps > initial_exploration and len(memory) > batch_size: 93 | epsilon -= 0.00005 94 | epsilon = max(epsilon, 0.1) 95 | 96 | batch = memory.sample(batch_size) 
97 | loss = DRQN.train_model(online_net, target_net, optimizer, batch) 98 | 99 | if steps % update_target == 0: 100 | update_target_model(online_net, target_net) 101 | 102 | score = score if score == 500.0 else score + 1 103 | if running_score == 0: 104 | running_score = score 105 | else: 106 | running_score = 0.99 * running_score + 0.01 * score 107 | if e % log_interval == 0: 108 | print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format( 109 | e, running_score, epsilon)) 110 | writer.add_scalar('log/score', float(running_score), e) 111 | writer.add_scalar('log/loss', float(loss), e) 112 | 113 | if running_score > goal_score: 114 | break 115 | 116 | 117 | if __name__=="__main__": 118 | main() 119 | -------------------------------------------------------------------------------- /POMDP/4-R2D2-Single/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | batch_size = 32 6 | lr = 0.001 7 | initial_exploration = 1000 8 | goal_score = 200 9 | log_interval = 10 10 | update_target = 100 11 | replay_memory_capacity = 1000 12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | 14 | sequence_length = 32 15 | burn_in_length = 4 16 | eta = 0.9 17 | local_mini_batch = 8 18 | n_step = 2 19 | over_lapping_length = 16 -------------------------------------------------------------------------------- /POMDP/4-R2D2-Single/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from config import gamma, device, batch_size, sequence_length, burn_in_length 6 | 7 | class R2D2(nn.Module): 8 | def __init__(self, num_inputs, num_outputs): 9 | super(R2D2, self).__init__() 10 | self.num_inputs = num_inputs 11 | self.num_outputs = num_outputs 12 | 13 | self.lstm = nn.LSTM(input_size=num_inputs, hidden_size=16, batch_first=True) 14 | self.fc = nn.Linear(16, 128) 15 | self.fc_adv = nn.Linear(128, num_outputs) 16 | self.fc_val = nn.Linear(128, 1) 17 | 18 | for m in self.modules(): 19 | if isinstance(m, nn.Linear): 20 | nn.init.xavier_uniform(m.weight) 21 | 22 | def forward(self, x, hidden=None): 23 | # x [batch_size, sequence_length, num_inputs] 24 | batch_size = x.size()[0] 25 | sequence_length = x.size()[1] 26 | out, hidden = self.lstm(x, hidden) 27 | 28 | out = F.relu(self.fc(out)) 29 | adv = self.fc_adv(out) 30 | adv = adv.view(batch_size, sequence_length, self.num_outputs) 31 | val = self.fc_val(out) 32 | val = val.view(batch_size, sequence_length, 1) 33 | 34 | qvalue = val + (adv - adv.mean(dim=2, keepdim=True)) 35 | 36 | return qvalue, hidden 37 | 38 | @classmethod 39 | def get_td_error(cls, online_net, target_net, batch, lengths): 40 | def slice_burn_in(item): 41 | return item[:, burn_in_length:, :] 42 | batch_size = torch.stack(batch.state).size()[0] 43 | states = torch.stack(batch.state).view(batch_size, sequence_length, online_net.num_inputs) 44 | next_states = torch.stack(batch.next_state).view(batch_size, sequence_length, online_net.num_inputs) 45 | actions = torch.stack(batch.action).view(batch_size, sequence_length, -1).long() 46 | rewards = torch.stack(batch.reward).view(batch_size, sequence_length, -1) 47 | masks = torch.stack(batch.mask).view(batch_size, sequence_length, -1) 48 | steps = torch.stack(batch.step).view(batch_size, sequence_length, -1) 49 | rnn_state = torch.stack(batch.rnn_state).view(batch_size, sequence_length, 2, -1) 50 | 51 | [h0, 
c0] = rnn_state[:, 0, :, :].transpose(0, 1) 52 | h0 = h0.unsqueeze(0).detach() 53 | c0 = c0.unsqueeze(0).detach() 54 | 55 | [h1, c1] = rnn_state[:, 1, :, :].transpose(0, 1) 56 | h1 = h1.unsqueeze(0).detach() 57 | c1 = c1.unsqueeze(0).detach() 58 | 59 | pred, _ = online_net(states, (h0, c0)) 60 | next_pred, _ = target_net(next_states, (h1, c1)) 61 | 62 | next_pred_online, _ = online_net(next_states, (h1, c1)) 63 | 64 | pred = slice_burn_in(pred) 65 | next_pred = slice_burn_in(next_pred) 66 | actions = slice_burn_in(actions) 67 | rewards = slice_burn_in(rewards) 68 | masks = slice_burn_in(masks) 69 | steps = slice_burn_in(steps) 70 | next_pred_online = slice_burn_in(next_pred_online) 71 | 72 | pred = pred.gather(2, actions) 73 | 74 | _, next_pred_online_action = next_pred_online.max(2) 75 | 76 | target = rewards + masks * pow(gamma, steps) * next_pred.gather(2, next_pred_online_action.unsqueeze(2)) 77 | 78 | td_error = pred - target.detach() 79 | 80 | for idx, length in enumerate(lengths): 81 | td_error[idx][length-burn_in_length:][:] = 0 82 | 83 | return td_error 84 | 85 | @classmethod 86 | def train_model(cls, online_net, target_net, optimizer, batch, lengths): 87 | td_error = cls.get_td_error(online_net, target_net, batch, lengths) 88 | 89 | loss = pow(td_error, 2).mean() 90 | 91 | optimizer.zero_grad() 92 | loss.backward() 93 | optimizer.step() 94 | 95 | return loss, td_error 96 | 97 | def get_action(self, state, hidden): 98 | state = state.unsqueeze(0).unsqueeze(0) 99 | 100 | qvalue, hidden = self.forward(state, hidden) 101 | 102 | _, action = torch.max(qvalue, 2) 103 | return action.numpy()[0][0], hidden 104 | -------------------------------------------------------------------------------- /POMDP/4-R2D2-Single/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import R2D2 11 | from memory import Memory, LocalBuffer 12 | from tensorboardX import SummaryWriter 13 | 14 | from config import env_name, initial_exploration, batch_size, update_target, goal_score, log_interval, device, replay_memory_capacity, lr, sequence_length, local_mini_batch 15 | 16 | from collections import deque 17 | 18 | def get_action(state, target_net, epsilon, env, hidden): 19 | action, hidden = target_net.get_action(state, hidden) 20 | 21 | if np.random.rand() <= epsilon: 22 | return env.action_space.sample(), hidden 23 | else: 24 | return action, hidden 25 | 26 | def update_target_model(online_net, target_net): 27 | # Target <- Net 28 | target_net.load_state_dict(online_net.state_dict()) 29 | 30 | def state_to_partial_observability(state): 31 | state = state[[0, 2]] 32 | return state 33 | 34 | def main(): 35 | env = gym.make(env_name) 36 | env.seed(500) 37 | torch.manual_seed(500) 38 | 39 | # num_inputs = env.observation_space.shape[0] 40 | num_inputs = 2 41 | num_actions = env.action_space.n 42 | print('state size:', num_inputs) 43 | print('action size:', num_actions) 44 | 45 | online_net = R2D2(num_inputs, num_actions) 46 | target_net = R2D2(num_inputs, num_actions) 47 | update_target_model(online_net, target_net) 48 | 49 | optimizer = optim.Adam(online_net.parameters(), lr=lr) 50 | writer = SummaryWriter('logs') 51 | 52 | online_net.to(device) 53 | target_net.to(device) 54 | online_net.train() 55 | target_net.train() 56 | memory = Memory(replay_memory_capacity) 57 | running_score = 0 58 
| epsilon = 1.0 59 | steps = 0 60 | loss = 0 61 | local_buffer = LocalBuffer() 62 | 63 | for e in range(30000): 64 | done = False 65 | 66 | score = 0 67 | state = env.reset() 68 | state = state_to_partial_observability(state) 69 | state = torch.Tensor(state).to(device) 70 | 71 | hidden = (torch.Tensor().new_zeros(1, 1, 16), torch.Tensor().new_zeros(1, 1, 16)) 72 | 73 | while not done: 74 | steps += 1 75 | 76 | action, new_hidden = get_action(state, target_net, epsilon, env, hidden) 77 | 78 | next_state, reward, done, _ = env.step(action) 79 | 80 | next_state = state_to_partial_observability(next_state) 81 | next_state = torch.Tensor(next_state) 82 | 83 | mask = 0 if done else 1 84 | reward = reward if not done or score == 499 else -1 85 | 86 | local_buffer.push(state, next_state, action, reward, mask, hidden) 87 | hidden = new_hidden 88 | if len(local_buffer.memory) == local_mini_batch: 89 | batch, lengths = local_buffer.sample() 90 | td_error = R2D2.get_td_error(online_net, target_net, batch, lengths) 91 | memory.push(td_error, batch, lengths) 92 | 93 | score += reward 94 | state = next_state 95 | 96 | if steps > initial_exploration and len(memory) > batch_size: 97 | epsilon -= 0.00005 98 | epsilon = max(epsilon, 0.1) 99 | 100 | batch, indexes, lengths = memory.sample(batch_size) 101 | loss, td_error = R2D2.train_model(online_net, target_net, optimizer, batch, lengths) 102 | 103 | memory.update_prior(indexes, td_error, lengths) 104 | 105 | if steps % update_target == 0: 106 | update_target_model(online_net, target_net) 107 | 108 | score = score if score == 500.0 else score + 1 109 | if running_score == 0: 110 | running_score = score 111 | else: 112 | running_score = 0.99 * running_score + 0.01 * score 113 | if e % log_interval == 0: 114 | print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format( 115 | e, running_score, epsilon)) 116 | writer.add_scalar('log/score', float(running_score), e) 117 | writer.add_scalar('log/loss', float(loss), e) 118 | 119 | if running_score > goal_score: 120 | break 121 | 122 | 123 | if __name__=="__main__": 124 | main() 125 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyTorch CartPole Example 2 | A simple CartPole example written with PyTorch. 3 | 4 | ## Why Cartpole? 5 | CartPole is a very easy problem and converges very quickly in most cases. 6 | So you can run these examples on your own computer (training usually takes only 1~2 minutes). 
7 | 8 | ## Rainbow 9 | - [x] DQN [[1]](#reference) 10 | - [x] Double [[2]](#reference) 11 | - [x] Duel [[3]](#reference) 12 | - [x] Multi-step [[4]](#reference) 13 | - [x] PER(Prioritized Experience Replay) [[5]](#reference) 14 | - [x] Noisy-Net [[6]](#reference) 15 | - [x] Distributional(C51) [[7]](#reference) 16 | - [x] Rainbow [[8]](#reference) 17 | 18 | ## PG(Policy Gradient) 19 | - [x] REINFORCE [[9]](#reference) 20 | - [x] Actor Critic [[10]](#reference) 21 | - [x] Advantage Actor Critic 22 | - [x] GAE(Generalized Advantage Estimation) [[12]](#reference) 23 | - [x] TNPG [[20]](#reference) 24 | - [x] TRPO [[13]](#reference) 25 | - [x] PPO - Single Version [[14]](#reference) 26 | 27 | ## Parallel 28 | - [x] Asynchronous Q-learning [[11]](#reference) 29 | - [x] A3C (Asynchronous Advantage Actor Critic) [[11]](#reference) 30 | - [x] ACER [[21]](#reference) 31 | - [ ] PPO [[14]](#reference) 32 | - [x] APE-X DQN [[15]](#reference) 33 | - [ ] IMPALA [[23]](#reference) 34 | - [ ] R2D2 [[16]](#reference) 35 | 36 | ## Distributional DQN 37 | - [x] QRDQN [[18]](#reference) 38 | - [x] IQN [[19]](#reference) 39 | 40 | ## Exploration 41 | - [ ] ICM [[22]](#reference) 42 | - [ ] RND [[17]](#reference) 43 | 44 | ## POMDP (With RNN) 45 | - [x] DQN (use state stack) 46 | - [x] DRQN [[24]](#reference) [[25]](#reference) 47 | - [x] DRQN (use state stack) 48 | - [x] DRQN (store RNN state) [[16]](#reference) 49 | - [x] R2D2 - Single Version [[16]](#reference) 50 | 51 | 52 | ## Reference 53 | [1][Playing Atari with Deep Reinforcement Learning](http://arxiv.org/abs/1312.5602) 54 | [2][Deep Reinforcement Learning with Double Q-learning](http://arxiv.org/abs/1509.06461) 55 | [3][Dueling Network Architectures for Deep Reinforcement Learning](http://arxiv.org/abs/1511.06581) 56 | [4][Reinforcement Learning: An Introduction](http://www.incompleteideas.net/sutton/book/ebook/the-book.html) 57 | [5][Prioritized Experience Replay](http://arxiv.org/abs/1511.05952) 58 | [6][Noisy Networks for Exploration](https://arxiv.org/abs/1706.10295) 59 | [7][A Distributional Perspective on Reinforcement Learning](https://arxiv.org/abs/1707.06887) 60 | [8][Rainbow: Combining Improvements in Deep Reinforcement Learning](https://arxiv.org/abs/1710.02298) 61 | [9][Policy Gradient Methods for Reinforcement Learning with Function Approximation](https://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf) 62 | [10][Actor-Critic Algorithms](https://papers.nips.cc/paper/1786-actor-critic-algorithms.pdf) 63 | [11][Asynchronous Methods for Deep Reinforcement Learning](https://arxiv.org/pdf/1602.01783.pdf) 64 | [12][HIGH-DIMENSIONAL CONTINUOUS CONTROL USING GENERALIZED ADVANTAGE ESTIMATION](https://arxiv.org/pdf/1506.02438.pdf) 65 | [13][Trust Region Policy Optimization](https://arxiv.org/pdf/1502.05477.pdf) 66 | [14][Proximal Policy Optimization](https://arxiv.org/pdf/1707.06347.pdf) 67 | [15][DISTRIBUTED PRIORITIZED EXPERIENCE REPLAY](https://arxiv.org/pdf/1803.00933.pdf) 68 | [16][RECURRENT EXPERIENCE REPLAY IN DISTRIBUTED REINFORCEMENT LEARNING](https://openreview.net/pdf?id=r1lyTjAqYX) 69 | [17][EXPLORATION BY RANDOM NETWORK DISTILLATION](https://openreview.net/pdf?id=H1lJJnR5Ym) 70 | [18][Distributional Reinforcement Learning with Quantile Regression](https://arxiv.org/pdf/1710.10044.pdf) 71 | [19][Implicit Quantile Networks for Distributional Reinforcement Learning](https://arxiv.org/pdf/1806.06923.pdf) 72 | [20][A Natural Policy 
Gradient](https://papers.nips.cc/paper/2073-a-natural-policy-gradient.pdf) 73 | [21][SAMPLE EFFICIENT ACTOR-CRITIC WITH EXPERIENCE REPLAY](https://arxiv.org/pdf/1611.01224.pdf) 74 | [22][Curiosity-driven Exploration by Self-supervised Prediction](https://arxiv.org/pdf/1705.05363.pdf) 75 | [23][IMPALA: Scalable Distributed Deep-RL with Importance Weighted Actor-Learner Architectures](https://arxiv.org/pdf/1802.01561.pdf) 76 | [24][Deep Recurrent Q-Learning for Partially Observable MDPs](https://arxiv.org/pdf/1507.06527.pdf) 77 | [25][Playing FPS Games with Deep Reinforcement Learning](https://arxiv.org/pdf/1609.05521.pdf) 78 | 79 | ## Acknowledgements 80 | - https://github.com/openai/baselines 81 | - https://github.com/reinforcement-learning-kr/pg_travel 82 | - https://github.com/reinforcement-learning-kr/distributional_rl 83 | - https://github.com/Kaixhin/Rainbow 84 | - https://github.com/Kaixhin/ACER 85 | - https://github.com/higgsfield/RL-Adventure-2 86 | 87 | ## Use Cuda 88 | check this issue. https://github.com/g6ling/Reinforcement-Learning-Pytorch-Cartpole/issues/1 89 | -------------------------------------------------------------------------------- /distributional/1-QR-DQN/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | batch_size = 32 6 | lr = 0.001 7 | initial_exploration = 1000 8 | goal_score = 200 9 | log_interval = 10 10 | update_target = 100 11 | replay_memory_capacity = 1000 12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | 14 | 15 | num_support = 8 16 | -------------------------------------------------------------------------------- /distributional/1-QR-DQN/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple, deque 3 | 4 | 5 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 6 | 7 | 8 | class Memory(object): 9 | def __init__(self, capacity): 10 | self.memory = deque(maxlen=capacity) 11 | self.capacity = capacity 12 | 13 | def push(self, state, next_state, action, reward, mask): 14 | self.memory.append(Transition(state, next_state, action, reward, mask)) 15 | 16 | def sample(self, batch_size): 17 | transitions = random.sample(self.memory, batch_size) 18 | batch = Transition(*zip(*transitions)) 19 | return batch 20 | 21 | def __len__(self): 22 | return len(self.memory) 23 | -------------------------------------------------------------------------------- /distributional/1-QR-DQN/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | from config import num_support, batch_size, gamma 7 | 8 | class QRDQN(nn.Module): 9 | def __init__(self, num_inputs, num_outputs): 10 | super(QRDQN, self).__init__() 11 | self.num_inputs = num_inputs 12 | self.num_outputs = num_outputs 13 | 14 | self.num_support = num_support 15 | 16 | self.fc1 = nn.Linear(num_inputs, 128) 17 | self.fc2 = nn.Linear(128, num_outputs * num_support) 18 | 19 | for m in self.modules(): 20 | if isinstance(m, nn.Linear): 21 | nn.init.xavier_uniform(m.weight) 22 | 23 | def forward(self, state): 24 | x = F.relu(self.fc1(state)) 25 | x = self.fc2(x) 26 | theta = x.view(-1, self.num_outputs, self.num_support) 27 | 28 | return theta 29 | 30 | def get_action(self, state): 31 | theta = self.forward(state) 32 
| Q = theta.mean(dim=2, keepdim=True) 33 | action = torch.argmax(Q) 34 | return action.item() 35 | 36 | @classmethod 37 | def train_model(cls, online_net, target_net, optimizer, batch): 38 | states = torch.stack(batch.state) 39 | next_states = torch.stack(batch.next_state) 40 | actions = torch.Tensor(batch.action).long() 41 | rewards = torch.Tensor(batch.reward) 42 | masks = torch.Tensor(batch.mask) 43 | 44 | theta = online_net(states) 45 | action = actions.unsqueeze(1).unsqueeze(1).expand(-1, 1, num_support) 46 | theta_a = theta.gather(1, action).squeeze(1) 47 | 48 | next_theta = target_net(next_states) # batch_size * action * num_support 49 | next_action = next_theta.mean(dim=2).max(1)[1] # batch_size 50 | next_action = next_action.unsqueeze(1).unsqueeze(1).expand(batch_size, 1, num_support) 51 | next_theta_a = next_theta.gather(1, next_action).squeeze(1) # batch_size * num_support 52 | 53 | T_theta = rewards.unsqueeze(1) + gamma * next_theta_a * masks.unsqueeze(1) 54 | 55 | T_theta_tile = T_theta.view(-1, num_support, 1).expand(-1, num_support, num_support) 56 | theta_a_tile = theta_a.view(-1, 1, num_support).expand(-1, num_support, num_support) 57 | 58 | error_loss = T_theta_tile - theta_a_tile 59 | huber_loss = F.smooth_l1_loss(theta_a_tile, T_theta_tile.detach(), reduction='none') 60 | tau = torch.arange(0.5 * (1 / num_support), 1, 1 / num_support).view(1, num_support) 61 | 62 | loss = (tau - (error_loss < 0).float()).abs() * huber_loss 63 | loss = loss.mean(dim=2).sum(dim=1).mean() 64 | 65 | optimizer.zero_grad() 66 | loss.backward() 67 | optimizer.step() 68 | 69 | return loss -------------------------------------------------------------------------------- /distributional/1-QR-DQN/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import argparse 6 | import numpy as np 7 | 8 | import torch 9 | import torch.optim as optim 10 | import torch.nn.functional as F 11 | from tensorboardX import SummaryWriter 12 | 13 | from model import QRDQN 14 | from memory import Memory 15 | 16 | from config import env_name, initial_exploration, batch_size, update_target, goal_score, log_interval, device, replay_memory_capacity, lr 17 | 18 | 19 | def get_action(state, target_net, epsilon, env): 20 | if np.random.rand() <= epsilon: 21 | return env.action_space.sample() 22 | else: 23 | return target_net.get_action(state) 24 | 25 | def update_target_model(online_net, target_net): 26 | # Target <- Net 27 | target_net.load_state_dict(online_net.state_dict()) 28 | 29 | 30 | 31 | def main(): 32 | env = gym.make(env_name) 33 | env.seed(500) 34 | torch.manual_seed(500) 35 | 36 | num_inputs = env.observation_space.shape[0] 37 | num_actions = env.action_space.n 38 | print('state size:', num_inputs) 39 | print('action size:', num_actions) 40 | 41 | online_net = QRDQN(num_inputs, num_actions) 42 | target_net = QRDQN(num_inputs, num_actions) 43 | update_target_model(online_net, target_net) 44 | 45 | optimizer = optim.Adam(online_net.parameters(), lr=lr) 46 | writer = SummaryWriter('logs') 47 | 48 | online_net.to(device) 49 | target_net.to(device) 50 | online_net.train() 51 | target_net.train() 52 | memory = Memory(replay_memory_capacity) 53 | running_score = 0 54 | epsilon = 1.0 55 | steps = 0 56 | loss = 0 57 | 58 | for e in range(3000): 59 | done = False 60 | 61 | score = 0 62 | state = env.reset() 63 | state = torch.Tensor(state) 64 | state = state.unsqueeze(0) 65 | 66 | while not done: 67 | steps += 1 68 | 
action = get_action(state, target_net, epsilon, env) 69 | next_state, reward, done, _ = env.step(action) 70 | 71 | next_state = torch.Tensor(next_state) 72 | next_state = next_state.unsqueeze(0) 73 | 74 | mask = 0 if done else 1 75 | reward = reward if not done or score == 499 else -1 76 | memory.push(state, next_state, action, reward, mask) 77 | 78 | score += reward 79 | state = next_state 80 | 81 | if steps > initial_exploration: 82 | epsilon -= 0.00005 83 | epsilon = max(epsilon, 0.1) 84 | 85 | batch = memory.sample(batch_size) 86 | loss = QRDQN.train_model(online_net, target_net, optimizer, batch) 87 | 88 | if steps % update_target == 0: 89 | update_target_model(online_net, target_net) 90 | 91 | score = score if score == 500.0 else score + 1 92 | running_score = 0.99 * running_score + 0.01 * score 93 | if e % log_interval == 0: 94 | print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format( 95 | e, running_score, epsilon)) 96 | writer.add_scalar('log/score', float(running_score), e) 97 | writer.add_scalar('log/loss', float(loss), e) 98 | 99 | if running_score > goal_score: 100 | break 101 | 102 | if __name__=="__main__": 103 | main() 104 | -------------------------------------------------------------------------------- /distributional/2-IQN/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | batch_size = 32 6 | lr = 0.001 7 | initial_exploration = 1000 8 | goal_score = 200 9 | log_interval = 10 10 | update_target = 100 11 | replay_memory_capacity = 1000 12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | 14 | 15 | num_quantile_sample = 32 16 | num_tau_sample = 16 17 | num_tau_prime_sample = 8 18 | quantile_embedding_dim = 64 -------------------------------------------------------------------------------- /distributional/2-IQN/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple, deque 3 | 4 | 5 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 6 | 7 | 8 | class Memory(object): 9 | def __init__(self, capacity): 10 | self.memory = deque(maxlen=capacity) 11 | self.capacity = capacity 12 | 13 | def push(self, state, next_state, action, reward, mask): 14 | self.memory.append(Transition(state, next_state, action, reward, mask)) 15 | 16 | def sample(self, batch_size): 17 | transitions = random.sample(self.memory, batch_size) 18 | batch = Transition(*zip(*transitions)) 19 | return batch 20 | 21 | def __len__(self): 22 | return len(self.memory) 23 | -------------------------------------------------------------------------------- /distributional/2-IQN/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | from config import batch_size, gamma, quantile_embedding_dim, num_tau_sample, num_tau_prime_sample, num_quantile_sample 7 | 8 | class IQN(nn.Module): 9 | def __init__(self, num_inputs, num_outputs): 10 | super(IQN, self).__init__() 11 | self.num_inputs = num_inputs 12 | self.num_outputs = num_outputs 13 | 14 | self.fc1 = nn.Linear(num_inputs, 128) 15 | self.fc2 = nn.Linear(128, num_outputs) 16 | self.phi = nn.Linear(quantile_embedding_dim, 128) 17 | 18 | for m in self.modules(): 19 | if isinstance(m, nn.Linear): 20 | nn.init.xavier_uniform(m.weight) 21 | 22 | def forward(self, 
state, tau, num_quantiles): 23 | input_size = state.size()[0] # batch_size(train) or 1(get_action) 24 | tau = tau.expand(input_size * num_quantiles, quantile_embedding_dim) 25 | pi_mtx = torch.Tensor(np.pi * np.arange(0, quantile_embedding_dim)).expand(input_size * num_quantiles, quantile_embedding_dim) 26 | cos_tau = torch.cos(tau * pi_mtx) 27 | 28 | phi = self.phi(cos_tau) 29 | phi = F.relu(phi) 30 | 31 | state_tile = state.expand(input_size, num_quantiles, self.num_inputs) 32 | state_tile = state_tile.flatten().view(-1, self.num_inputs) 33 | 34 | x = F.relu(self.fc1(state_tile)) 35 | x = self.fc2(x * phi) 36 | z = x.view(-1, num_quantiles, self.num_outputs) 37 | 38 | z = z.transpose(1, 2) # [input_size, num_output, num_quantile] 39 | return z 40 | 41 | def get_action(self, state): 42 | tau = torch.Tensor(np.random.rand(num_quantile_sample, 1) * 0.5) # CVaR 43 | z = self.forward(state, tau, num_quantile_sample) 44 | q = z.mean(dim=2, keepdim=True) 45 | action = torch.argmax(q) 46 | return action.item() 47 | 48 | @classmethod 49 | def train_model(cls, online_net, target_net, optimizer, batch): 50 | states = torch.stack(batch.state) 51 | next_states = torch.stack(batch.next_state) 52 | actions = torch.Tensor(batch.action).long() 53 | rewards = torch.Tensor(batch.reward) 54 | masks = torch.Tensor(batch.mask) 55 | 56 | tau = torch.Tensor(np.random.rand(batch_size * num_tau_sample, 1)) 57 | z = online_net(states, tau, num_tau_sample) 58 | action = actions.unsqueeze(1).unsqueeze(1).expand(-1, 1, num_tau_sample) 59 | z_a = z.gather(1, action).squeeze(1) 60 | 61 | tau_prime = torch.Tensor(np.random.rand(batch_size * num_tau_prime_sample, 1)) 62 | next_z = target_net(next_states, tau_prime, num_tau_prime_sample) 63 | next_action = next_z.mean(dim=2).max(1)[1] 64 | next_action = next_action.unsqueeze(1).unsqueeze(1).expand(batch_size, 1, num_tau_prime_sample) 65 | next_z_a = next_z.gather(1, next_action).squeeze(1) 66 | 67 | T_z = rewards.unsqueeze(1) + gamma * next_z_a * masks.unsqueeze(1) 68 | 69 | T_z_tile = T_z.view(-1, num_tau_prime_sample, 1).expand(-1, num_tau_prime_sample, num_tau_sample) 70 | z_a_tile = z_a.view(-1, 1, num_tau_sample).expand(-1, num_tau_prime_sample, num_tau_sample) 71 | 72 | error_loss = T_z_tile - z_a_tile 73 | huber_loss = F.smooth_l1_loss(z_a_tile, T_z_tile.detach(), reduction='none') 74 | tau = torch.arange(0, 1, 1 / num_tau_sample).view(1, num_tau_sample) 75 | 76 | loss = (tau - (error_loss < 0).float()).abs() * huber_loss 77 | loss = loss.mean(dim=2).sum(dim=1).mean() 78 | 79 | optimizer.zero_grad() 80 | loss.backward() 81 | optimizer.step() 82 | 83 | return loss -------------------------------------------------------------------------------- /distributional/2-IQN/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import argparse 6 | import numpy as np 7 | 8 | import torch 9 | import torch.optim as optim 10 | import torch.nn.functional as F 11 | from tensorboardX import SummaryWriter 12 | 13 | from model import IQN 14 | from memory import Memory 15 | 16 | from config import env_name, initial_exploration, batch_size, update_target, goal_score, log_interval, device, replay_memory_capacity, lr 17 | 18 | 19 | def get_action(state, target_net, epsilon, env): 20 | if np.random.rand() <= epsilon: 21 | return env.action_space.sample() 22 | else: 23 | return target_net.get_action(state) 24 | 25 | def update_target_model(online_net, target_net): 26 | # Target <- Net 27 | 
target_net.load_state_dict(online_net.state_dict()) 28 | 29 | 30 | def main(): 31 | env = gym.make(env_name) 32 | env.seed(500) 33 | torch.manual_seed(500) 34 | 35 | num_inputs = env.observation_space.shape[0] 36 | num_actions = env.action_space.n 37 | print('state size:', num_inputs) 38 | print('action size:', num_actions) 39 | 40 | online_net = IQN(num_inputs, num_actions) 41 | target_net = IQN(num_inputs, num_actions) 42 | update_target_model(online_net, target_net) 43 | 44 | optimizer = optim.Adam(online_net.parameters(), lr=lr) 45 | writer = SummaryWriter('logs') 46 | 47 | online_net.to(device) 48 | target_net.to(device) 49 | online_net.train() 50 | target_net.train() 51 | memory = Memory(replay_memory_capacity) 52 | running_score = 0 53 | epsilon = 1.0 54 | steps = 0 55 | loss = 0 56 | 57 | for e in range(3000): 58 | done = False 59 | 60 | score = 0 61 | state = env.reset() 62 | state = torch.Tensor(state) 63 | state = state.unsqueeze(0) 64 | 65 | while not done: 66 | steps += 1 67 | action = get_action(state, target_net, epsilon, env) 68 | next_state, reward, done, _ = env.step(action) 69 | 70 | next_state = torch.Tensor(next_state) 71 | next_state = next_state.unsqueeze(0) 72 | 73 | mask = 0 if done else 1 74 | reward = reward if not done or score == 499 else -1 75 | memory.push(state, next_state, action, reward, mask) 76 | 77 | score += reward 78 | state = next_state 79 | 80 | if steps > initial_exploration: 81 | epsilon -= 0.00005 82 | epsilon = max(epsilon, 0.1) 83 | 84 | batch = memory.sample(batch_size) 85 | loss = IQN.train_model(online_net, target_net, optimizer, batch) 86 | 87 | if steps % update_target == 0: 88 | update_target_model(online_net, target_net) 89 | 90 | score = score if score == 500.0 else score + 1 91 | running_score = 0.99 * running_score + 0.01 * score 92 | if e % log_interval == 0: 93 | print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format( 94 | e, running_score, epsilon)) 95 | writer.add_scalar('log/score', float(running_score), e) 96 | writer.add_scalar('log/loss', float(loss), e) 97 | 98 | if running_score > goal_score: 99 | break 100 | 101 | if __name__=="__main__": 102 | main() 103 | -------------------------------------------------------------------------------- /parallel/1-Async-Q-Learning/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | lr = 0.001 6 | goal_score = 200 7 | log_interval = 10 8 | update_target = 100 9 | async_update_step = 10 10 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 11 | max_episode = 30000 12 | -------------------------------------------------------------------------------- /parallel/1-Async-Q-Learning/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple, deque 3 | 4 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 5 | 6 | 7 | class Memory(object): 8 | def __init__(self, capacity): 9 | self.memory = deque(maxlen=capacity) 10 | self.capacity = capacity 11 | 12 | def push(self, state, next_state, action, reward, mask): 13 | self.memory.append(Transition(state, next_state, action, reward, mask)) 14 | 15 | def sample(self): 16 | batch = Transition(*zip(*self.memory)) 17 | return batch 18 | 19 | def __len__(self): 20 | return len(self.memory) 21 | -------------------------------------------------------------------------------- 
/parallel/1-Async-Q-Learning/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from config import gamma 6 | 7 | class QNet(nn.Module): 8 | def __init__(self, num_inputs, num_outputs): 9 | super(QNet, self).__init__() 10 | self.num_inputs = num_inputs 11 | self.num_outputs = num_outputs 12 | 13 | self.fc1 = nn.Linear(num_inputs, 128) 14 | self.fc2 = nn.Linear(128, num_outputs) 15 | 16 | for m in self.modules(): 17 | if isinstance(m, nn.Linear): 18 | nn.init.xavier_uniform(m.weight) 19 | 20 | def forward(self, x): 21 | x = F.relu(self.fc1(x)) 22 | qvalue = self.fc2(x) 23 | return qvalue 24 | 25 | @classmethod 26 | def train_model(cls, online_net, target_net, optimizer, batch): 27 | states = torch.stack(batch.state) 28 | next_states = torch.stack(batch.next_state) 29 | actions = torch.Tensor(batch.action).float() 30 | rewards = torch.Tensor(batch.reward) 31 | masks = torch.Tensor(batch.mask) 32 | 33 | pred = online_net(states).squeeze(1) 34 | next_pred = target_net(next_states).squeeze(1) 35 | 36 | pred = torch.sum(pred.mul(actions), dim=1) 37 | 38 | target = rewards + masks * gamma * next_pred.max(1)[0] 39 | 40 | loss = torch.sum((pred - target.detach()) ** 2) 41 | optimizer.zero_grad() 42 | loss.backward() 43 | optimizer.step() 44 | 45 | return loss 46 | 47 | def get_action(self, input): 48 | qvalue = self.forward(input) 49 | _, action = torch.max(qvalue, 1) 50 | return action.numpy()[0] 51 | -------------------------------------------------------------------------------- /parallel/1-Async-Q-Learning/shared_adam.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class SharedAdam(torch.optim.Adam): # extend a pytorch optimizer so it shares grads across processes 4 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): 5 | super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay) 6 | for group in self.param_groups: 7 | for p in group['params']: 8 | state = self.state[p] 9 | state['shared_steps'], state['step'] = torch.zeros(1).share_memory_(), 0 10 | state['exp_avg'] = p.data.new().resize_as_(p.data).zero_().share_memory_() 11 | state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_().share_memory_() 12 | 13 | def step(self, closure=None): 14 | for group in self.param_groups: 15 | for p in group['params']: 16 | if p.grad is None: continue 17 | self.state[p]['shared_steps'] += 1 18 | self.state[p]['step'] = self.state[p]['shared_steps'][0] - 1 # a "step += 1" comes later 19 | super(SharedAdam, self).step(closure)  # call the parent Adam step (bare `super.step` would raise AttributeError) 20 | -------------------------------------------------------------------------------- /parallel/1-Async-Q-Learning/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import QNet 11 | from worker import Worker 12 | from tensorboardX import SummaryWriter 13 | import torch.multiprocessing as mp 14 | from shared_adam import SharedAdam 15 | 16 | from config import env_name, lr, device 17 | 18 | 19 | def main(): 20 | env = gym.make(env_name) 21 | env.seed(500) 22 | torch.manual_seed(500) 23 | 24 | num_inputs = env.observation_space.shape[0] 25 | num_actions = env.action_space.n 26 | print('state size:', num_inputs) 27 | print('action size:', 
num_actions) 28 | 29 | online_net = QNet(num_inputs, num_actions) 30 | target_net = QNet(num_inputs, num_actions) 31 | target_net.load_state_dict(online_net.state_dict()) 32 | online_net.share_memory() 33 | target_net.share_memory() 34 | 35 | optimizer = SharedAdam(online_net.parameters(), lr=lr) 36 | global_ep, global_ep_r, res_queue = mp.Value('i', 0), mp.Value('d', 0.), mp.Queue() 37 | 38 | writer = SummaryWriter('logs') 39 | 40 | online_net.to(device) 41 | target_net.to(device) 42 | online_net.train() 43 | target_net.train() 44 | 45 | workers = [Worker(online_net, target_net, optimizer, global_ep, global_ep_r, res_queue, i) for i in range(mp.cpu_count())] 46 | [w.start() for w in workers] 47 | res = [] 48 | while True: 49 | r = res_queue.get() 50 | if r is not None: 51 | res.append(r) 52 | [ep, ep_r, loss] = r 53 | writer.add_scalar('log/score', float(ep_r), ep) 54 | writer.add_scalar('log/loss', float(loss), ep) 55 | else: 56 | break 57 | [w.join() for w in workers] 58 | 59 | 60 | if __name__=="__main__": 61 | main() 62 | -------------------------------------------------------------------------------- /parallel/1-Async-Q-Learning/worker.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.multiprocessing as mp 4 | import numpy as np 5 | from model import QNet 6 | from memory import Memory 7 | 8 | from config import env_name, async_update_step, update_target, max_episode, device, log_interval, goal_score 9 | 10 | class Worker(mp.Process): 11 | def __init__(self, online_net, target_net, optimizer, global_ep, global_ep_r, res_queue, name): 12 | super(Worker, self).__init__() 13 | 14 | self.env = gym.make(env_name) 15 | self.env.seed(500) 16 | 17 | self.name = 'w%i' % name 18 | self.global_ep, self.global_ep_r, self.res_queue = global_ep, global_ep_r, res_queue 19 | self.online_net, self.target_net, self.optimizer = online_net, target_net, optimizer 20 | 21 | def record(self, score, epsilon, loss): 22 | with self.global_ep.get_lock(): 23 | self.global_ep.value += 1 24 | with self.global_ep_r.get_lock(): 25 | if self.global_ep_r.value == 0.: 26 | self.global_ep_r.value = score 27 | else: 28 | self.global_ep_r.value = 0.99 * self.global_ep_r.value + 0.01 * score 29 | if self.global_ep.value % log_interval == 0: 30 | print('{} , {} episode | score: {:.2f}, | epsilon: {:.2f}'.format( 31 | self.name, self.global_ep.value, self.global_ep_r.value, epsilon)) 32 | 33 | self.res_queue.put([self.global_ep.value, self.global_ep_r.value, loss]) 34 | 35 | 36 | def update_target_model(self): 37 | self.target_net.load_state_dict(self.online_net.state_dict()) 38 | 39 | def get_action(self, state, epsilon): 40 | if np.random.rand() <= epsilon: 41 | return self.env.action_space.sample() 42 | else: 43 | return self.target_net.get_action(state) 44 | 45 | def run(self): 46 | epsilon = 1.0 47 | steps = 0 48 | while self.global_ep.value < max_episode: 49 | if self.global_ep_r.value > goal_score: 50 | break 51 | done = False 52 | 53 | score = 0 54 | state = self.env.reset() 55 | state = torch.Tensor(state).to(device) 56 | state = state.unsqueeze(0) 57 | 58 | memory = Memory(async_update_step) 59 | 60 | while not done: 61 | steps += 1 62 | 63 | action = self.get_action(state, epsilon) 64 | next_state, reward, done, _ = self.env.step(action) 65 | 66 | next_state = torch.Tensor(next_state) 67 | next_state = next_state.unsqueeze(0) 68 | 69 | mask = 0 if done else 1 70 | reward = reward if not done or score == 499 else -1 71 | 
action_one_hot = np.zeros(2) 72 | action_one_hot[action] = 1 73 | memory.push(state, next_state, action_one_hot, reward, mask) 74 | 75 | score += reward 76 | state = next_state 77 | 78 | epsilon -= 0.00001 79 | epsilon = max(epsilon, 0.1) 80 | 81 | if len(memory) == async_update_step or done: 82 | batch = memory.sample() 83 | loss = QNet.train_model(self.online_net, self.target_net, self.optimizer, batch) 84 | memory = Memory(async_update_step) 85 | if done: 86 | self.record(score, epsilon, loss) 87 | break 88 | if steps % update_target == 0: 89 | self.update_target_model() 90 | 91 | score = score if score == 500.0 else score + 1 92 | 93 | self.res_queue.put(None) 94 | -------------------------------------------------------------------------------- /parallel/2-A3C/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | lr = 0.0001 6 | goal_score = 200 7 | log_interval = 10 8 | n_step = 10 9 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 10 | max_episode = 30000 11 | -------------------------------------------------------------------------------- /parallel/2-A3C/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple 3 | 4 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 5 | 6 | 7 | class Memory(object): 8 | def __init__(self, capacity): 9 | self.memory = [] 10 | self.capacity = capacity 11 | self.position = 0 12 | 13 | def push(self, state, next_state, action, reward, mask): 14 | """Saves a transition.""" 15 | if len(self.memory) < self.capacity: 16 | self.memory.append(Transition(state, next_state, action, reward, mask)) 17 | self.memory[self.position] = Transition(state, next_state, action, reward, mask) 18 | self.position = (self.position + 1) % self.capacity 19 | 20 | def sample(self): 21 | transitions = self.memory 22 | batch = Transition(*zip(*transitions)) 23 | return batch 24 | 25 | def __len__(self): 26 | return len(self.memory) 27 | -------------------------------------------------------------------------------- /parallel/2-A3C/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from config import gamma 6 | 7 | class Model(nn.Module): 8 | def __init__(self, num_inputs, num_outputs): 9 | super(Model, self).__init__() 10 | self.num_inputs = num_inputs 11 | self.num_outputs = num_outputs 12 | 13 | self.fc = nn.Linear(num_inputs, 128) 14 | self.fc_actor = nn.Linear(128, num_outputs) 15 | self.fc_critic = nn.Linear(128, 1) 16 | 17 | for m in self.modules(): 18 | if isinstance(m, nn.Linear): 19 | nn.init.xavier_uniform(m.weight) 20 | 21 | def forward(self, input): 22 | x = F.relu(self.fc(input)) 23 | policy = F.softmax(self.fc_actor(x)) 24 | value = self.fc_critic(x) 25 | return policy, value 26 | 27 | def get_action(self, input): 28 | policy, _ = self.forward(input) 29 | policy = policy[0].data.numpy() 30 | 31 | action = np.random.choice(self.num_outputs, 1, p=policy)[0] 32 | return action 33 | 34 | 35 | class GlobalModel(Model): 36 | def __init__(self, num_inputs, num_outputs): 37 | super(GlobalModel, self).__init__(num_inputs, num_outputs) 38 | 39 | 40 | class LocalModel(Model): 41 | def __init__(self, num_inputs, num_outputs): 42 | super(LocalModel, self).__init__(num_inputs, num_outputs) 43 | 
44 | def push_to_global_model(self, batch, global_model, global_optimizer): 45 | states = torch.stack(batch.state) 46 | next_states = torch.stack(batch.next_state) 47 | actions = torch.stack(batch.action) 48 | rewards = torch.Tensor(batch.reward) 49 | masks = torch.Tensor(batch.mask) 50 | 51 | policy, value = self.forward(states) 52 | policy = policy.view(-1, self.num_outputs) 53 | value = value.view(-1) 54 | 55 | _, last_value = self.forward(next_states[-1]) 56 | 57 | running_return = last_value[0].data 58 | running_returns = torch.zeros(rewards.size()) 59 | for t in reversed(range(0, len(rewards))): 60 | running_return = rewards[t] + gamma * running_return * masks[t] 61 | running_returns[t] = running_return 62 | 63 | 64 | td_error = running_returns - value.detach() 65 | log_policy = (torch.log(policy + 1e-10) * actions).sum(dim=1, keepdim=True) 66 | loss_policy = - log_policy * td_error 67 | loss_value = torch.pow(td_error, 2) 68 | entropy = (torch.log(policy + 1e-10) * policy).sum(dim=1, keepdim=True) 69 | 70 | loss = (loss_policy + loss_value - 0.01 * entropy).mean() 71 | 72 | global_optimizer.zero_grad() 73 | loss.backward() 74 | for lp, gp in zip(self.parameters(), global_model.parameters()): 75 | gp._grad = lp.grad 76 | global_optimizer.step() 77 | 78 | return loss 79 | 80 | def pull_from_global_model(self, global_model): 81 | self.load_state_dict(global_model.state_dict()) 82 | -------------------------------------------------------------------------------- /parallel/2-A3C/shared_adam.py: -------------------------------------------------------------------------------- 1 | import torch 2 | class SharedAdam(torch.optim.Adam): # extend a pytorch optimizer so it shares grads across processes 3 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): 4 | super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay) 5 | for group in self.param_groups: 6 | for p in group['params']: 7 | state = self.state[p] 8 | state['shared_steps'], state['step'] = torch.zeros(1).share_memory_(), 0 9 | state['exp_avg'] = p.data.new().resize_as_(p.data).zero_().share_memory_() 10 | state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_().share_memory_() 11 | 12 | def step(self, closure=None): 13 | for group in self.param_groups: 14 | for p in group['params']: 15 | if p.grad is None: continue 16 | self.state[p]['shared_steps'] += 1 17 | self.state[p]['step'] = self.state[p]['shared_steps'][0] - 1 # a "step += 1" comes later 18 | super(SharedAdam, self).step(closure)  # call the parent Adam step (bare `super.step` would raise AttributeError) 19 | 20 | # class SharedAdam(torch.optim.Adam): 21 | # def __init__(self, params, lr=1e-3, betas=(0.9, 0.9), eps=1e-8, 22 | # weight_decay=0): 23 | # super(SharedAdam, self).__init__(params, lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 24 | # # State initialization 25 | # for group in self.param_groups: 26 | # for p in group['params']: 27 | # state = self.state[p] 28 | # state['step'] = 0 29 | # state['exp_avg'] = torch.zeros_like(p.data) 30 | # state['exp_avg_sq'] = torch.zeros_like(p.data) 31 | # 32 | # # share in memory 33 | # state['exp_avg'].share_memory_() 34 | # state['exp_avg_sq'].share_memory_() 35 | -------------------------------------------------------------------------------- /parallel/2-A3C/train.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | 4 | from model import Model 5 | from worker import Worker 6 | from shared_adam import SharedAdam 7 | from tensorboardX import SummaryWriter 8 | import torch.multiprocessing as mp 9 | 
10 | from config import env_name, lr 11 | 12 | def main(): 13 | env = gym.make(env_name) 14 | env.seed(500) 15 | torch.manual_seed(500) 16 | 17 | num_inputs = env.observation_space.shape[0] 18 | num_actions = env.action_space.n 19 | global_model = Model(num_inputs, num_actions) 20 | global_model.share_memory() 21 | global_optimizer = SharedAdam(global_model.parameters(), lr=lr) 22 | global_ep, global_ep_r, res_queue = mp.Value('i', 0), mp.Value('d', 0.), mp.Queue() 23 | 24 | writer = SummaryWriter('logs') 25 | 26 | workers = [Worker(global_model, global_optimizer, global_ep, global_ep_r, res_queue, i) for i in range(mp.cpu_count())] 27 | [w.start() for w in workers] 28 | res = [] 29 | while True: 30 | r = res_queue.get() 31 | if r is not None: 32 | res.append(r) 33 | [ep, ep_r, loss] = r 34 | writer.add_scalar('log/score', float(ep_r), ep) 35 | writer.add_scalar('log/loss', float(loss), ep) 36 | else: 37 | break 38 | [w.join() for w in workers] 39 | 40 | if __name__=="__main__": 41 | main() 42 | -------------------------------------------------------------------------------- /parallel/2-A3C/worker.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.multiprocessing as mp 4 | import numpy as np 5 | from model import LocalModel 6 | from memory import Memory 7 | from config import env_name, n_step, max_episode, log_interval 8 | 9 | class Worker(mp.Process): 10 | def __init__(self, global_model, global_optimizer, global_ep, global_ep_r, res_queue, name): 11 | super(Worker, self).__init__() 12 | 13 | self.env = gym.make(env_name) 14 | self.env.seed(500) 15 | 16 | self.name = 'w%i' % name 17 | self.global_ep, self.global_ep_r, self.res_queue = global_ep, global_ep_r, res_queue 18 | self.global_model, self.global_optimizer = global_model, global_optimizer 19 | self.local_model = LocalModel(self.env.observation_space.shape[0], self.env.action_space.n) 20 | self.num_actions = self.env.action_space.n 21 | 22 | def record(self, score, loss): 23 | with self.global_ep.get_lock(): 24 | self.global_ep.value += 1 25 | with self.global_ep_r.get_lock(): 26 | if self.global_ep_r.value == 0.: 27 | self.global_ep_r.value = score 28 | else: 29 | self.global_ep_r.value = 0.99 * self.global_ep_r.value + 0.01 * score 30 | if self.global_ep.value % log_interval == 0: 31 | print('{} , {} episode | score: {:.2f}'.format( 32 | self.name, self.global_ep.value, self.global_ep_r.value)) 33 | 34 | self.res_queue.put([self.global_ep.value, self.global_ep_r.value, loss]) 35 | 36 | def get_action(self, policy, num_actions): 37 | policy = policy.data.numpy()[0] 38 | action = np.random.choice(num_actions, 1, p=policy)[0] 39 | return action 40 | 41 | def run(self): 42 | 43 | while self.global_ep.value < max_episode: 44 | self.local_model.pull_from_global_model(self.global_model) 45 | done = False 46 | score = 0 47 | steps = 0 48 | 49 | state = self.env.reset() 50 | state = torch.Tensor(state) 51 | state = state.unsqueeze(0) 52 | memory = Memory(n_step) 53 | 54 | while True: 55 | policy, value = self.local_model(state) 56 | action = self.get_action(policy, self.num_actions) 57 | 58 | next_state, reward, done, _ = self.env.step(action) 59 | next_state = torch.Tensor(next_state) 60 | next_state = next_state.unsqueeze(0) 61 | 62 | mask = 0 if done else 1 63 | reward = reward if not done or score == 499 else -1 64 | action_one_hot = torch.zeros(2) 65 | action_one_hot[action] = 1 66 | memory.push(state, next_state, action_one_hot, reward, mask) 67 | 68 | 
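                # Every transition goes into the local n-step buffer. Once n_step
                # transitions have accumulated (or the episode ends), the worker
                # computes gradients from this rollout via push_to_global_model,
                # applies them to the global network through the shared optimizer,
                # pulls the fresh global weights back, and starts an empty buffer.
                # Reward shaping: the step reward is replaced with -1 when the
                # episode terminates before reaching the 500-step cap.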
score += reward 69 | state = next_state 70 | 71 | if len(memory) == n_step or done: 72 | batch = memory.sample() 73 | loss = self.local_model.push_to_global_model(batch, self.global_model, self.global_optimizer) 74 | self.local_model.pull_from_global_model(self.global_model) 75 | memory = Memory(n_step) 76 | 77 | if done: 78 | running_score = self.record(score, loss) 79 | break 80 | 81 | 82 | self.res_queue.put(None) 83 | -------------------------------------------------------------------------------- /parallel/3-ACER/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | lr = 0.001 6 | goal_score = 200 7 | log_interval = 10 8 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 9 | max_episode = 30000 10 | 11 | 12 | replay_memory_capacity = 1000 13 | truncation_clip = 10 14 | delta = 1 15 | trust_region_decay = 0.99 16 | replay_ratio = 4 17 | max_gradient_norm = 40 18 | -------------------------------------------------------------------------------- /parallel/3-ACER/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple, deque 3 | 4 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask', 'policy')) 5 | 6 | 7 | class Memory(object): 8 | def __init__(self, capacity): 9 | self.memory = deque(maxlen=capacity) 10 | self.capacity = capacity 11 | 12 | def push(self, trajectory): 13 | self.memory.append(trajectory.trajectory) 14 | 15 | def sample(self): 16 | trajectory = self.memory[random.randrange(len(self.memory))] 17 | return Transition(*zip(*trajectory)) 18 | 19 | def __len__(self): 20 | return len(self.memory) 21 | 22 | class Trajectory(object): 23 | def __init__(self): 24 | self.trajectory = [] 25 | 26 | def push(self, state, next_state, action, reward, mask, policy): 27 | self.trajectory.append(Transition(state, next_state, action, reward, mask, policy)) 28 | 29 | def sample(self): 30 | trajectory = self.trajectory 31 | return Transition(*zip(*trajectory)) 32 | 33 | def __len__(self): 34 | return len(self.trajectory) 35 | -------------------------------------------------------------------------------- /parallel/3-ACER/shared_adam.py: -------------------------------------------------------------------------------- 1 | import torch 2 | class SharedAdam(torch.optim.Adam): # extend a pytorch optimizer so it shares grads across processes 3 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): 4 | super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay) 5 | for group in self.param_groups: 6 | for p in group['params']: 7 | state = self.state[p] 8 | state['shared_steps'], state['step'] = torch.zeros(1).share_memory_(), 0 9 | state['exp_avg'] = p.data.new().resize_as_(p.data).zero_().share_memory_() 10 | state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_().share_memory_() 11 | 12 | def step(self, closure=None): 13 | for group in self.param_groups: 14 | for p in group['params']: 15 | if p.grad is None: continue 16 | self.state[p]['shared_steps'] += 1 17 | self.state[p]['step'] = self.state[p]['shared_steps'][0] - 1 # a "step += 1" comes later 18 | super.step(closure) 19 | 20 | # class SharedAdam(torch.optim.Adam): 21 | # def __init__(self, params, lr=1e-3, betas=(0.9, 0.9), eps=1e-8, 22 | # weight_decay=0): 23 | # super(SharedAdam, self).__init__(params, lr=lr, betas=betas, eps=eps, 
weight_decay=weight_decay) 24 | # # State initialization 25 | # for group in self.param_groups: 26 | # for p in group['params']: 27 | # state = self.state[p] 28 | # state['step'] = 0 29 | # state['exp_avg'] = torch.zeros_like(p.data) 30 | # state['exp_avg_sq'] = torch.zeros_like(p.data) 31 | # 32 | # # share in memory 33 | # state['exp_avg'].share_memory_() 34 | # state['exp_avg_sq'].share_memory_() 35 | -------------------------------------------------------------------------------- /parallel/3-ACER/train.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | 4 | from model import Model 5 | from worker import Worker 6 | from shared_adam import SharedAdam 7 | from tensorboardX import SummaryWriter 8 | import torch.multiprocessing as mp 9 | 10 | from config import env_name, lr 11 | 12 | def main(): 13 | env = gym.make(env_name) 14 | env.seed(500) 15 | torch.manual_seed(500) 16 | 17 | num_inputs = env.observation_space.shape[0] 18 | num_actions = env.action_space.n 19 | env.close() 20 | 21 | global_model = Model(num_inputs, num_actions) 22 | global_average_model = Model(num_inputs, num_actions) 23 | global_model.share_memory() 24 | global_average_model.share_memory() 25 | global_optimizer = SharedAdam(global_model.parameters(), lr=lr) 26 | global_ep, global_ep_r, res_queue = mp.Value('i', 0), mp.Value('d', 0.), mp.Queue() 27 | 28 | writer = SummaryWriter('logs') 29 | 30 | n = mp.cpu_count() 31 | workers = [Worker(global_model, global_average_model, global_optimizer, global_ep, global_ep_r, res_queue, i) for i in range(n)] 32 | [w.start() for w in workers] 33 | res = [] 34 | while True: 35 | r = res_queue.get() 36 | if r is not None: 37 | res.append(r) 38 | [ep, ep_r, loss] = r 39 | writer.add_scalar('log/score', float(ep_r), ep) 40 | writer.add_scalar('log/loss', float(loss), ep) 41 | else: 42 | break 43 | [w.join() for w in workers] 44 | 45 | if __name__=="__main__": 46 | main() 47 | -------------------------------------------------------------------------------- /parallel/3-ACER/worker.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.multiprocessing as mp 4 | import numpy as np 5 | from model import LocalModel 6 | from memory import Memory, Trajectory 7 | from config import env_name, max_episode, log_interval, replay_memory_capacity, replay_ratio 8 | 9 | class Worker(mp.Process): 10 | def __init__(self, global_model, global_average_model, global_optimizer, global_ep, global_ep_r, res_queue, name): 11 | super(Worker, self).__init__() 12 | 13 | self.env = gym.make(env_name) 14 | self.env.seed(500) 15 | 16 | self.name = 'w%i' % name 17 | self.global_ep, self.global_ep_r, self.res_queue = global_ep, global_ep_r, res_queue 18 | self.global_model, self.global_average_model, self.global_optimizer = global_model, global_average_model, global_optimizer 19 | self.local_model = LocalModel(self.env.observation_space.shape[0], self.env.action_space.n) 20 | self.num_actions = self.env.action_space.n 21 | 22 | self.memory = Memory(replay_memory_capacity) 23 | 24 | def record(self, score, loss): 25 | with self.global_ep.get_lock(): 26 | self.global_ep.value += 1 27 | with self.global_ep_r.get_lock(): 28 | if self.global_ep_r.value == 0.: 29 | self.global_ep_r.value = score 30 | else: 31 | self.global_ep_r.value = 0.99 * self.global_ep_r.value + 0.01 * score 32 | if self.global_ep.value % log_interval == 0: 33 | print('{} , {} episode | score: 
{:.2f}'.format( 34 | self.name, self.global_ep.value, self.global_ep_r.value)) 35 | 36 | self.res_queue.put([self.global_ep.value, self.global_ep_r.value, loss]) 37 | 38 | def run(self): 39 | while self.global_ep.value < max_episode: 40 | self.algorithm(True) 41 | n = np.random.poisson(replay_ratio) 42 | for _ in range(n): 43 | self.algorithm(False) 44 | 45 | def algorithm(self, on_policy): 46 | self.local_model.pull_from_global_model(self.global_model) 47 | if not on_policy and len(self.memory) > 100: 48 | trajectory = self.memory.sample() 49 | else: 50 | trajectory, score = self.run_env() 51 | loss = self.local_model.train(on_policy, trajectory, self.global_average_model, self.global_optimizer, self.global_model, self.global_average_model) 52 | if on_policy: 53 | self.record(score, loss) 54 | 55 | 56 | def run_env(self): 57 | done = False 58 | score = 0 59 | steps = 0 60 | 61 | state = self.env.reset() 62 | state = torch.Tensor(state) 63 | state = state.unsqueeze(0) 64 | trajectory = Trajectory() 65 | 66 | while True: 67 | action, policy = self.local_model.get_action(state) 68 | policy = torch.Tensor(policy) 69 | 70 | next_state, reward, done, _ = self.env.step(action) 71 | next_state = torch.Tensor(next_state) 72 | next_state = next_state.unsqueeze(0) 73 | 74 | mask = 0 if done else 1 75 | reward = reward if not done or score == 499 else -1 76 | trajectory.push(state, next_state, action, reward, mask, policy) 77 | 78 | score += reward 79 | state = next_state 80 | 81 | if done: 82 | break 83 | 84 | self.memory.push(trajectory) 85 | trajectory = trajectory.sample() 86 | return trajectory, score 87 | -------------------------------------------------------------------------------- /parallel/5-ApeX/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | lr = 0.002 6 | goal_score = 200 7 | log_interval = 10 8 | max_episode = 30000 9 | 10 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 11 | 12 | 13 | 14 | replay_memory_capacity = 10000 15 | n_step = 3 16 | local_mini_batch = 32 17 | batch_size = 32 18 | alpha = 0.5 19 | beta = 0.4 -------------------------------------------------------------------------------- /parallel/5-ApeX/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import torch 4 | from collections import namedtuple, deque 5 | 6 | from config import gamma, batch_size, alpha, beta 7 | 8 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask', 'step')) 9 | 10 | class N_Step_Buffer(object): 11 | def __init__(self): 12 | self.memory = [] 13 | self.step = 0 14 | 15 | def push(self, state, next_state, action, reward, mask): 16 | self.step += 1 17 | self.memory.append([state, next_state, action, reward, mask]) 18 | 19 | def sample(self): 20 | [state, _, action, _, _] = self.memory[0] 21 | [_, next_state, _, _, mask] = self.memory[-1] 22 | 23 | sum_reward = 0 24 | for t in reversed(range(len(self.memory))): 25 | [_, _, _, reward, _] = self.memory[t] 26 | sum_reward += reward + gamma * sum_reward 27 | reward = sum_reward 28 | step = self.step 29 | self.reset() 30 | 31 | return [state, next_state, action, reward, mask, step] 32 | 33 | def reset(self): 34 | self.memory = [] 35 | self.step = 0 36 | 37 | def __len__(self): 38 | return len(self.memory) 39 | 40 | 41 | class LocalBuffer(object): 42 | def __init__(self): 43 | self.memory = [] 44 | 45 
| def push(self, state, next_state, action, reward, mask, step): 46 | self.memory.append(Transition(state, next_state, action, reward, mask, step)) 47 | 48 | def sample(self): 49 | transitions = self.memory 50 | batch = Transition(*zip(*transitions)) 51 | return batch 52 | 53 | def reset(self): 54 | self.memory = [] 55 | 56 | def __len__(self): 57 | return len(self.memory) 58 | 59 | class Memory(object): 60 | def __init__(self, capacity): 61 | self.capacity = capacity 62 | self.memory = deque(maxlen=capacity) 63 | self.memory_probability = deque(maxlen=capacity) 64 | 65 | def push(self, state, next_state, action, reward, mask, step, prior): 66 | self.memory.append(Transition(state, next_state, action, reward, mask, step)) 67 | self.memory_probability.append(prior) 68 | 69 | def sample(self): 70 | probaility = torch.Tensor(self.memory_probability) 71 | probaility = probaility.pow(alpha) 72 | probaility = probaility / probaility.sum() 73 | 74 | p = probaility.numpy() 75 | 76 | indexes = np.random.choice(range(len(self.memory_probability)), batch_size, p=p) 77 | 78 | transitions = [self.memory[idx] for idx in indexes] 79 | transitions_p = torch.Tensor([self.memory_probability[idx] for idx in indexes]) 80 | 81 | batch = Transition(*zip(*transitions)) 82 | 83 | weights = (self.capacity * transitions_p).pow(-beta) 84 | weights = weights / weights.max() 85 | 86 | return indexes, batch, weights 87 | 88 | def update_prior(self, indexes, priors): 89 | priors_idx = 0 90 | for idx in indexes: 91 | self.memory_probability[idx] = priors[priors_idx] 92 | priors_idx += 1 93 | 94 | def __len__(self): 95 | return len(self.memory) 96 | 97 | 98 | -------------------------------------------------------------------------------- /parallel/5-ApeX/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | class Model(nn.Module): 7 | def __init__(self, num_inputs, num_outputs): 8 | super(Model, self).__init__() 9 | self.num_inputs = num_inputs 10 | self.num_outputs = num_outputs 11 | 12 | self.fc = nn.Linear(num_inputs, 128) 13 | self.fc_adv = nn.Linear(128, num_outputs) 14 | self.fc_val = nn.Linear(128, 1) 15 | 16 | for m in self.modules(): 17 | if isinstance(m, nn.Linear): 18 | nn.init.xavier_uniform(m.weight) 19 | 20 | def forward(self, x): 21 | x = F.relu(self.fc(x)) 22 | adv = self.fc_adv(x) 23 | adv = adv.view(-1, self.num_outputs) 24 | val = self.fc_val(x) 25 | val = val.view(-1, 1) 26 | 27 | qvalue = val + (adv - adv.mean(dim=1, keepdim=True)) 28 | return qvalue 29 | 30 | class LocalModel(Model): 31 | def __init__(self, num_inputs, num_outputs): 32 | super(LocalModel, self).__init__(num_inputs, num_outputs) 33 | 34 | def pull_from_global_model(self, global_model): 35 | self.load_state_dict(global_model.state_dict()) 36 | 37 | def get_action(self, input): 38 | qvalue = self.forward(input) 39 | _, action = torch.max(qvalue, 1) 40 | return action.numpy()[0] -------------------------------------------------------------------------------- /parallel/5-ApeX/train.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | 4 | from model import Model 5 | from worker import Actor, Learner 6 | import torch.multiprocessing as mp 7 | from tensorboardX import SummaryWriter 8 | 9 | from memory import Memory 10 | from config import env_name, lr, replay_memory_capacity 11 | 12 | def main(): 13 | env = gym.make(env_name) 14 
| env.seed(500) 15 | torch.manual_seed(500) 16 | 17 | num_inputs = env.observation_space.shape[0] 18 | num_actions = env.action_space.n 19 | env.close() 20 | 21 | global_target_model = Model(num_inputs, num_actions) 22 | global_online_model = Model(num_inputs, num_actions) 23 | global_target_model.train() 24 | global_online_model.train() 25 | 26 | global_target_model.load_state_dict(global_online_model.state_dict()) 27 | global_target_model.share_memory() 28 | global_online_model.share_memory() 29 | 30 | global_memory = Memory(replay_memory_capacity) 31 | 32 | 33 | global_ep, global_ep_r, res_queue, global_memory_pipe = mp.Value('i', 0), mp.Value('d', 0.), mp.Queue(), mp.Queue() 34 | 35 | writer = SummaryWriter('logs') 36 | 37 | n = 2 38 | epsilons = [(i * 0.05 + 0.1) for i in range(n)] 39 | 40 | actors = [Actor(global_target_model, global_memory_pipe, global_ep, global_ep_r, epsilons[i], i) for i in range(n)] 41 | [w.start() for w in actors] 42 | learner = Learner(global_online_model, global_target_model, global_memory, global_memory_pipe, res_queue) 43 | learner.start() 44 | 45 | res = [] 46 | while True: 47 | r = res_queue.get() 48 | if r is not None: 49 | res.append(r) 50 | [ep, loss] = r 51 | # writer.add_scalar('log/score', float(ep_r), ep) 52 | writer.add_scalar('log/loss', float(loss), ep) 53 | else: 54 | break 55 | [w.join() for w in actors] 56 | 57 | if __name__=="__main__": 58 | main() 59 | -------------------------------------------------------------------------------- /rainbow/1-dqn/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | batch_size = 32 6 | lr = 0.001 7 | initial_exploration = 1000 8 | goal_score = 200 9 | log_interval = 10 10 | update_target = 100 11 | replay_memory_capacity = 1000 12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | -------------------------------------------------------------------------------- /rainbow/1-dqn/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple, deque 3 | 4 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 5 | 6 | 7 | class Memory(object): 8 | def __init__(self, capacity): 9 | self.memory = deque(maxlen=capacity) 10 | self.capacity = capacity 11 | 12 | def push(self, state, next_state, action, reward, mask): 13 | self.memory.append(Transition(state, next_state, action, reward, mask)) 14 | 15 | def sample(self, batch_size): 16 | transitions = random.sample(self.memory, batch_size) 17 | batch = Transition(*zip(*transitions)) 18 | return batch 19 | 20 | def __len__(self): 21 | return len(self.memory) 22 | -------------------------------------------------------------------------------- /rainbow/1-dqn/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from config import gamma 6 | class QNet(nn.Module): 7 | def __init__(self, num_inputs, num_outputs): 8 | super(QNet, self).__init__() 9 | self.num_inputs = num_inputs 10 | self.num_outputs = num_outputs 11 | 12 | self.fc1 = nn.Linear(num_inputs, 128) 13 | self.fc2 = nn.Linear(128, num_outputs) 14 | 15 | for m in self.modules(): 16 | if isinstance(m, nn.Linear): 17 | nn.init.xavier_uniform(m.weight) 18 | 19 | def forward(self, x): 20 | x = F.relu(self.fc1(x)) 21 | qvalue = self.fc2(x) 22 | 
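        # qvalue holds one Q estimate per action. train_model below regresses
        # Q(s, a) for the taken action onto the TD target
        # r + mask * gamma * max_a' Q_target(s', a') with an MSE loss.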
return qvalue 23 | 24 | @classmethod 25 | def train_model(cls, online_net, target_net, optimizer, batch): 26 | states = torch.stack(batch.state) 27 | next_states = torch.stack(batch.next_state) 28 | actions = torch.Tensor(batch.action).float() 29 | rewards = torch.Tensor(batch.reward) 30 | masks = torch.Tensor(batch.mask) 31 | 32 | pred = online_net(states).squeeze(1) 33 | next_pred = target_net(next_states).squeeze(1) 34 | 35 | pred = torch.sum(pred.mul(actions), dim=1) 36 | 37 | target = rewards + masks * gamma * next_pred.max(1)[0] 38 | 39 | loss = F.mse_loss(pred, target.detach()) 40 | optimizer.zero_grad() 41 | loss.backward() 42 | optimizer.step() 43 | 44 | return loss 45 | 46 | def get_action(self, input): 47 | qvalue = self.forward(input) 48 | _, action = torch.max(qvalue, 1) 49 | return action.numpy()[0] 50 | -------------------------------------------------------------------------------- /rainbow/1-dqn/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import QNet 11 | from memory import Memory 12 | from tensorboardX import SummaryWriter 13 | 14 | from config import env_name, initial_exploration, batch_size, update_target, goal_score, log_interval, device, replay_memory_capacity, lr 15 | 16 | 17 | def get_action(state, target_net, epsilon, env): 18 | if np.random.rand() <= epsilon: 19 | return env.action_space.sample() 20 | else: 21 | return target_net.get_action(state) 22 | 23 | def update_target_model(online_net, target_net): 24 | # Target <- Net 25 | target_net.load_state_dict(online_net.state_dict()) 26 | 27 | 28 | def main(): 29 | env = gym.make(env_name) 30 | env.seed(500) 31 | torch.manual_seed(500) 32 | 33 | num_inputs = env.observation_space.shape[0] 34 | num_actions = env.action_space.n 35 | print('state size:', num_inputs) 36 | print('action size:', num_actions) 37 | 38 | online_net = QNet(num_inputs, num_actions) 39 | target_net = QNet(num_inputs, num_actions) 40 | update_target_model(online_net, target_net) 41 | 42 | optimizer = optim.Adam(online_net.parameters(), lr=lr) 43 | writer = SummaryWriter('logs') 44 | 45 | online_net.to(device) 46 | target_net.to(device) 47 | online_net.train() 48 | target_net.train() 49 | memory = Memory(replay_memory_capacity) 50 | running_score = 0 51 | epsilon = 1.0 52 | steps = 0 53 | loss = 0 54 | 55 | for e in range(3000): 56 | done = False 57 | 58 | score = 0 59 | state = env.reset() 60 | state = torch.Tensor(state).to(device) 61 | state = state.unsqueeze(0) 62 | 63 | while not done: 64 | steps += 1 65 | 66 | action = get_action(state, target_net, epsilon, env) 67 | next_state, reward, done, _ = env.step(action) 68 | 69 | next_state = torch.Tensor(next_state) 70 | next_state = next_state.unsqueeze(0) 71 | 72 | mask = 0 if done else 1 73 | reward = reward if not done or score == 499 else -1 74 | action_one_hot = np.zeros(2) 75 | action_one_hot[action] = 1 76 | memory.push(state, next_state, action_one_hot, reward, mask) 77 | 78 | score += reward 79 | state = next_state 80 | 81 | if steps > initial_exploration: 82 | epsilon -= 0.00005 83 | epsilon = max(epsilon, 0.1) 84 | 85 | batch = memory.sample(batch_size) 86 | loss = QNet.train_model(online_net, target_net, optimizer, batch) 87 | 88 | if steps % update_target == 0: 89 | update_target_model(online_net, target_net) 90 | 91 | score = score if score == 500.0 
else score + 1 92 | running_score = 0.99 * running_score + 0.01 * score 93 | if e % log_interval == 0: 94 | print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format( 95 | e, running_score, epsilon)) 96 | writer.add_scalar('log/score', float(running_score), e) 97 | writer.add_scalar('log/loss', float(loss), e) 98 | 99 | if running_score > goal_score: 100 | break 101 | 102 | 103 | if __name__=="__main__": 104 | main() 105 | -------------------------------------------------------------------------------- /rainbow/2-DoubleDQN/README-KR.md: -------------------------------------------------------------------------------- 1 | # Double DQN 2 | 3 | Last Edited: Nov 19, 2018 6:06 PM 4 | Tags: RL 5 | 6 | ## 논문 7 | 8 | double: [https://arxiv.org/pdf/1509.06461.pdf](https://arxiv.org/pdf/1509.06461.pdf) 9 | 10 | duel: [https://arxiv.org/pdf/1511.06581.pdf](https://arxiv.org/pdf/1511.06581.pdf) 11 | 12 | ## Double 13 | 14 | 그냥 DQN 식 15 | 16 | $$loss = (Q(s,a) - r + \gamma Q'(s, argmax_{a'}Q'(s,a'))^2$$ 17 | 18 | Double DQN 식 19 | 20 | $$loss = (Q(s,a) - r + \gamma Q'(s, argma_{a'}Q(s,a'))^2$$ 21 | 22 | Action 선택을 `target_net` 으로 하는지 `main_net` 으로 하는지의 차이만 있을 뿐이다. 23 | 24 | DQN에서는 단순하게 `target_net`으로 Action을 선택했는데 이 경우에는 만약 `target_net`이 가장 큰 `qvalue`을 가지고 있는 `action`을 선택하면 그 `action`이 다시 `Q-value`을 증가 시키고 다시 그 `action`이 선택 되는 순환이 발생 할 수 있기 때문에 `action` 을 선택하는 `net` 과 `value` 을 평가하는 `net` 을 분리시킨다. 25 | 26 | ## 구현 27 | 28 | ```python 29 | def train_model(cls, net, target_net, optimizer, batch, batch_size): 30 | states = torch.stack(batch.state) 31 | next_states = torch.stack(batch.next_state) 32 | actions = torch.Tensor(batch.action).float() 33 | rewards = torch.Tensor(batch.reward) 34 | masks = torch.Tensor(batch.mask) 35 | 36 | pred = net(states).squeeze(1) 37 | _, action_from_net = net(next_states).squeeze(1).max(1) 38 | next_pred = target_net(next_states).squeeze(1) 39 | 40 | pred = torch.sum(pred.mul(actions), dim=1) 41 | 42 | target = rewards + masks * gamma * next_pred.gather(1, action_from_net.unsqueeze(1)).squeeze(1) 43 | ``` 44 | 45 | 46 | ​ 47 | ```python 48 | loss = F.mse_loss(pred, target.detach()) 49 | optimizer.zero_grad() 50 | loss.backward() 51 | optimizer.step() 52 | ``` 53 | 54 | 일단 `action` 을 `net` 의 `max` 로 구한다. 그 뒤 `target_net` 에서 그 `action` 값에 맞는 `Q-value`을 사용한다. 
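
The difference is easiest to see with the two targets written side by side. The sketch below is illustrative only; `online_net`, `target_net`, `next_states` (assumed to be a plain `[batch, num_inputs]` tensor), `rewards`, `masks` and `gamma` are taken to exist as in `train_model` above.

```python
import torch

def dqn_and_double_dqn_targets(online_net, target_net, next_states, rewards, masks, gamma):
    with torch.no_grad():
        # DQN: the target network both selects and evaluates the next action.
        dqn_target = rewards + masks * gamma * target_net(next_states).max(1)[0]

        # Double DQN: the online network selects the action,
        # the target network only evaluates it.
        best_actions = online_net(next_states).argmax(dim=1, keepdim=True)
        double_dqn_target = rewards + masks * gamma * \
            target_net(next_states).gather(1, best_actions).squeeze(1)
    return dqn_target, double_dqn_target
```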
-------------------------------------------------------------------------------- /rainbow/2-DoubleDQN/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | batch_size = 32 6 | lr = 0.001 7 | initial_exploration = 1000 8 | goal_score = 200 9 | log_interval = 10 10 | update_target = 100 11 | replay_memory_capacity = 1000 12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | -------------------------------------------------------------------------------- /rainbow/2-DoubleDQN/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple 3 | 4 | 5 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 6 | 7 | 8 | class Memory(object): 9 | def __init__(self, capacity): 10 | self.memory = [] 11 | self.capacity = capacity 12 | self.position = 0 13 | 14 | def push(self, state, next_state, action, reward, mask): 15 | """Saves a transition.""" 16 | if len(self.memory) < self.capacity: 17 | self.memory.append(Transition(state, next_state, action, reward, mask)) 18 | self.memory[self.position] = Transition(state, next_state, action, reward, mask) 19 | self.position = (self.position + 1) % self.capacity 20 | 21 | def sample(self, batch_size): 22 | transitions = random.sample(self.memory, batch_size) 23 | batch = Transition(*zip(*transitions)) 24 | return batch 25 | 26 | def __len__(self): 27 | return len(self.memory) 28 | -------------------------------------------------------------------------------- /rainbow/2-DoubleDQN/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from config import gamma 5 | 6 | class DoubleDQNet(nn.Module): 7 | def __init__(self, num_inputs, num_outputs): 8 | super(DoubleDQNet, self).__init__() 9 | self.num_inputs = num_inputs 10 | self.num_outputs = num_outputs 11 | 12 | self.fc1 = nn.Linear(num_inputs, 128) 13 | self.fc2 = nn.Linear(128, num_outputs) 14 | 15 | for m in self.modules(): 16 | if isinstance(m, nn.Linear): 17 | nn.init.xavier_uniform(m.weight) 18 | 19 | def forward(self, x): 20 | x = F.relu(self.fc1(x)) 21 | qvalue = self.fc2(x) 22 | return qvalue 23 | 24 | @classmethod 25 | def train_model(cls, online_net, target_net, optimizer, batch): 26 | states = torch.stack(batch.state) 27 | next_states = torch.stack(batch.next_state) 28 | actions = torch.Tensor(batch.action).float() 29 | rewards = torch.Tensor(batch.reward) 30 | masks = torch.Tensor(batch.mask) 31 | 32 | pred = online_net(states).squeeze(1) 33 | _, action_from_online_net = online_net(next_states).squeeze(1).max(1) 34 | next_pred = target_net(next_states).squeeze(1) 35 | 36 | pred = torch.sum(pred.mul(actions), dim=1) 37 | 38 | target = rewards + masks * gamma * next_pred.gather(1, action_from_online_net.unsqueeze(1)).squeeze(1) 39 | 40 | 41 | loss = F.mse_loss(pred, target.detach()) 42 | optimizer.zero_grad() 43 | loss.backward() 44 | optimizer.step() 45 | 46 | return loss 47 | 48 | def get_action(self, input): 49 | qvalue = self.forward(input) 50 | _, action = torch.max(qvalue, 1) 51 | return action.numpy()[0] 52 | -------------------------------------------------------------------------------- /rainbow/2-DoubleDQN/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import 
sys 3 | import gym 4 | import random 5 | import argparse 6 | import numpy as np 7 | 8 | import torch 9 | import torch.optim as optim 10 | import torch.nn.functional as F 11 | from model import DoubleDQNet 12 | from memory import Memory 13 | from tensorboardX import SummaryWriter 14 | from config import env_name, gamma, initial_exploration, batch_size, update_target, log_interval, goal_score, device, replay_memory_capacity, lr 15 | 16 | 17 | def get_action(state, target_net, epsilon, env): 18 | if np.random.rand() <= epsilon: 19 | return env.action_space.sample() 20 | else: 21 | return target_net.get_action(state) 22 | 23 | 24 | def update_target_model(online_net, target_net): 25 | # Target -> Net 26 | target_net.load_state_dict(online_net.state_dict()) 27 | 28 | 29 | def main(): 30 | env = gym.make(env_name) 31 | env.seed(500) 32 | torch.manual_seed(500) 33 | 34 | num_inputs = env.observation_space.shape[0] 35 | num_actions = env.action_space.n 36 | print('state size:', num_inputs) 37 | print('action size:', num_actions) 38 | 39 | online_net = DoubleDQNet(num_inputs, num_actions) 40 | target_net = DoubleDQNet(num_inputs, num_actions) 41 | update_target_model(online_net, target_net) 42 | 43 | optimizer = optim.Adam(online_net.parameters(), lr=lr) 44 | writer = SummaryWriter('logs') 45 | 46 | online_net.to(device) 47 | target_net.to(device) 48 | online_net.train() 49 | target_net.train() 50 | memory = Memory(replay_memory_capacity) 51 | running_score = 0 52 | epsilon = 1.0 53 | steps = 0 54 | loss = 0 55 | 56 | for e in range(3000): 57 | done = False 58 | 59 | score = 0 60 | state = env.reset() 61 | state = torch.Tensor(state).to(device) 62 | state = state.unsqueeze(0) 63 | 64 | while not done: 65 | steps += 1 66 | action = get_action(state, target_net, epsilon, env) 67 | next_state, reward, done, _ = env.step(action) 68 | 69 | next_state = torch.Tensor(next_state) 70 | next_state = next_state.unsqueeze(0) 71 | 72 | mask = 0 if done else 1 73 | reward = reward if not done or score == 499 else -1 74 | action_one_hot = np.zeros(2) 75 | action_one_hot[action] = 1 76 | memory.push(state, next_state, action_one_hot, reward, mask) 77 | 78 | score += reward 79 | state = next_state 80 | 81 | if steps > initial_exploration: 82 | epsilon -= 0.00005 83 | epsilon = max(epsilon, 0.1) 84 | 85 | batch = memory.sample(batch_size) 86 | loss = DoubleDQNet.train_model(online_net, target_net, optimizer, batch) 87 | 88 | if steps % update_target == 0: 89 | update_target_model(online_net, target_net) 90 | 91 | score = score if score == 500.0 else score + 1 92 | running_score = 0.99 * running_score + 0.01 * score 93 | if e % log_interval == 0: 94 | print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format( 95 | e, running_score, epsilon)) 96 | writer.add_scalar('log/score', float(running_score), e) 97 | writer.add_scalar('log/loss', float(loss), e) 98 | 99 | if running_score > goal_score: 100 | break 101 | 102 | 103 | if __name__=="__main__": 104 | main() 105 | -------------------------------------------------------------------------------- /rainbow/3-DuelDQN/README-KR.md: -------------------------------------------------------------------------------- 1 | # Duel DQN 2 | 3 | Last Edited: Nov 19, 2018 6:06 PM 4 | Tags: RL 5 | 6 | ## Duel 7 | 8 | $$Q(s,a) = V(s) + A(s,a)$$ 9 | 10 | `Q` 함수를 `V`와 `A`로 분리한다. `A`는 즉 현재의 상태에 대한 기댓값과 행동에 대한 기댓값을 분리한다. 이로써 어떠한 행동에 대한 기댓값을 좀더 잘 추측 할 수 있다. 
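
This decomposition by itself is not identifiable: adding any constant to `V` and subtracting it from `A` yields the same `Q`, which is why the correction terms discussed below are needed.

$$Q(s,a) = V(s) + A(s,a) = (V(s) + c) + (A(s,a) - c)$$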
11 | 12 | ![](Screenshot2018-11-1519-f88e4bf3-f581-4b24-a07e-af467a4bba64.14.23.png) 13 | 14 | 구하는 방법은 단순하게 layer 을 하나 더 만들어서 `A`을 output으로 하게 하면된다. 15 | 16 | 하지만 단순하게 위의 식대로 할 경우 논문에서는 하나의 `Q` 에 대해서 `V`와 `S`가 unique 하지 않게 되고 그것 때문에 성능이 안 좋아진다고 한다. 그래서 17 | 18 | $$Q(s,a) = V(s) + (A(s,a) - max_{a'}A(s,a'))$$ 19 | 20 | 로 식을 바꾼다. 이로써 Q 가 수렴 되었을 때는 항상 가장 좋은 액션만 선택한다고 되었을 때이고 뒤의 `A` 항이 전부다 0이 되기 때문에 하나의 Q 값에 대해서 V와 A에 대한 항 부분을 unique 하게 찾을 수 있다. 21 | 22 | $$Q(s,a) = V(s) + (A(s,a) - Avg(A))$$ 23 | 24 | 그리고 다시 식을 위처럼 바꾸는데, 위의 식처럼 할 경우 `A`항의 부분이 0이 되는게 더 빨리 수렴이 되기 때문에 더 빠르게 학습이 된다고 한다. 위의 경우는 max 로 수렴, 밑의 부분은 평균으로 수렴. 25 | 26 | ```python 27 | class DuelDQNet(nn.Module): 28 | def __init__(self, num_inputs, num_outputs): 29 | super(DuelDQNet, self).__init__() 30 | self.num_inputs = num_inputs 31 | self.num_outputs = num_outputs 32 | 33 | self.fc = nn.Linear(num_inputs, 128) 34 | self.fc_adv = nn.Linear(128, num_outputs) 35 | self.fc_val = nn.Linear(128, 1) 36 | 37 | for m in self.modules(): 38 | if isinstance(m, nn.Linear): 39 | nn.init.xavier_uniform(m.weight) 40 | 41 | def forward(self, x): 42 | x = F.relu(self.fc(x)) 43 | adv = self.fc_adv(x) 44 | val = self.fc_val(x) 45 | 46 | qvalue = val + (adv - adv.mean()) 47 | return qvalue 48 | ``` 49 | 50 | 실제 구현에서는 이렇게 간단하게 `adv`와 `val` 을 나눠서 마지막에 `qvalue`을 만들어 주는 것만으로 구현이 가능하다. -------------------------------------------------------------------------------- /rainbow/3-DuelDQN/Screenshot2018-11-1519-f88e4bf3-f581-4b24-a07e-af467a4bba64.14.23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choru-k/Reinforcement-Learning-Pytorch-Cartpole/ecb7b622cfefe825ac95388cceb6752413d90a2a/rainbow/3-DuelDQN/Screenshot2018-11-1519-f88e4bf3-f581-4b24-a07e-af467a4bba64.14.23.png -------------------------------------------------------------------------------- /rainbow/3-DuelDQN/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | batch_size = 32 6 | lr = 0.001 7 | initial_exploration = 1000 8 | goal_score = 200 9 | log_interval = 10 10 | update_target = 100 11 | replay_memory_capacity = 1000 12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | -------------------------------------------------------------------------------- /rainbow/3-DuelDQN/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple 3 | 4 | 5 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 6 | 7 | 8 | class Memory(object): 9 | def __init__(self, capacity): 10 | self.memory = [] 11 | self.capacity = capacity 12 | self.position = 0 13 | 14 | def push(self, state, next_state, action, reward, mask): 15 | """Saves a transition.""" 16 | if len(self.memory) < self.capacity: 17 | self.memory.append(Transition(state, next_state, action, reward, mask)) 18 | self.memory[self.position] = Transition(state, next_state, action, reward, mask) 19 | self.position = (self.position + 1) % self.capacity 20 | 21 | def sample(self, batch_size): 22 | transitions = random.sample(self.memory, batch_size) 23 | batch = Transition(*zip(*transitions)) 24 | return batch 25 | 26 | def __len__(self): 27 | return len(self.memory) 28 | -------------------------------------------------------------------------------- /rainbow/3-DuelDQN/model.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from config import gamma 5 | 6 | class DuelDQNet(nn.Module): 7 | def __init__(self, num_inputs, num_outputs): 8 | super(DuelDQNet, self).__init__() 9 | self.num_inputs = num_inputs 10 | self.num_outputs = num_outputs 11 | 12 | self.fc = nn.Linear(num_inputs, 128) 13 | self.fc_adv = nn.Linear(128, num_outputs) 14 | self.fc_val = nn.Linear(128, 1) 15 | 16 | for m in self.modules(): 17 | if isinstance(m, nn.Linear): 18 | nn.init.xavier_uniform(m.weight) 19 | 20 | def forward(self, x): 21 | x = F.relu(self.fc(x)) 22 | adv = self.fc_adv(x) 23 | adv = adv.view(-1, self.num_outputs) 24 | val = self.fc_val(x) 25 | val = val.view(-1, 1) 26 | 27 | qvalue = val + (adv - adv.mean(dim=1, keepdim=True)) 28 | return qvalue 29 | 30 | @classmethod 31 | def train_model(cls, online_net, target_net, optimizer, batch): 32 | states = torch.stack(batch.state) 33 | next_states = torch.stack(batch.next_state) 34 | actions = torch.Tensor(batch.action).float() 35 | rewards = torch.Tensor(batch.reward) 36 | masks = torch.Tensor(batch.mask) 37 | 38 | pred = online_net(states).squeeze(1) 39 | next_pred = target_net(next_states).squeeze(1) 40 | 41 | pred = torch.sum(pred.mul(actions), dim=1) 42 | 43 | target = rewards + masks * gamma * next_pred.max(1)[0] 44 | 45 | 46 | loss = F.mse_loss(pred, target.detach()) 47 | optimizer.zero_grad() 48 | loss.backward() 49 | optimizer.step() 50 | 51 | return loss 52 | 53 | def get_action(self, input): 54 | qvalue = self.forward(input) 55 | _, action = torch.max(qvalue, 1) 56 | return action.numpy()[0] 57 | -------------------------------------------------------------------------------- /rainbow/3-DuelDQN/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import argparse 6 | import numpy as np 7 | 8 | import torch 9 | import torch.optim as optim 10 | import torch.nn.functional as F 11 | from model import DuelDQNet 12 | from memory import Memory 13 | from tensorboardX import SummaryWriter 14 | from config import env_name, gamma, initial_exploration, batch_size, update_target, log_interval, goal_score, device, replay_memory_capacity, lr 15 | 16 | 17 | def get_action(state, target_net, epsilon, env): 18 | if np.random.rand() <= epsilon: 19 | return env.action_space.sample() 20 | else: 21 | return target_net.get_action(state) 22 | 23 | 24 | 25 | def update_target_model(online_net, target_net): 26 | # Target -> Net 27 | target_net.load_state_dict(online_net.state_dict()) 28 | 29 | 30 | def main(): 31 | env = gym.make(env_name) 32 | env.seed(500) 33 | torch.manual_seed(500) 34 | 35 | num_inputs = env.observation_space.shape[0] 36 | num_actions = env.action_space.n 37 | print('state size:', num_inputs) 38 | print('action size:', num_actions) 39 | 40 | online_net = DuelDQNet(num_inputs, num_actions) 41 | target_net = DuelDQNet(num_inputs, num_actions) 42 | update_target_model(online_net, target_net) 43 | 44 | optimizer = optim.Adam(online_net.parameters(), lr=lr) 45 | writer = SummaryWriter('logs') 46 | 47 | online_net.to(device) 48 | target_net.to(device) 49 | online_net.train() 50 | target_net.train() 51 | memory = Memory(replay_memory_capacity) 52 | running_score = 0 53 | epsilon = 1.0 54 | steps = 0 55 | loss = 0 56 | 57 | for e in range(3000): 58 | done = False 59 | 60 | score = 0 61 | state = env.reset() 62 | state = 
torch.Tensor(state).to(device) 63 | state = state.unsqueeze(0) 64 | 65 | while not done: 66 | steps += 1 67 | action = get_action(state, target_net, epsilon, env) 68 | next_state, reward, done, _ = env.step(action) 69 | 70 | next_state = torch.Tensor(next_state) 71 | next_state = next_state.unsqueeze(0) 72 | 73 | mask = 0 if done else 1 74 | reward = reward if not done or score == 499 else -1 75 | action_one_hot = np.zeros(2) 76 | action_one_hot[action] = 1 77 | memory.push(state, next_state, action_one_hot, reward, mask) 78 | 79 | score += reward 80 | state = next_state 81 | 82 | if steps > initial_exploration: 83 | epsilon -= 0.00005 84 | epsilon = max(epsilon, 0.1) 85 | 86 | batch = memory.sample(batch_size) 87 | loss = DuelDQNet.train_model(online_net, target_net, optimizer, batch) 88 | 89 | if steps % update_target == 0: 90 | update_target_model(online_net, target_net) 91 | 92 | score = score if score == 500.0 else score + 1 93 | running_score = 0.99 * running_score + 0.01 * score 94 | if e % log_interval == 0: 95 | print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format( 96 | e, running_score, epsilon)) 97 | writer.add_scalar('log/score', float(running_score), e) 98 | writer.add_scalar('log/loss', float(loss), e) 99 | 100 | if running_score > goal_score: 101 | break 102 | 103 | 104 | if __name__=="__main__": 105 | main() 106 | -------------------------------------------------------------------------------- /rainbow/4-multistep/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | batch_size = 32 6 | lr = 0.001 7 | initial_exploration = 1000 8 | goal_score = 200 9 | log_interval = 10 10 | update_target = 100 11 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 12 | replay_memory_capacity = 1000 13 | 14 | 15 | n_step = 3 16 | -------------------------------------------------------------------------------- /rainbow/4-multistep/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple, deque 3 | from config import n_step, gamma 4 | 5 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 6 | 7 | 8 | class Memory(object): 9 | def __init__(self, capacity): 10 | self.memory = deque(maxlen=capacity) 11 | self.capacity = capacity 12 | self.reset_local() 13 | 14 | def reset_local(self): 15 | self.local_step = 0 16 | self.local_state = None 17 | self.local_action = None 18 | self.local_rewards = [] 19 | 20 | def push(self, state, next_state, action, reward, mask): 21 | self.local_step += 1 22 | self.local_rewards.append(reward) 23 | if self.local_step == 1: 24 | self.local_state = state 25 | self.local_action = action 26 | if self.local_step == n_step: 27 | reward = 0 28 | for idx, local_reward in enumerate(self.local_rewards): 29 | reward += (gamma ** idx) * local_reward 30 | self.memory.append(Transition(self.local_state, next_state, self.local_action, reward, mask)) 31 | self.reset_local() 32 | if mask == 0: 33 | self.reset_local() 34 | 35 | def sample(self, batch_size): 36 | transitions = random.sample(self.memory, batch_size) 37 | batch = Transition(*zip(*transitions)) 38 | return batch 39 | 40 | def __len__(self): 41 | return len(self.memory) 42 | -------------------------------------------------------------------------------- /rainbow/4-multistep/model.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from config import gamma, n_step 6 | class QNet(nn.Module): 7 | def __init__(self, num_inputs, num_outputs): 8 | super(QNet, self).__init__() 9 | self.num_inputs = num_inputs 10 | self.num_outputs = num_outputs 11 | 12 | self.fc1 = nn.Linear(num_inputs, 128) 13 | self.fc2 = nn.Linear(128, num_outputs) 14 | 15 | for m in self.modules(): 16 | if isinstance(m, nn.Linear): 17 | nn.init.xavier_uniform(m.weight) 18 | 19 | def forward(self, x): 20 | x = F.relu(self.fc1(x)) 21 | qvalue = self.fc2(x) 22 | return qvalue 23 | 24 | @classmethod 25 | def train_model(cls, online_net, target_net, optimizer, batch): 26 | states = torch.stack(batch.state) 27 | next_states = torch.stack(batch.next_state) 28 | actions = torch.Tensor(batch.action).float() 29 | rewards = torch.Tensor(batch.reward) 30 | masks = torch.Tensor(batch.mask) 31 | 32 | pred = online_net(states).squeeze(1) 33 | next_pred = target_net(next_states).squeeze(1) 34 | 35 | pred = torch.sum(pred.mul(actions), dim=1) 36 | 37 | target = rewards + masks * (gamma ** n_step) * next_pred.max(1)[0] 38 | 39 | loss = F.mse_loss(pred, target.detach()) 40 | optimizer.zero_grad() 41 | loss.backward() 42 | optimizer.step() 43 | 44 | return loss 45 | 46 | def get_action(self, input): 47 | qvalue = self.forward(input) 48 | _, action = torch.max(qvalue, 1) 49 | return action.numpy()[0] 50 | -------------------------------------------------------------------------------- /rainbow/4-multistep/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import QNet 11 | from memory import Memory 12 | from tensorboardX import SummaryWriter 13 | 14 | from config import env_name, initial_exploration, batch_size, update_target, goal_score, log_interval, device, replay_memory_capacity, lr 15 | 16 | 17 | def get_action(state, target_net, epsilon, env): 18 | if np.random.rand() <= epsilon: 19 | return env.action_space.sample() 20 | else: 21 | return target_net.get_action(state) 22 | 23 | 24 | 25 | def update_target_model(online_net, target_net): 26 | # Target <- Net 27 | target_net.load_state_dict(online_net.state_dict()) 28 | 29 | 30 | def main(): 31 | env = gym.make(env_name) 32 | env.seed(500) 33 | torch.manual_seed(500) 34 | 35 | num_inputs = env.observation_space.shape[0] 36 | num_actions = env.action_space.n 37 | print('state size:', num_inputs) 38 | print('action size:', num_actions) 39 | 40 | online_net = QNet(num_inputs, num_actions) 41 | target_net = QNet(num_inputs, num_actions) 42 | update_target_model(online_net, target_net) 43 | 44 | optimizer = optim.Adam(online_net.parameters(), lr=lr) 45 | writer = SummaryWriter('logs') 46 | 47 | online_net.to(device) 48 | target_net.to(device) 49 | online_net.train() 50 | target_net.train() 51 | memory = Memory(replay_memory_capacity) 52 | running_score = 0 53 | epsilon = 1.0 54 | steps = 0 55 | loss = 0 56 | 57 | for e in range(3000): 58 | done = False 59 | 60 | score = 0 61 | state = env.reset() 62 | state = torch.Tensor(state).to(device) 63 | state = state.unsqueeze(0) 64 | 65 | while not done: 66 | 67 | steps += 1 68 | 69 | action = get_action(state, target_net, epsilon, env) 70 | next_state, reward, done, _ = env.step(action) 71 | 
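            # memory.push below does the multi-step bookkeeping: it buffers
            # transitions locally and, every n_step steps, stores a single
            # transition whose reward is sum_i gamma**i * r_{t+i} and whose
            # next_state lies n_step steps ahead. train_model then bootstraps
            # with gamma ** n_step. A partial window at episode end is discarded.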
72 | next_state = torch.Tensor(next_state) 73 | next_state = next_state.unsqueeze(0) 74 | 75 | mask = 0 if done else 1 76 | reward = reward if not done or score == 499 else -1 77 | action_one_hot = np.zeros(2) 78 | action_one_hot[action] = 1 79 | memory.push(state, next_state, action_one_hot, reward, mask) 80 | 81 | score += reward 82 | state = next_state 83 | 84 | if steps > initial_exploration: 85 | epsilon -= 0.00005 86 | epsilon = max(epsilon, 0.1) 87 | 88 | batch = memory.sample(batch_size) 89 | loss = QNet.train_model(online_net, target_net, optimizer, batch) 90 | 91 | if steps % update_target == 0: 92 | update_target_model(online_net, target_net) 93 | 94 | score = score if score == 500.0 else score + 1 95 | running_score = 0.99 * running_score + 0.01 * score 96 | if e % log_interval == 0: 97 | print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format( 98 | e, running_score, epsilon)) 99 | writer.add_scalar('log/score', float(running_score), e) 100 | writer.add_scalar('log/loss', float(loss), e) 101 | 102 | if running_score > goal_score: 103 | break 104 | 105 | 106 | if __name__=="__main__": 107 | main() 108 | -------------------------------------------------------------------------------- /rainbow/5-per/Screenshot2018-11-1514-a431e580-fd9d-4a07-afd1-5f80e0042c23.45.16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choru-k/Reinforcement-Learning-Pytorch-Cartpole/ecb7b622cfefe825ac95388cceb6752413d90a2a/rainbow/5-per/Screenshot2018-11-1514-a431e580-fd9d-4a07-afd1-5f80e0042c23.45.16.png -------------------------------------------------------------------------------- /rainbow/5-per/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | batch_size = 32 6 | lr = 0.001 7 | initial_exploration = 1000 8 | goal_score = 200 9 | log_interval = 10 10 | update_target = 100 11 | replay_memory_capacity = 1000 12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | 14 | small_epsilon = 0.0001 15 | alpha = 0.5 16 | beta_start = 0.1 17 | -------------------------------------------------------------------------------- /rainbow/5-per/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | from collections import namedtuple, deque 4 | import torch 5 | from model import QNet 6 | from config import small_epsilon, gamma, alpha, device 7 | 8 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 9 | 10 | 11 | class Memory_With_TDError(object): 12 | def __init__(self, capacity): 13 | self.memory = deque(maxlen=capacity) 14 | self.memory_probabiliy = deque(maxlen=capacity) 15 | self.capacity = capacity 16 | 17 | def push(self, state, next_state, action, reward, mask): 18 | """Saves a transition.""" 19 | if len(self.memory) > 0: 20 | max_probability = max(self.memory_probabiliy) 21 | else: 22 | max_probability = small_epsilon 23 | self.memory.append(Transition(state, next_state, action, reward, mask)) 24 | self.memory_probabiliy.append(max_probability) 25 | 26 | def sample(self, batch_size, net, target_net, beta): 27 | probability_sum = sum(self.memory_probabiliy) 28 | p = [probability / probability_sum for probability in self.memory_probabiliy] 29 | # print(len(self.memory_probabiliy)) 30 | indexes = np.random.choice(np.arange(len(self.memory)), batch_size, p=p) 31 | transitions = 
[self.memory[idx] for idx in indexes] 32 | transitions_p = [p[idx] for idx in indexes] 33 | batch = Transition(*zip(*transitions)) 34 | 35 | weights = [pow(self.capacity * p_j, -beta) for p_j in transitions_p] 36 | weights = torch.Tensor(weights).to(device) 37 | # print(weights) 38 | weights = weights / weights.max() 39 | # print(weights) 40 | 41 | td_error = QNet.get_td_error(net, target_net, batch.state, batch.next_state, batch.action, batch.reward, batch.mask) 42 | 43 | td_error_idx = 0 44 | for idx in indexes: 45 | self.memory_probabiliy[idx] = pow(abs(td_error[td_error_idx]) + small_epsilon, alpha).item() 46 | # print(pow(abs(td_error[td_error_idx]) + small_epsilon, alpha).item()) 47 | td_error_idx += 1 48 | 49 | 50 | return batch, weights 51 | 52 | def __len__(self): 53 | return len(self.memory) 54 | -------------------------------------------------------------------------------- /rainbow/5-per/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from config import gamma 5 | 6 | class QNet(nn.Module): 7 | def __init__(self, num_inputs, num_outputs): 8 | super(QNet, self).__init__() 9 | self.num_inputs = num_inputs 10 | self.num_outputs = num_outputs 11 | 12 | self.fc1 = nn.Linear(num_inputs, 128) 13 | self.fc2 = nn.Linear(128, num_outputs) 14 | 15 | for m in self.modules(): 16 | if isinstance(m, nn.Linear): 17 | nn.init.xavier_uniform(m.weight) 18 | 19 | def forward(self, x): 20 | x = F.relu(self.fc1(x)) 21 | qvalue = self.fc2(x) 22 | return qvalue 23 | 24 | @classmethod 25 | def get_td_error(cls, online_net, target_net, state, next_state, action, reward, mask): 26 | state = torch.stack(state) 27 | next_state = torch.stack(next_state) 28 | action = torch.Tensor(action) 29 | reward = torch.Tensor(reward) 30 | mask = torch.Tensor(mask) 31 | 32 | pred = online_net(state).squeeze(1) 33 | next_pred = target_net(next_state).squeeze(1) 34 | 35 | pred = torch.sum(pred.mul(action), dim=1) 36 | 37 | target = reward + mask * gamma * next_pred.max(1)[0] 38 | 39 | td_error = pred - target.detach() 40 | 41 | return td_error 42 | 43 | @classmethod 44 | def train_model(cls, online_net, target_net, optimizer, batch, weights): 45 | td_error = cls.get_td_error(online_net, target_net, batch.state, batch.next_state, batch.action, batch.reward, batch.mask) 46 | loss = pow(td_error, 2) * weights 47 | loss = loss.mean() 48 | 49 | optimizer.zero_grad() 50 | loss.backward() 51 | optimizer.step() 52 | 53 | return loss 54 | 55 | def get_action(self, input): 56 | qvalue = self.forward(input) 57 | _, action = torch.max(qvalue, 1) 58 | return action.numpy()[0] 59 | -------------------------------------------------------------------------------- /rainbow/5-per/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import argparse 6 | import numpy as np 7 | 8 | import torch 9 | import torch.optim as optim 10 | import torch.nn.functional as F 11 | from model import QNet 12 | from memory import Memory_With_TDError 13 | from tensorboardX import SummaryWriter 14 | from config import env_name, gamma, initial_exploration, batch_size, update_target, log_interval, goal_score, device, replay_memory_capacity, lr, beta_start 15 | 16 | 17 | def get_action(state, target_net, epsilon, env): 18 | if np.random.rand() <= epsilon: 19 | return env.action_space.sample() 20 | else: 21 | return 
target_net.get_action(state) 22 | 23 | 24 | def update_target_model(online_net, target_net): 25 | # Target -> Net 26 | target_net.load_state_dict(online_net.state_dict()) 27 | 28 | 29 | def main(): 30 | env = gym.make(env_name) 31 | env.seed(500) 32 | torch.manual_seed(500) 33 | 34 | num_inputs = env.observation_space.shape[0] 35 | num_actions = env.action_space.n 36 | print('state size:', num_inputs) 37 | print('action size:', num_actions) 38 | 39 | online_net = QNet(num_inputs, num_actions) 40 | target_net = QNet(num_inputs, num_actions) 41 | update_target_model(online_net, target_net) 42 | 43 | optimizer = optim.Adam(online_net.parameters(), lr=lr) 44 | writer = SummaryWriter('logs') 45 | 46 | online_net.to(device) 47 | target_net.to(device) 48 | online_net.train() 49 | target_net.train() 50 | memory = Memory_With_TDError(replay_memory_capacity) 51 | running_score = 0 52 | epsilon = 1.0 53 | steps = 0 54 | beta = beta_start 55 | loss = 0 56 | 57 | for e in range(3000): 58 | done = False 59 | 60 | score = 0 61 | state = env.reset() 62 | state = torch.Tensor(state).to(device) 63 | state = state.unsqueeze(0) 64 | 65 | while not done: 66 | steps += 1 67 | action = get_action(state, target_net, epsilon, env) 68 | next_state, reward, done, _ = env.step(action) 69 | 70 | next_state = torch.Tensor(next_state) 71 | next_state = next_state.unsqueeze(0) 72 | 73 | mask = 0 if done else 1 74 | reward = reward if not done or score == 499 else -1 75 | action_one_hot = np.zeros(2) 76 | action_one_hot[action] = 1 77 | memory.push(state, next_state, action_one_hot, reward, mask) 78 | 79 | score += reward 80 | state = next_state 81 | 82 | if steps > initial_exploration: 83 | epsilon -= 0.00005 84 | epsilon = max(epsilon, 0.1) 85 | beta += 0.00005 86 | beta = min(1, beta) 87 | 88 | batch, weights = memory.sample(batch_size, online_net, target_net, beta) 89 | loss = QNet.train_model(online_net, target_net, optimizer, batch, weights) 90 | 91 | if steps % update_target == 0: 92 | update_target_model(online_net, target_net) 93 | 94 | score = score if score == 500.0 else score + 1 95 | running_score = 0.99 * running_score + 0.01 * score 96 | if e % log_interval == 0: 97 | print('{} episode | score: {:.2f} | epsilon: {:.2f} | beta: {:.2f}'.format( 98 | e, running_score, epsilon, beta)) 99 | writer.add_scalar('log/score', float(running_score), e) 100 | writer.add_scalar('log/loss', float(loss), e) 101 | 102 | if running_score > goal_score: 103 | break 104 | 105 | 106 | if __name__=="__main__": 107 | main() 108 | -------------------------------------------------------------------------------- /rainbow/6-Nosiy_net/Screenshot2018-11-1616-fd936286-4e40-4962-99ff-1ddd3b7deeb8.36.21.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choru-k/Reinforcement-Learning-Pytorch-Cartpole/ecb7b622cfefe825ac95388cceb6752413d90a2a/rainbow/6-Nosiy_net/Screenshot2018-11-1616-fd936286-4e40-4962-99ff-1ddd3b7deeb8.36.21.png -------------------------------------------------------------------------------- /rainbow/6-Nosiy_net/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | batch_size = 32 6 | lr = 0.001 7 | initial_exploration = 1000 8 | goal_score = 200 9 | log_interval = 10 10 | update_target = 100 11 | replay_memory_capacity = 1000 12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | 14 | 15 | sigma_zero = 0.5 16 | 
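# sigma_zero is the sigma_0 constant from the NoisyNet paper: NoisyLinear in
# model.py initialises every sigma parameter to sigma_zero / sqrt(fan) and
# resamples factorized Gaussian noise f(x) = sign(x) * sqrt(|x|) after each
# training step.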
-------------------------------------------------------------------------------- /rainbow/6-Nosiy_net/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple, deque 3 | 4 | 5 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 6 | 7 | 8 | class Memory(object): 9 | def __init__(self, capacity): 10 | self.memory = deque(maxlen=capacity) 11 | self.capacity = capacity 12 | 13 | def push(self, state, next_state, action, reward, mask): 14 | self.memory.append(Transition(state, next_state, action, reward, mask)) 15 | 16 | def sample(self, batch_size): 17 | transitions = random.sample(self.memory, batch_size) 18 | batch = Transition(*zip(*transitions)) 19 | return batch 20 | 21 | def __len__(self): 22 | return len(self.memory) 23 | -------------------------------------------------------------------------------- /rainbow/6-Nosiy_net/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | import math 6 | from config import gamma, sigma_zero 7 | 8 | class NoisyLinear(nn.Module): 9 | def __init__(self, in_features, out_features): 10 | super(NoisyLinear, self).__init__() 11 | self.in_features = in_features 12 | self.out_features = out_features 13 | self.sigma_zero = sigma_zero 14 | self.weight_mu = nn.Parameter(torch.empty(out_features, in_features)) 15 | self.weight_sigma = nn.Parameter(torch.empty(out_features, in_features)) 16 | self.register_buffer('weight_epsilon', torch.empty(out_features, in_features)) 17 | self.bias_mu = nn.Parameter(torch.empty(out_features)) 18 | self.bias_sigma = nn.Parameter(torch.empty(out_features)) 19 | self.register_buffer('bias_epsilon', torch.empty(out_features)) 20 | self.reset_parameters() 21 | self.reset_noise() 22 | 23 | def reset_parameters(self): 24 | mu_range = 1 / math.sqrt(self.in_features) 25 | self.weight_mu.data.uniform_(-mu_range, mu_range) 26 | self.weight_sigma.data.fill_(self.sigma_zero / math.sqrt(self.in_features)) 27 | self.bias_mu.data.uniform_(-mu_range, mu_range) 28 | self.bias_sigma.data.fill_(self.sigma_zero / math.sqrt(self.out_features)) 29 | 30 | def _scale_noise(self, size): 31 | x = torch.randn(size) 32 | return x.sign().mul_(x.abs().sqrt_()) 33 | 34 | def reset_noise(self): 35 | epsilon_in = self._scale_noise(self.in_features) 36 | epsilon_out = self._scale_noise(self.out_features) 37 | self.weight_epsilon.copy_(epsilon_out.ger(epsilon_in)) 38 | self.bias_epsilon.copy_(epsilon_out) 39 | 40 | def forward(self, input): 41 | return F.linear(input, self.weight_mu + self.weight_sigma * self.weight_epsilon, self.bias_mu + self.bias_sigma * self.bias_epsilon) 42 | 43 | 44 | class QNet(nn.Module): 45 | def __init__(self, num_inputs, num_outputs): 46 | super(QNet, self).__init__() 47 | self.num_inputs = num_inputs 48 | self.num_outputs = num_outputs 49 | 50 | self.fc1 = nn.Linear(num_inputs, 128) 51 | self.fc2 = NoisyLinear(128, num_outputs) 52 | 53 | for m in self.modules(): 54 | if isinstance(m, nn.Linear): 55 | nn.init.xavier_uniform(m.weight) 56 | 57 | def forward(self, x): 58 | x = F.relu(self.fc1(x)) 59 | qvalue = self.fc2(x) 60 | return qvalue 61 | 62 | @classmethod 63 | def train_model(cls, online_net, target_net, optimizer, batch): 64 | states = torch.stack(batch.state) 65 | next_states = torch.stack(batch.next_state) 66 | actions = torch.Tensor(batch.action).float() 67 | rewards = 
torch.Tensor(batch.reward) 68 | masks = torch.Tensor(batch.mask) 69 | 70 | pred = online_net(states).squeeze(1) 71 | next_pred = target_net(next_states).squeeze(1) 72 | 73 | pred = torch.sum(pred.mul(actions), dim=1) 74 | 75 | target = rewards + masks * gamma * next_pred.max(1)[0] 76 | 77 | loss = F.mse_loss(pred, target.detach()) 78 | optimizer.zero_grad() 79 | loss.backward() 80 | optimizer.step() 81 | online_net.reset_noise() 82 | 83 | return loss 84 | 85 | def get_action(self, input): 86 | qvalue = self.forward(input) 87 | _, action = torch.max(qvalue, 1) 88 | return action.numpy()[0] 89 | 90 | def reset_noise(self): 91 | self.fc2.reset_noise() 92 | -------------------------------------------------------------------------------- /rainbow/6-Nosiy_net/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import QNet 11 | from memory import Memory 12 | from tensorboardX import SummaryWriter 13 | 14 | from config import env_name, initial_exploration, batch_size, update_target, goal_score, log_interval, device, replay_memory_capacity, lr 15 | 16 | 17 | def get_action(state, target_net, epsilon, env): 18 | if np.random.rand() <= epsilon: 19 | return env.action_space.sample() 20 | else: 21 | return target_net.get_action(state) 22 | 23 | def update_target_model(online_net, target_net): 24 | # Target <- Net 25 | target_net.load_state_dict(online_net.state_dict()) 26 | 27 | 28 | def main(): 29 | env = gym.make(env_name) 30 | env.seed(500) 31 | torch.manual_seed(500) 32 | 33 | num_inputs = env.observation_space.shape[0] 34 | num_actions = env.action_space.n 35 | print('state size:', num_inputs) 36 | print('action size:', num_actions) 37 | 38 | online_net = QNet(num_inputs, num_actions) 39 | target_net = QNet(num_inputs, num_actions) 40 | update_target_model(online_net, target_net) 41 | 42 | optimizer = optim.Adam(online_net.parameters(), lr=lr) 43 | writer = SummaryWriter('logs') 44 | 45 | online_net.to(device) 46 | target_net.to(device) 47 | online_net.train() 48 | target_net.train() 49 | memory = Memory(replay_memory_capacity) 50 | running_score = 0 51 | epsilon = 1.0 52 | steps = 0 53 | loss = 0 54 | 55 | for e in range(3000): 56 | done = False 57 | 58 | score = 0 59 | state = env.reset() 60 | state = torch.Tensor(state).to(device) 61 | state = state.unsqueeze(0) 62 | 63 | while not done: 64 | steps += 1 65 | 66 | action = get_action(state, target_net, epsilon, env) 67 | next_state, reward, done, _ = env.step(action) 68 | 69 | next_state = torch.Tensor(next_state) 70 | next_state = next_state.unsqueeze(0) 71 | 72 | mask = 0 if done else 1 73 | reward = reward if not done or score == 499 else -1 74 | action_one_hot = np.zeros(2) 75 | action_one_hot[action] = 1 76 | memory.push(state, next_state, action_one_hot, reward, mask) 77 | 78 | score += reward 79 | state = next_state 80 | 81 | if steps > initial_exploration: 82 | epsilon -= 0.00005 83 | epsilon = max(epsilon, 0.1) 84 | 85 | batch = memory.sample(batch_size) 86 | loss = QNet.train_model(online_net, target_net, optimizer, batch) 87 | 88 | if steps % update_target == 0: 89 | update_target_model(online_net, target_net) 90 | 91 | score = score if score == 500.0 else score + 1 92 | running_score = 0.99 * running_score + 0.01 * score 93 | if e % log_interval == 0: 94 | print('{} episode | score: {:.2f} | epsilon: 
{:.2f}'.format( 95 | e, running_score, epsilon)) 96 | writer.add_scalar('log/score', float(running_score), e) 97 | writer.add_scalar('log/loss', float(loss), e) 98 | 99 | if running_score > goal_score: 100 | break 101 | 102 | 103 | if __name__=="__main__": 104 | main() 105 | -------------------------------------------------------------------------------- /rainbow/7-distributional_c51/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | batch_size = 32 6 | lr = 0.0001 7 | initial_exploration = 1000 8 | goal_score = 200 9 | log_interval = 10 10 | update_target = 100 11 | replay_memory_capacity = 1000 12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | 14 | 15 | num_support = 8 16 | V_max = 5 17 | V_min = -5 18 | -------------------------------------------------------------------------------- /rainbow/7-distributional_c51/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple, deque 3 | 4 | 5 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 6 | 7 | 8 | class Memory(object): 9 | def __init__(self, capacity): 10 | self.memory = deque(maxlen=capacity) 11 | self.capacity = capacity 12 | 13 | def push(self, state, next_state, action, reward, mask): 14 | self.memory.append(Transition(state, next_state, action, reward, mask)) 15 | 16 | def sample(self, batch_size): 17 | transitions = random.sample(self.memory, batch_size) 18 | batch = Transition(*zip(*transitions)) 19 | return batch 20 | 21 | def __len__(self): 22 | return len(self.memory) 23 | -------------------------------------------------------------------------------- /rainbow/7-distributional_c51/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | from config import batch_size, num_support, gamma, V_max, V_min 7 | 8 | 9 | class Distributional_C51(nn.Module): 10 | def __init__(self, num_inputs, num_outputs): 11 | super(Distributional_C51, self).__init__() 12 | self.num_inputs = num_inputs 13 | self.num_outputs = num_outputs 14 | 15 | self.dz = float(V_max - V_min) / (num_support - 1) 16 | self.z = torch.Tensor([V_min + i * self.dz for i in range(num_support)]) 17 | 18 | self.fc1 = nn.Linear(num_inputs, 128) 19 | self.fc2 = nn.Linear(128, num_outputs * num_support) 20 | 21 | for m in self.modules(): 22 | if isinstance(m, nn.Linear): 23 | nn.init.xavier_uniform(m.weight) 24 | 25 | 26 | def forward(self, input): 27 | x = F.relu(self.fc1(input)) 28 | x = self.fc2(x) 29 | z = x.view(-1, self.num_outputs, num_support) 30 | p = nn.Softmax(dim=2)(z) 31 | return p 32 | 33 | 34 | def get_action(self, input): 35 | p = self.forward(input) 36 | p = p.squeeze(0) 37 | z_space = self.z.repeat(self.num_outputs, 1) 38 | Q = torch.sum(p * z_space, dim=1) 39 | action = torch.argmax(Q) 40 | return action.item() 41 | 42 | @classmethod 43 | def get_m(cls, _rewards, _masks, _prob_next_states_action): 44 | rewards = _rewards.numpy() 45 | masks = _masks.numpy() 46 | prob_next_states_action = _prob_next_states_action.detach().numpy() 47 | m_prob = np.zeros([batch_size, num_support], dtype=np.float32) 48 | 49 | dz = float(V_max - V_min) / (num_support - 1) 50 | batch_id = range(batch_size) 51 | for j in range(num_support): 52 | Tz = np.clip(rewards + 
masks * gamma * (V_min + j * dz), V_min, V_max) 53 | bj = (Tz - V_min) / dz 54 | 55 | lj = np.floor(bj).astype(np.int64) 56 | uj = np.ceil(bj).astype(np.int64) 57 | 58 | blj = (bj - lj) 59 | buj = (uj - bj) 60 | 61 | m_prob[batch_id, lj[batch_id]] += ((1 - masks) + masks * (prob_next_states_action[batch_id, j])) * buj[batch_id] 62 | m_prob[batch_id, uj[batch_id]] += ((1 - masks) + masks * (prob_next_states_action[batch_id, j])) * blj[batch_id] 63 | 64 | return m_prob 65 | 66 | 67 | @classmethod 68 | def train_model(cls, online_net, target_net, optimizer, batch): 69 | states = torch.stack(batch.state) 70 | next_states = torch.stack(batch.next_state) 71 | actions = torch.Tensor(batch.action).int() 72 | rewards = torch.Tensor(batch.reward) 73 | masks = torch.Tensor(batch.mask) 74 | 75 | z_space = online_net.z.repeat(batch_size, online_net.num_outputs, 1) 76 | prob_next_states = target_net(next_states) 77 | Q_next_state = torch.sum(prob_next_states * z_space, 2) 78 | next_actions = torch.argmax(Q_next_state, 1) 79 | prob_next_states_action = torch.stack([prob_next_states[i, action, :] for i, action in enumerate(next_actions)]) 80 | 81 | m_prob = cls.get_m(rewards, masks, prob_next_states_action) 82 | m_prob = torch.tensor(m_prob) 83 | 84 | m_prob = (m_prob / torch.sum(m_prob, dim=1, keepdim=True)).detach() 85 | expand_dim_action = torch.unsqueeze(actions, -1) 86 | p = torch.sum(online_net(states) * expand_dim_action.float(), dim=1) 87 | loss = -torch.sum(m_prob * torch.log(p + 1e-20), 1) 88 | loss = loss.mean() 89 | 90 | optimizer.zero_grad() 91 | loss.backward() 92 | optimizer.step() 93 | 94 | return loss 95 | -------------------------------------------------------------------------------- /rainbow/7-distributional_c51/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import argparse 6 | import numpy as np 7 | 8 | import torch 9 | import torch.optim as optim 10 | import torch.nn.functional as F 11 | from tensorboardX import SummaryWriter 12 | 13 | from model import Distributional_C51 14 | from memory import Memory 15 | 16 | from config import env_name, initial_exploration, batch_size, update_target, goal_score, log_interval, device, replay_memory_capacity, lr 17 | 18 | 19 | def get_action(state, target_net, epsilon, env): 20 | if np.random.rand() <= epsilon: 21 | return env.action_space.sample() 22 | else: 23 | return target_net.get_action(state) 24 | 25 | def update_target_model(online_net, target_net): 26 | # Target <- Net 27 | target_net.load_state_dict(online_net.state_dict()) 28 | 29 | 30 | 31 | def main(): 32 | env = gym.make(env_name) 33 | env.seed(500) 34 | torch.manual_seed(500) 35 | 36 | num_inputs = env.observation_space.shape[0] 37 | num_actions = env.action_space.n 38 | print('state size:', num_inputs) 39 | print('action size:', num_actions) 40 | 41 | online_net = Distributional_C51(num_inputs, num_actions) 42 | target_net = Distributional_C51(num_inputs, num_actions) 43 | update_target_model(online_net, target_net) 44 | 45 | optimizer = optim.Adam(online_net.parameters(), lr=lr) 46 | writer = SummaryWriter('logs') 47 | 48 | online_net.to(device) 49 | target_net.to(device) 50 | online_net.train() 51 | target_net.train() 52 | memory = Memory(replay_memory_capacity) 53 | running_score = 0 54 | epsilon = 1.0 55 | steps = 0 56 | loss = 0 57 | 58 | for e in range(3000): 59 | done = False 60 | 61 | score = 0 62 | state = env.reset() 63 | state = torch.Tensor(state) 64 | state = 
state.unsqueeze(0) 65 | 66 | while not done: 67 | steps += 1 68 | action = get_action(state, target_net, epsilon, env) 69 | next_state, reward, done, _ = env.step(action) 70 | 71 | next_state = torch.Tensor(next_state) 72 | next_state = next_state.unsqueeze(0) 73 | 74 | mask = 0 if done else 1 75 | reward = reward if not done or score == 499 else -1 76 | action_one_hot = np.zeros(2) 77 | action_one_hot[action] = 1 78 | memory.push(state, next_state, action_one_hot, reward, mask) 79 | 80 | score += reward 81 | state = next_state 82 | 83 | if steps > initial_exploration: 84 | epsilon -= 0.00005 85 | epsilon = max(epsilon, 0.1) 86 | 87 | batch = memory.sample(batch_size) 88 | loss = Distributional_C51.train_model(online_net, target_net, optimizer, batch) 89 | 90 | if steps % update_target == 0: 91 | update_target_model(online_net, target_net) 92 | 93 | score = score if score == 500.0 else score + 1 94 | running_score = 0.99 * running_score + 0.01 * score 95 | if e % log_interval == 0: 96 | print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format( 97 | e, running_score, epsilon)) 98 | writer.add_scalar('log/score', float(running_score), e) 99 | writer.add_scalar('log/loss', float(loss), e) 100 | 101 | if running_score > goal_score: 102 | break 103 | 104 | if __name__=="__main__": 105 | main() 106 | -------------------------------------------------------------------------------- /rainbow/8-Not_Distributional/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | batch_size = 32 6 | lr = 0.001 7 | initial_exploration = 1000 8 | goal_score = 200 9 | log_interval = 10 10 | update_target = 100 11 | replay_memory_capacity = 1000 12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | 14 | # Multi_Step 15 | n_step = 3 16 | 17 | # PER 18 | small_epsilon = 0.0001 19 | alpha = 0.5 20 | beta_start = 0.1 21 | 22 | # Noisy Net 23 | sigma_zero = 0.5 24 | -------------------------------------------------------------------------------- /rainbow/8-Not_Distributional/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | from collections import namedtuple, deque 4 | import torch 5 | from model import QNet 6 | from config import small_epsilon, gamma, alpha, device, n_step 7 | 8 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 9 | 10 | 11 | class Memory_With_TDError(object): 12 | def __init__(self, capacity): 13 | self.memory = [] 14 | self.memory_probabiliy = [] 15 | self.capacity = capacity 16 | self.position = 0 17 | self.reset_local() 18 | 19 | def reset_local(self): 20 | self.local_step = 0 21 | self.local_state = None 22 | self.local_action = None 23 | self.local_rewards = [] 24 | 25 | def push(self, state, next_state, action, reward, mask): 26 | self.local_step += 1 27 | self.local_rewards.append(reward) 28 | if self.local_step == 1: 29 | self.local_state = state 30 | self.local_action = action 31 | if self.local_step == n_step: 32 | reward = 0 33 | for idx, local_reward in enumerate(self.local_rewards): 34 | reward += (gamma ** idx) * local_reward 35 | self.push_to_memory(self.local_state, next_state, self.local_action, reward, mask) 36 | self.reset_local() 37 | if mask == 0: 38 | self.reset_local() 39 | 40 | 41 | def push_to_memory(self, state, next_state, action, reward, mask): 42 | if len(self.memory) > 0: 43 | max_probability = 
max(self.memory_probabiliy) 44 | else: 45 | max_probability = small_epsilon 46 | 47 | if len(self.memory) < self.capacity: 48 | self.memory.append(Transition(state, next_state, action, reward, mask)) 49 | self.memory_probabiliy.append(max_probability) 50 | else: 51 | self.memory[self.position] = Transition(state, next_state, action, reward, mask) 52 | self.memory_probabiliy[self.position] = max_probability 53 | 54 | self.position = (self.position + 1) % self.capacity 55 | 56 | def sample(self, batch_size, net, target_net, beta): 57 | probability_sum = sum(self.memory_probabiliy) 58 | p = [probability / probability_sum for probability in self.memory_probabiliy] 59 | 60 | indexes = np.random.choice(np.arange(len(self.memory)), batch_size, p=p) 61 | transitions = [self.memory[idx] for idx in indexes] 62 | transitions_p = [p[idx] for idx in indexes] 63 | batch = Transition(*zip(*transitions)) 64 | 65 | weights = [pow(self.capacity * p_j, -beta) for p_j in transitions_p] 66 | weights = torch.Tensor(weights).to(device) 67 | weights = weights / weights.max() 68 | 69 | 70 | td_error = QNet.get_td_error(net, target_net, batch.state, batch.next_state, batch.action, batch.reward, batch.mask) 71 | td_error = td_error.detach() 72 | 73 | td_error_idx = 0 74 | for idx in indexes: 75 | self.memory_probabiliy[idx] = pow(abs(td_error[td_error_idx]) + small_epsilon, alpha).item() 76 | # print(pow(abs(td_error[td_error_idx]) + small_epsilon, alpha).item()) 77 | td_error_idx += 1 78 | 79 | 80 | return batch, weights 81 | 82 | def __len__(self): 83 | return len(self.memory) 84 | -------------------------------------------------------------------------------- /rainbow/8-Not_Distributional/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import math 5 | 6 | from config import gamma, sigma_zero, n_step 7 | 8 | class NoisyLinear(nn.Module): 9 | def __init__(self, in_features, out_features): 10 | super(NoisyLinear, self).__init__() 11 | self.in_features = in_features 12 | self.out_features = out_features 13 | self.sigma_zero = sigma_zero 14 | self.weight_mu = nn.Parameter(torch.empty(out_features, in_features)) 15 | self.weight_sigma = nn.Parameter(torch.empty(out_features, in_features)) 16 | self.register_buffer('weight_epsilon', torch.empty(out_features, in_features)) 17 | self.bias_mu = nn.Parameter(torch.empty(out_features)) 18 | self.bias_sigma = nn.Parameter(torch.empty(out_features)) 19 | self.register_buffer('bias_epsilon', torch.empty(out_features)) 20 | self.reset_parameters() 21 | self.reset_noise() 22 | 23 | def reset_parameters(self): 24 | mu_range = 1 / math.sqrt(self.in_features) 25 | self.weight_mu.data.uniform_(-mu_range, mu_range) 26 | self.weight_sigma.data.fill_(self.sigma_zero / math.sqrt(self.in_features)) 27 | self.bias_mu.data.uniform_(-mu_range, mu_range) 28 | self.bias_sigma.data.fill_(self.sigma_zero / math.sqrt(self.out_features)) 29 | 30 | def _scale_noise(self, size): 31 | x = torch.randn(size) 32 | return x.sign().mul_(x.abs().sqrt_()) 33 | 34 | def reset_noise(self): 35 | epsilon_in = self._scale_noise(self.in_features) 36 | epsilon_out = self._scale_noise(self.out_features) 37 | self.weight_epsilon.copy_(epsilon_out.ger(epsilon_in)) 38 | self.bias_epsilon.copy_(epsilon_out) 39 | 40 | def forward(self, input): 41 | return F.linear(input, self.weight_mu + self.weight_sigma * self.weight_epsilon, self.bias_mu + self.bias_sigma * self.bias_epsilon) 42 | 43 | 44 | 
class QNet(nn.Module): 45 | def __init__(self, num_inputs, num_outputs): 46 | super(QNet, self).__init__() 47 | self.num_inputs = num_inputs 48 | self.num_outputs = num_outputs 49 | 50 | self.fc = nn.Linear(num_inputs, 128) 51 | self.fc_adv = NoisyLinear(128, num_outputs) 52 | self.fc_val = nn.Linear(128, 1) 53 | 54 | 55 | for m in self.modules(): 56 | if isinstance(m, nn.Linear): 57 | nn.init.xavier_uniform(m.weight) 58 | 59 | def forward(self, x): 60 | x = F.relu(self.fc(x)) 61 | adv = self.fc_adv(x) 62 | adv = adv.view(-1, self.num_outputs) 63 | val = self.fc_val(x) 64 | val = val.view(-1, 1) 65 | 66 | 67 | qvalue = val + (adv - adv.mean(dim=1, keepdim=True)) 68 | # (batch, action) = (batch) + ((batch, action) - (batch)) 69 | return qvalue 70 | 71 | @classmethod 72 | def get_td_error(cls, online_net, target_net, states, next_states, actions, rewards, masks): 73 | states = torch.stack(states) 74 | next_states = torch.stack(next_states) 75 | actions = torch.Tensor(actions) 76 | rewards = torch.Tensor(rewards) 77 | masks = torch.Tensor(masks) 78 | 79 | pred = online_net(states) 80 | qvalue = online_net(next_states) 81 | 82 | _, action_from_online_net = online_net(next_states).max(1) 83 | 84 | target_net.reset_noise() 85 | next_pred = target_net(next_states).squeeze(1) 86 | 87 | pred = torch.sum(pred.mul(actions), dim=1) 88 | 89 | target = rewards + masks * (gamma ** n_step) * next_pred.gather(1, action_from_online_net.unsqueeze(1)).squeeze(1) 90 | 91 | td_error = pred - target.detach() 92 | 93 | return td_error 94 | 95 | @classmethod 96 | def train_model(cls, online_net, target_net, optimizer, batch, weights): 97 | td_error = cls.get_td_error(online_net, target_net, batch.state, batch.next_state, batch.action, batch.reward, batch.mask) 98 | 99 | loss = pow(td_error, 2) * weights 100 | loss = loss.mean() 101 | 102 | optimizer.zero_grad() 103 | loss.backward() 104 | optimizer.step() 105 | 106 | return loss 107 | 108 | def get_action(self, input): 109 | self.reset_noise() 110 | qvalue = self.forward(input) 111 | _, action = torch.max(qvalue, 1) 112 | return action.numpy()[0] 113 | 114 | def reset_noise(self): 115 | self.fc_adv.reset_noise() 116 | -------------------------------------------------------------------------------- /rainbow/8-Not_Distributional/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import QNet 11 | from memory import Memory_With_TDError 12 | from tensorboardX import SummaryWriter 13 | 14 | from config import env_name, initial_exploration, batch_size, update_target, goal_score, log_interval, device, replay_memory_capacity, lr, beta_start 15 | 16 | def update_target_model(online_net, target_net): 17 | # Target <- Net 18 | target_net.load_state_dict(online_net.state_dict()) 19 | 20 | 21 | def main(): 22 | env = gym.make(env_name) 23 | env.seed(500) 24 | torch.manual_seed(500) 25 | 26 | num_inputs = env.observation_space.shape[0] 27 | num_actions = env.action_space.n 28 | print('state size:', num_inputs) 29 | print('action size:', num_actions) 30 | 31 | online_net = QNet(num_inputs, num_actions) 32 | target_net = QNet(num_inputs, num_actions) 33 | update_target_model(online_net, target_net) 34 | 35 | optimizer = optim.Adam(online_net.parameters(), lr=lr) 36 | writer = SummaryWriter('logs') 37 | 38 | online_net.to(device) 39 | target_net.to(device) 
40 | online_net.train() 41 | target_net.train() 42 | memory = Memory_With_TDError(replay_memory_capacity) 43 | running_score = 0 44 | steps = 0 45 | beta = beta_start 46 | loss = 0 47 | 48 | for e in range(3000): 49 | done = False 50 | 51 | score = 0 52 | state = env.reset() 53 | state = torch.Tensor(state).to(device) 54 | state = state.unsqueeze(0) 55 | 56 | while not done: 57 | steps += 1 58 | 59 | action = target_net.get_action(state) 60 | next_state, reward, done, _ = env.step(action) 61 | 62 | next_state = torch.Tensor(next_state) 63 | next_state = next_state.unsqueeze(0) 64 | 65 | mask = 0 if done else 1 66 | reward = reward if not done or score == 499 else -1 67 | action_one_hot = np.zeros(2) 68 | action_one_hot[action] = 1 69 | memory.push(state, next_state, action_one_hot, reward, mask) 70 | 71 | score += reward 72 | state = next_state 73 | 74 | if steps > initial_exploration: 75 | beta += 0.00005 76 | beta = min(1, beta) 77 | 78 | batch, weights = memory.sample(batch_size, online_net, target_net, beta) 79 | loss = QNet.train_model(online_net, target_net, optimizer, batch, weights) 80 | 81 | if steps % update_target == 0: 82 | update_target_model(online_net, target_net) 83 | 84 | score = score if score == 500.0 else score + 1 85 | running_score = 0.99 * running_score + 0.01 * score 86 | if e % log_interval == 0: 87 | print('{} episode | score: {:.2f} | beta: {:.2f}'.format( 88 | e, running_score, beta)) 89 | writer.add_scalar('log/score', float(running_score), e) 90 | writer.add_scalar('log/loss', float(loss), e) 91 | 92 | if running_score > goal_score: 93 | break 94 | 95 | 96 | if __name__=="__main__": 97 | main() 98 | -------------------------------------------------------------------------------- /rainbow/9-Rainbow/README-KR.md: -------------------------------------------------------------------------------- 1 | # Rainbow 2 | 3 | Last Edited: Nov 19, 2018 8:32 PM 4 | 5 | ## Duel + Distributional 6 | 7 | So far, Q has been defined as 8 | 9 | $$Q(s,a) = R(s,a) + \gamma \max_{a'}Q(s',a')$$ 10 | 11 | In Dueling, we used the equation below. 12 | 13 | $$Q(s,a) = V(s) + A(s,a)$$ 14 | 15 | And in the Distributional version, 16 | 17 | $$Z(s,a) = R(s,a) + \gamma Z(s',a')$$ 18 | 19 | where Z is a probability distribution. If we apply the two together, 20 | 21 | $$Z(s,a) = V(s) + A(s,a)$$ 22 | 23 | then V and A can themselves be treated as probability distributions. V has shape `(num_support)` and A has shape `(action_space, num_support)`. 24 | 25 | ```python 26 | def forward(self, x): 27 | x = F.relu(self.fc(x)) 28 | adv = self.fc_adv(x) 29 | val = self.fc_val(x) 30 | 31 | val = val.view(-1, 1, num_support) 32 | adv = adv.view(-1, self.num_outputs, num_support) 33 | z = val + (adv - adv.mean(1, keepdim=True)) 34 | z = z.view(-1, self.num_outputs, num_support) 35 | p = nn.Softmax(dim=2)(z) 36 | return p 37 | ``` 38 | 39 | ## Double + Distributional 40 | 41 | The next action is selected with online_net (and its distribution is taken from target_net). 42 | 43 | ```python 44 | z_space = online_net.z.repeat(batch_size, online_net.num_outputs, 1) 45 | prob_next_states_online = online_net(next_states) 46 | prob_next_states_target = target_net(next_states) 47 | Q_next_state = torch.sum(prob_next_states_online * z_space, 2) 48 | next_actions = torch.argmax(Q_next_state, 1) 49 | prob_next_states_action = torch.stack([prob_next_states_target[i, action, :] for i, action in enumerate(next_actions)]) 50 | 51 | ``` 52 | 53 | ## PER + Distributional 54 | 55 | In the earlier PER, the priority of an experience was proportional to the absolute value of its td_error. 56 | 57 | Here, the distributional loss is used as the priority of the experience instead. 
58 | 59 | ```python 60 | td_error = QNet.get_loss(net, target_net, batch.state, batch.next_state, batch.action, batch.reward, batch.mask) 61 | ``` 62 | 63 | 64 | 65 | ```python 66 | 67 | @classmethod 68 | def get_loss(cls, oneline_net, target_net, states, next_states, actions, rewards, masks): 69 | states = torch.stack(states) 70 | next_states = torch.stack(next_states) 71 | actions = torch.Tensor(actions).int() 72 | rewards = torch.Tensor(rewards) 73 | masks = torch.Tensor(masks) 74 | 75 | z_space = oneline_net.z.repeat(batch_size, oneline_net.num_outputs, 1) 76 | prob_next_states = oneline_net(next_states) 77 | Q_next_state = torch.sum(prob_next_states * z_space, 2) 78 | next_actions = torch.argmax(Q_next_state, 1) 79 | prob_next_states_action = torch.stack([prob_next_states[i, action, :] for i, action in enumerate(next_actions)]) 80 | 81 | m_prob = cls.get_m(rewards, masks, prob_next_states_action) 82 | m_prob = torch.tensor(m_prob) 83 | 84 | m_prob = m_prob / torch.sum(m_prob, dim=1, keepdim=True) 85 | expand_dim_action = torch.unsqueeze(actions, -1) 86 | p = torch.sum(oneline_net(states) * expand_dim_action.float(), dim=1) 87 | loss = -torch.sum(m_prob * torch.log(p + 1e-20), 1) 88 | 89 | return loss 90 | ``` 91 | 92 | Everything else is the same as the non-distributional version. 93 | 94 | -------------------------------------------------------------------------------- /rainbow/9-Rainbow/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | batch_size = 32 6 | lr = 0.0001 7 | initial_exploration = 1000 8 | goal_score = 200 9 | log_interval = 10 10 | update_target = 100 11 | replay_memory_capacity = 1000 12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | 14 | # Multi_Step 15 | n_step = 1 16 | 17 | # PER 18 | small_epsilon = 0.0001 19 | alpha = 1 20 | beta_start = 0.1 21 | 22 | # Noisy Net 23 | sigma_zero = 0.5 24 | 25 | # Distributional 26 | num_support = 8 27 | V_max = 5 28 | V_min = -5 29 | -------------------------------------------------------------------------------- /rainbow/9-Rainbow/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import torch 4 | from collections import namedtuple, deque 5 | from model import QNet 6 | from config import small_epsilon, gamma, alpha, device, n_step 7 | 8 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 9 | 10 | 11 | class Memory(object): 12 | def __init__(self, capacity): 13 | self.memory = [] 14 | self.memory_probabiliy = [] 15 | self.capacity = capacity 16 | self.position = 0 17 | self.reset_local() 18 | 19 | def reset_local(self): 20 | self.local_step = 0 21 | self.local_state = None 22 | self.local_action = None 23 | self.local_rewards = [] 24 | 25 | def push(self, state, next_state, action, reward, mask): 26 | self.local_step += 1 27 | self.local_rewards.append(reward) 28 | if self.local_step == 1: 29 | self.local_state = state 30 | self.local_action = action 31 | if self.local_step == n_step: 32 | reward = 0 33 | for idx, local_reward in enumerate(self.local_rewards): 34 | reward += (gamma ** idx) * local_reward 35 | self.push_to_memory(self.local_state, next_state, self.local_action, reward, mask) 36 | self.reset_local() 37 | if mask == 0: 38 | self.reset_local() 39 | 40 | 41 | def push_to_memory(self, state, next_state, action, reward, mask): 42 | if len(self.memory) > 0: 43 | max_probability = 
max(self.memory_probabiliy) 44 | else: 45 | max_probability = small_epsilon 46 | 47 | if len(self.memory) < self.capacity: 48 | self.memory.append(Transition(state, next_state, action, reward, mask)) 49 | self.memory_probabiliy.append(max_probability) 50 | else: 51 | self.memory[self.position] = Transition(state, next_state, action, reward, mask) 52 | self.memory_probabiliy[self.position] = max_probability 53 | 54 | self.position = (self.position + 1) % self.capacity 55 | 56 | def sample(self, batch_size, net, target_net, beta): 57 | probability_sum = sum(self.memory_probabiliy) 58 | p = [probability / probability_sum for probability in self.memory_probabiliy] 59 | 60 | indexes = np.random.choice(np.arange(len(self.memory)), batch_size, p=p) 61 | transitions = [self.memory[idx] for idx in indexes] 62 | transitions_p = [p[idx] for idx in indexes] 63 | batch = Transition(*zip(*transitions)) 64 | 65 | weights = [pow(self.capacity * p_j, -beta) for p_j in transitions_p] 66 | weights = torch.Tensor(weights).to(device) 67 | weights = weights / weights.max() 68 | 69 | td_error = QNet.get_loss(net, target_net, batch.state, batch.next_state, batch.action, batch.reward, batch.mask) 70 | td_error = td_error.detach() 71 | 72 | td_error_idx = 0 73 | for idx in indexes: 74 | self.memory_probabiliy[idx] = pow(abs(td_error[td_error_idx]) + small_epsilon, alpha).item() 75 | td_error_idx += 1 76 | 77 | 78 | return batch, weights 79 | 80 | def __len__(self): 81 | return len(self.memory) 82 | -------------------------------------------------------------------------------- /rainbow/9-Rainbow/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import argparse 6 | import numpy as np 7 | 8 | import torch 9 | import torch.optim as optim 10 | import torch.nn.functional as F 11 | from tensorboardX import SummaryWriter 12 | 13 | from model import QNet 14 | from memory import Memory 15 | 16 | from config import env_name, initial_exploration, batch_size, update_target, goal_score, log_interval, device, replay_memory_capacity, lr, beta_start 17 | 18 | 19 | def get_action(state, target_net, epsilon, env): 20 | if np.random.rand() <= epsilon: 21 | return env.action_space.sample() 22 | else: 23 | return target_net.get_action(state) 24 | 25 | def update_target_model(online_net, target_net): 26 | # Target <- Net 27 | target_net.load_state_dict(online_net.state_dict()) 28 | 29 | 30 | 31 | def main(): 32 | env = gym.make(env_name) 33 | env.seed(500) 34 | torch.manual_seed(500) 35 | 36 | num_inputs = env.observation_space.shape[0] 37 | num_actions = env.action_space.n 38 | print('state size:', num_inputs) 39 | print('action size:', num_actions) 40 | 41 | online_net = QNet(num_inputs, num_actions) 42 | target_net = QNet(num_inputs, num_actions) 43 | update_target_model(online_net, target_net) 44 | 45 | optimizer = optim.Adam(online_net.parameters(), lr=lr) 46 | writer = SummaryWriter('logs') 47 | 48 | online_net.to(device) 49 | target_net.to(device) 50 | online_net.train() 51 | target_net.train() 52 | memory = Memory(replay_memory_capacity) 53 | running_score = 0 54 | epsilon = 1.0 55 | steps = 0 56 | beta = beta_start 57 | loss = 0 58 | 59 | for e in range(3000): 60 | done = False 61 | 62 | score = 0 63 | state = env.reset() 64 | state = torch.Tensor(state) 65 | state = state.unsqueeze(0) 66 | 67 | while not done: 68 | steps += 1 69 | action = get_action(state, target_net, epsilon, env) 70 | next_state, reward, done, _ = 
env.step(action) 71 | 72 | next_state = torch.Tensor(next_state) 73 | next_state = next_state.unsqueeze(0) 74 | 75 | mask = 0 if done else 1 76 | reward = reward if not done or score == 499 else -1 77 | action_one_hot = np.zeros(2) 78 | action_one_hot[action] = 1 79 | memory.push(state, next_state, action_one_hot, reward, mask) 80 | 81 | score += reward 82 | state = next_state 83 | 84 | if steps > initial_exploration: 85 | epsilon -= 0.00005 86 | epsilon = max(epsilon, 0.1) 87 | beta += 0.00005 88 | beta = min(1, beta) 89 | 90 | batch, weights = memory.sample(batch_size, online_net, target_net, beta) 91 | loss = QNet.train_model(online_net, target_net, optimizer, batch, weights) 92 | 93 | if steps % update_target == 0: 94 | update_target_model(online_net, target_net) 95 | 96 | score = score if score == 500.0 else score + 1 97 | running_score = 0.99 * running_score + 0.01 * score 98 | if e % log_interval == 0: 99 | print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format( 100 | e, running_score, epsilon)) 101 | writer.add_scalar('log/score', float(running_score), e) 102 | writer.add_scalar('log/loss', float(loss), e) 103 | 104 | if running_score > goal_score: 105 | break 106 | 107 | if __name__=="__main__": 108 | main() 109 | --------------------------------------------------------------------------------