├── .gitignore ├── .pylintrc ├── LICENSE ├── PG ├── 1-REINFORCE │ ├── config.py │ ├── memory.py │ ├── model.py │ └── train.py ├── 2-Actor-Critic │ ├── config.py │ ├── model.py │ └── train.py ├── 3-Advantage-Actor-Critic │ ├── config.py │ ├── model.py │ └── train.py ├── 4-GAE │ ├── config.py │ ├── memory.py │ ├── model.py │ └── train.py ├── 5-TNPG │ ├── config.py │ ├── memory.py │ ├── model.py │ └── train.py ├── 6-TRPO │ ├── config.py │ ├── memory.py │ ├── model.py │ └── train.py └── 7-PPO │ ├── config.py │ ├── memory.py │ ├── model.py │ └── train.py ├── POMDP ├── 0-DQN │ ├── config.py │ ├── memory.py │ ├── model.py │ └── train.py ├── 1-DRQN │ ├── config.py │ ├── memory.py │ ├── model.py │ └── train.py ├── 2-DRQN-Stack │ ├── config.py │ ├── memory.py │ ├── model.py │ └── train.py ├── 3-DRQN-Store-State │ ├── config.py │ ├── memory.py │ ├── model.py │ └── train.py └── 4-R2D2-Single │ ├── config.py │ ├── memory.py │ ├── model.py │ └── train.py ├── README.md ├── distributional ├── 1-QR-DQN │ ├── config.py │ ├── memory.py │ ├── model.py │ └── train.py └── 2-IQN │ ├── config.py │ ├── memory.py │ ├── model.py │ └── train.py ├── parallel ├── 1-Async-Q-Learning │ ├── config.py │ ├── memory.py │ ├── model.py │ ├── shared_adam.py │ ├── train.py │ └── worker.py ├── 2-A3C │ ├── config.py │ ├── memory.py │ ├── model.py │ ├── shared_adam.py │ ├── train.py │ └── worker.py ├── 3-ACER │ ├── config.py │ ├── memory.py │ ├── model.py │ ├── shared_adam.py │ ├── train.py │ └── worker.py └── 5-ApeX │ ├── config.py │ ├── memory.py │ ├── model.py │ ├── train.py │ └── worker.py └── rainbow ├── 1-dqn ├── config.py ├── memory.py ├── model.py └── train.py ├── 2-DoubleDQN ├── README-KR.md ├── config.py ├── memory.py ├── model.py └── train.py ├── 3-DuelDQN ├── README-KR.md ├── Screenshot2018-11-1519-f88e4bf3-f581-4b24-a07e-af467a4bba64.14.23.png ├── config.py ├── memory.py ├── model.py └── train.py ├── 4-multistep ├── config.py ├── memory.py ├── model.py └── train.py ├── 5-per ├── README-KR.md ├── Screenshot2018-11-1514-a431e580-fd9d-4a07-afd1-5f80e0042c23.45.16.png ├── config.py ├── memory.py ├── model.py └── train.py ├── 6-Nosiy_net ├── README-KR.md ├── Screenshot2018-11-1616-fd936286-4e40-4962-99ff-1ddd3b7deeb8.36.21.png ├── config.py ├── memory.py ├── model.py └── train.py ├── 7-distributional_c51 ├── config.py ├── memory.py ├── model.py └── train.py ├── 8-Not_Distributional ├── README-KR.md ├── config.py ├── memory.py ├── model.py └── train.py └── 9-Rainbow ├── README-KR.md ├── config.py ├── memory.py ├── model.py └── train.py /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/macos,python 3 | # Edit at https://www.gitignore.io/?templates=macos,python 4 | 5 | ### macOS ### 6 | # General 7 | .DS_Store 8 | .AppleDouble 9 | .LSOverride 10 | 11 | # Icon must end with two \r 12 | Icon 13 | 14 | # Thumbnails 15 | ._* 16 | 17 | # Files that might appear in the root of a volume 18 | .DocumentRevisions-V100 19 | .fseventsd 20 | .Spotlight-V100 21 | .TemporaryItems 22 | .Trashes 23 | .VolumeIcon.icns 24 | .com.apple.timemachine.donotpresent 25 | 26 | # Directories potentially created on remote AFP share 27 | .AppleDB 28 | .AppleDesktop 29 | Network Trash Folder 30 | Temporary Items 31 | .apdisk 32 | 33 | ### Python ### 34 | # Byte-compiled / optimized / DLL files 35 | __pycache__/ 36 | *.py[cod] 37 | *$py.class 38 | logs/ 39 | 40 | # C extensions 41 | *.so 42 | 43 | # Distribution / packaging 44 | .Python 45 | build/ 46 | 
develop-eggs/ 47 | dist/ 48 | downloads/ 49 | eggs/ 50 | .eggs/ 51 | lib/ 52 | lib64/ 53 | parts/ 54 | sdist/ 55 | var/ 56 | wheels/ 57 | *.egg-info/ 58 | .installed.cfg 59 | *.egg 60 | MANIFEST 61 | 62 | # PyInstaller 63 | # Usually these files are written by a python script from a template 64 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 65 | *.manifest 66 | *.spec 67 | 68 | # Installer logs 69 | pip-log.txt 70 | pip-delete-this-directory.txt 71 | 72 | # Unit test / coverage reports 73 | htmlcov/ 74 | .tox/ 75 | .nox/ 76 | .coverage 77 | .coverage.* 78 | .cache 79 | nosetests.xml 80 | coverage.xml 81 | *.cover 82 | .hypothesis/ 83 | .pytest_cache/ 84 | 85 | # Translations 86 | *.mo 87 | *.pot 88 | 89 | # Django stuff: 90 | *.log 91 | local_settings.py 92 | db.sqlite3 93 | 94 | # Flask stuff: 95 | instance/ 96 | .webassets-cache 97 | 98 | # Scrapy stuff: 99 | .scrapy 100 | 101 | # Sphinx documentation 102 | docs/_build/ 103 | 104 | # PyBuilder 105 | target/ 106 | 107 | # Jupyter Notebook 108 | .ipynb_checkpoints 109 | 110 | # IPython 111 | profile_default/ 112 | ipython_config.py 113 | 114 | # pyenv 115 | .python-version 116 | 117 | # celery beat schedule file 118 | celerybeat-schedule 119 | 120 | # SageMath parsed files 121 | *.sage.py 122 | 123 | # Environments 124 | .env 125 | .venv 126 | env/ 127 | venv/ 128 | ENV/ 129 | env.bak/ 130 | venv.bak/ 131 | 132 | # Spyder project settings 133 | .spyderproject 134 | .spyproject 135 | 136 | # Rope project settings 137 | .ropeproject 138 | 139 | # mkdocs documentation 140 | /site 141 | 142 | # mypy 143 | .mypy_cache/ 144 | .dmypy.json 145 | dmypy.json 146 | 147 | # Pyre type checker 148 | .pyre/ 149 | 150 | ### Python Patch ### 151 | .venv/ 152 | 153 | ### Python.VirtualEnv Stack ### 154 | # Virtualenv 155 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ 156 | [Bb]in 157 | [Ii]nclude 158 | [Ll]ib 159 | [Ll]ib64 160 | [Ll]ocal 161 | [Ss]cripts 162 | pyvenv.cfg 163 | pip-selfcheck.json 164 | 165 | # End of https://www.gitignore.io/api/macos,python 166 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Cheol Kang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /PG/1-REINFORCE/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | lr = 0.001 6 | goal_score = 200 7 | log_interval = 10 8 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 9 | -------------------------------------------------------------------------------- /PG/1-REINFORCE/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple, deque 3 | 4 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 5 | 6 | class Memory(object): 7 | def __init__(self): 8 | self.memory = deque() 9 | 10 | def push(self, state, next_state, action, reward, mask): 11 | self.memory.append(Transition(state, next_state, action, reward, mask)) 12 | 13 | def sample(self): 14 | memory = self.memory 15 | return Transition(*zip(*memory)) 16 | 17 | def __len__(self): 18 | return len(self.memory) 19 | -------------------------------------------------------------------------------- /PG/1-REINFORCE/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | from config import gamma 7 | class QNet(nn.Module): 8 | def __init__(self, num_inputs, num_outputs): 9 | super(QNet, self).__init__() 10 | self.num_inputs = num_inputs 11 | self.num_outputs = num_outputs 12 | 13 | self.fc_1 = nn.Linear(num_inputs, 128) 14 | self.fc_2 = nn.Linear(128, num_outputs) 15 | 16 | for m in self.modules(): 17 | if isinstance(m, nn.Linear): 18 | nn.init.xavier_uniform(m.weight) 19 | 20 | def forward(self, input): 21 | x = F.relu(self.fc_1(input)) 22 | policy = F.softmax(self.fc_2(x)) 23 | return policy 24 | 25 | @classmethod 26 | def train_model(cls, net, transitions, optimizer): 27 | states, actions, rewards, masks = transitions.state, transitions.action, transitions.reward, transitions.mask 28 | 29 | states = torch.stack(states) 30 | actions = torch.stack(actions) 31 | rewards = torch.Tensor(rewards) 32 | masks = torch.Tensor(masks) 33 | 34 | returns = torch.zeros_like(rewards) 35 | 36 | running_return = 0 37 | for t in reversed(range(len(rewards))): 38 | running_return = rewards[t] + gamma * running_return * masks[t] 39 | returns[t] = running_return 40 | 41 | policies = net(states) 42 | policies = policies.view(-1, net.num_outputs) 43 | 44 | log_policies = (torch.log(policies) * actions.detach()).sum(dim=1) 45 | 46 | loss = (-log_policies * returns).sum() 47 | 48 | optimizer.zero_grad() 49 | loss.backward() 50 | optimizer.step() 51 | 52 | return loss 53 | 54 | def get_action(self, input): 55 | policy = self.forward(input) 56 | policy = policy[0].data.numpy() 57 | 58 | action = np.random.choice(self.num_outputs, 1, p=policy)[0] 59 | return action 60 | -------------------------------------------------------------------------------- /PG/1-REINFORCE/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import QNet 11 | from tensorboardX import SummaryWriter 12 | 13 | from memory import Memory 14 | from config import env_name, goal_score, 
log_interval, device, lr, gamma 15 | 16 | 17 | def main(): 18 | env = gym.make(env_name) 19 | env.seed(500) 20 | torch.manual_seed(500) 21 | 22 | num_inputs = env.observation_space.shape[0] 23 | num_actions = env.action_space.n 24 | print('state size:', num_inputs) 25 | print('action size:', num_actions) 26 | 27 | net = QNet(num_inputs, num_actions) 28 | 29 | optimizer = optim.Adam(net.parameters(), lr=lr) 30 | writer = SummaryWriter('logs') 31 | 32 | net.to(device) 33 | net.train() 34 | running_score = 0 35 | steps = 0 36 | loss = 0 37 | 38 | for e in range(3000): 39 | done = False 40 | memory = Memory() 41 | 42 | score = 0 43 | state = env.reset() 44 | state = torch.Tensor(state).to(device) 45 | state = state.unsqueeze(0) 46 | 47 | while not done: 48 | steps += 1 49 | 50 | action = net.get_action(state) 51 | next_state, reward, done, _ = env.step(action) 52 | 53 | next_state = torch.Tensor(next_state) 54 | next_state = next_state.unsqueeze(0) 55 | 56 | mask = 0 if done else 1 57 | reward = reward if not done or score == 499 else -1 58 | 59 | action_one_hot = torch.zeros(2) 60 | action_one_hot[action] = 1 61 | memory.push(state, next_state, action_one_hot, reward, mask) 62 | 63 | score += reward 64 | state = next_state 65 | 66 | loss = QNet.train_model(net, memory.sample(), optimizer) 67 | 68 | 69 | score = score if score == 500.0 else score + 1 70 | running_score = 0.99 * running_score + 0.01 * score 71 | if e % log_interval == 0: 72 | print('{} episode | score: {:.2f}'.format( 73 | e, running_score)) 74 | writer.add_scalar('log/score', float(running_score), e) 75 | writer.add_scalar('log/loss', float(loss), e) 76 | 77 | if running_score > goal_score: 78 | break 79 | 80 | 81 | if __name__=="__main__": 82 | main() 83 | -------------------------------------------------------------------------------- /PG/2-Actor-Critic/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | lr = 0.0001 6 | goal_score = 200 7 | log_interval = 10 8 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 9 | -------------------------------------------------------------------------------- /PG/2-Actor-Critic/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | from config import gamma 7 | class QNet(nn.Module): 8 | def __init__(self, num_inputs, num_outputs): 9 | super(QNet, self).__init__() 10 | self.num_inputs = num_inputs 11 | self.num_outputs = num_outputs 12 | 13 | self.fc = nn.Linear(num_inputs, 128) 14 | self.fc_actor = nn.Linear(128, num_outputs) 15 | self.fc_critic = nn.Linear(128, num_outputs) 16 | 17 | for m in self.modules(): 18 | if isinstance(m, nn.Linear): 19 | nn.init.xavier_uniform(m.weight) 20 | 21 | def forward(self, input): 22 | x = F.relu(self.fc(input)) 23 | policy = F.softmax(self.fc_actor(x)) 24 | q_value = self.fc_critic(x) 25 | return policy, q_value 26 | 27 | @classmethod 28 | def train_model(cls, net, optimizer, transition): 29 | state, next_state, action, reward, mask = transition 30 | 31 | policy, q_value = net(state) 32 | policy, q_value = policy.view(-1, net.num_outputs), q_value.view(-1, net.num_outputs) 33 | _, next_q_value = net(next_state) 34 | next_q_value = next_q_value.view(-1, net.num_outputs) 35 | next_action = net.get_action(next_state) 36 | 37 | 38 | target = reward + mask * gamma * 
next_q_value[0][next_action] 39 | 40 | log_policy = torch.log(policy[0])[action] 41 | loss_policy = - log_policy * q_value[0][action].item() 42 | loss_value = F.mse_loss(q_value[0][action], target.detach()) 43 | 44 | loss = loss_policy + loss_value 45 | optimizer.zero_grad() 46 | loss.backward() 47 | optimizer.step() 48 | 49 | return loss 50 | 51 | def get_action(self, input): 52 | policy, _ = self.forward(input) 53 | policy = policy[0].data.numpy() 54 | 55 | action = np.random.choice(self.num_outputs, 1, p=policy)[0] 56 | return action 57 | -------------------------------------------------------------------------------- /PG/2-Actor-Critic/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import QNet 11 | from tensorboardX import SummaryWriter 12 | 13 | from config import env_name, goal_score, log_interval, device, lr 14 | 15 | 16 | def main(): 17 | env = gym.make(env_name) 18 | env.seed(500) 19 | torch.manual_seed(500) 20 | 21 | num_inputs = env.observation_space.shape[0] 22 | num_actions = env.action_space.n 23 | print('state size:', num_inputs) 24 | print('action size:', num_actions) 25 | 26 | net = QNet(num_inputs, num_actions) 27 | 28 | optimizer = optim.Adam(net.parameters(), lr=lr) 29 | writer = SummaryWriter('logs') 30 | 31 | net.to(device) 32 | net.train() 33 | running_score = 0 34 | steps = 0 35 | loss = 0 36 | 37 | for e in range(3000): 38 | done = False 39 | 40 | score = 0 41 | state = env.reset() 42 | state = torch.Tensor(state).to(device) 43 | state = state.unsqueeze(0) 44 | 45 | while not done: 46 | steps += 1 47 | 48 | action = net.get_action(state) 49 | next_state, reward, done, _ = env.step(action) 50 | 51 | next_state = torch.Tensor(next_state) 52 | next_state = next_state.unsqueeze(0) 53 | 54 | mask = 0 if done else 1 55 | reward = reward if not done or score == 499 else -1 56 | transition = [state, next_state, action, reward, mask] 57 | 58 | score += reward 59 | state = next_state 60 | 61 | loss = QNet.train_model(net, optimizer, transition) 62 | 63 | score = score if score == 500.0 else score + 1 64 | running_score = 0.99 * running_score + 0.01 * score 65 | if e % log_interval == 0: 66 | print('{} episode | score: {:.2f}'.format( 67 | e, running_score)) 68 | writer.add_scalar('log/score', float(running_score), e) 69 | writer.add_scalar('log/loss', float(loss), e) 70 | 71 | if running_score > goal_score: 72 | break 73 | 74 | 75 | if __name__=="__main__": 76 | main() 77 | -------------------------------------------------------------------------------- /PG/3-Advantage-Actor-Critic/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | lr = 0.001 6 | goal_score = 200 7 | log_interval = 10 8 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 9 | -------------------------------------------------------------------------------- /PG/3-Advantage-Actor-Critic/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | from config import gamma 7 | class QNet(nn.Module): 8 | def __init__(self, num_inputs, num_outputs): 9 | super(QNet, self).__init__() 10 | self.num_inputs = num_inputs 11 | 
self.num_outputs = num_outputs 12 | 13 | self.fc = nn.Linear(num_inputs, 128) 14 | self.fc_actor = nn.Linear(128, num_outputs) 15 | self.fc_critic = nn.Linear(128, 1) 16 | 17 | for m in self.modules(): 18 | if isinstance(m, nn.Linear): 19 | nn.init.xavier_uniform(m.weight) 20 | 21 | def forward(self, input): 22 | x = F.relu(self.fc(input)) 23 | policy = F.softmax(self.fc_actor(x)) 24 | value = self.fc_critic(x) 25 | return policy, value 26 | 27 | @classmethod 28 | def train_model(cls, net, optimizer, transition): 29 | state, next_state, action, reward, mask = transition 30 | 31 | policy, value = net(state) 32 | policy, value = policy.view(-1, net.num_outputs), value.view(-1) 33 | _, next_value = net(next_state) 34 | next_value = next_value.view(-1) 35 | 36 | target = reward + mask * gamma * next_value[0] 37 | td_error = target - value[0] 38 | 39 | log_policy = torch.log(policy[0])[action] 40 | loss_policy = - log_policy * td_error.item() 41 | loss_value = F.mse_loss(value[0], target.detach()) 42 | entropy = torch.log(policy[0]) * policy[0] 43 | 44 | loss = loss_policy + loss_value - 0.1 * entropy.sum() 45 | optimizer.zero_grad() 46 | loss.backward() 47 | optimizer.step() 48 | 49 | return loss 50 | 51 | def get_action(self, input): 52 | policy, _ = self.forward(input) 53 | policy = policy[0].data.numpy() 54 | 55 | action = np.random.choice(self.num_outputs, 1, p=policy)[0] 56 | return action 57 | -------------------------------------------------------------------------------- /PG/3-Advantage-Actor-Critic/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import QNet 11 | from tensorboardX import SummaryWriter 12 | 13 | from config import env_name, goal_score, log_interval, device, lr 14 | 15 | 16 | def main(): 17 | env = gym.make(env_name) 18 | env.seed(500) 19 | torch.manual_seed(500) 20 | 21 | num_inputs = env.observation_space.shape[0] 22 | num_actions = env.action_space.n 23 | print('state size:', num_inputs) 24 | print('action size:', num_actions) 25 | 26 | net = QNet(num_inputs, num_actions) 27 | 28 | optimizer = optim.Adam(net.parameters(), lr=lr) 29 | writer = SummaryWriter('logs') 30 | 31 | net.to(device) 32 | net.train() 33 | running_score = 0 34 | steps = 0 35 | loss = 0 36 | 37 | for e in range(3000): 38 | done = False 39 | 40 | score = 0 41 | state = env.reset() 42 | state = torch.Tensor(state).to(device) 43 | state = state.unsqueeze(0) 44 | 45 | while not done: 46 | steps += 1 47 | 48 | action = net.get_action(state) 49 | next_state, reward, done, _ = env.step(action) 50 | 51 | next_state = torch.Tensor(next_state) 52 | next_state = next_state.unsqueeze(0) 53 | 54 | mask = 0 if done else 1 55 | reward = reward if not done or score == 499 else -1 56 | transition = [state, next_state, action, reward, mask] 57 | 58 | score += reward 59 | state = next_state 60 | 61 | loss = QNet.train_model(net, optimizer, transition) 62 | 63 | score = score if score == 500.0 else score + 1 64 | running_score = 0.99 * running_score + 0.01 * score 65 | if e % log_interval == 0: 66 | print('{} episode | score: {:.2f}'.format( 67 | e, running_score)) 68 | writer.add_scalar('log/score', float(running_score), e) 69 | writer.add_scalar('log/loss', float(loss), e) 70 | 71 | if running_score > goal_score: 72 | break 73 | 74 | 75 | if __name__=="__main__": 76 | main() 77 | 
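Reference note (a hedged summary of the two train_model methods above, in the code's own terms, writing m_t for the terminal mask): 2-Actor-Critic trains a per-action critic Q(s, a) toward the bootstrapped target r_t + gamma * m_t * Q(s_{t+1}, a_{t+1}), where a_{t+1} is freshly sampled from the current policy, and weights the log-probability of the taken action by the detached Q(s_t, a_t). 3-Advantage-Actor-Critic replaces Q with a scalar state value V(s) and uses the one-step TD error as the advantage:

\[
\delta_t = r_t + \gamma\, m_t\, V(s_{t+1}) - V(s_t), \qquad
\mathcal{L} = -\log \pi_\theta(a_t \mid s_t)\,\delta_t
  + \big(V(s_t) - \operatorname{sg}[\, r_t + \gamma\, m_t\, V(s_{t+1}) \,]\big)^2
  - 0.1 \sum_a \pi_\theta(a \mid s_t) \log \pi_\theta(a \mid s_t),
\]

where sg[.] marks the detached target; delta_t itself is detached (via .item()) before scaling the log-policy term, so the policy-gradient part does not backpropagate through the critic estimate.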
-------------------------------------------------------------------------------- /PG/4-GAE/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | lambda_gae = 0.96 6 | lr = 0.0001 7 | goal_score = 200 8 | log_interval = 10 9 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 10 | 11 | ciritic_coefficient = 0.5 12 | entropy_coefficient = 0.01 -------------------------------------------------------------------------------- /PG/4-GAE/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple, deque 3 | 4 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 5 | 6 | class Memory(object): 7 | def __init__(self): 8 | self.memory = deque() 9 | 10 | def push(self, state, next_state, action, reward, mask): 11 | self.memory.append(Transition(state, next_state, action, reward, mask)) 12 | 13 | def sample(self): 14 | memory = self.memory 15 | return Transition(*zip(*memory)) 16 | 17 | def __len__(self): 18 | return len(self.memory) 19 | -------------------------------------------------------------------------------- /PG/4-GAE/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | from config import gamma, lambda_gae, ciritic_coefficient, entropy_coefficient 7 | 8 | from collections import namedtuple 9 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask', 'value', 'return_value', 'advantage')) 10 | 11 | class GAE(nn.Module): 12 | def __init__(self, num_inputs, num_outputs): 13 | super(GAE, self).__init__() 14 | self.num_inputs = num_inputs 15 | self.num_outputs = num_outputs 16 | 17 | self.fc = nn.Linear(num_inputs, 128) 18 | self.fc_actor = nn.Linear(128, num_outputs) 19 | self.fc_critic = nn.Linear(128, 1) 20 | 21 | for m in self.modules(): 22 | if isinstance(m, nn.Linear): 23 | nn.init.xavier_uniform(m.weight) 24 | 25 | def forward(self, input): 26 | x = F.relu(self.fc(input)) 27 | policy = F.softmax(self.fc_actor(x)) 28 | value = self.fc_critic(x) 29 | return policy, value 30 | 31 | @classmethod 32 | def get_gae(self, values, rewards, masks): 33 | returns = torch.zeros_like(rewards) 34 | advantages = torch.zeros_like(rewards) 35 | 36 | running_return = 0 37 | previous_value = 0 38 | running_advantage = 0 39 | 40 | for t in reversed(range(len(rewards))): 41 | running_return = rewards[t] + gamma * running_return * masks[t] 42 | running_tderror = rewards[t] + gamma * previous_value * masks[t] - values.data[t] 43 | running_advantage = running_tderror + (gamma * lambda_gae) * running_advantage * masks[t] 44 | 45 | returns[t] = running_return 46 | previous_value = values.data[t] 47 | advantages[t] = running_advantage 48 | 49 | return returns, advantages 50 | 51 | @classmethod 52 | def train_model(cls, net, transitions, optimizer): 53 | states, actions, rewards, masks = transitions.state, transitions.action, transitions.reward, transitions.mask 54 | 55 | states = torch.stack(states) 56 | actions = torch.stack(actions) 57 | rewards = torch.Tensor(rewards) 58 | masks = torch.Tensor(masks) 59 | 60 | policies, values = net(states) 61 | policies = policies.view(-1, net.num_outputs) 62 | values = values.view(-1) 63 | 64 | returns, advantages = 
net.get_gae(values.view(-1).detach(), rewards, masks) 65 | 66 | log_policies = (torch.log(policies) * actions.detach()).sum(dim=1) 67 | actor_loss = -(log_policies * advantages).sum() 68 | critic_loss = (returns.detach() - values).pow(2).sum() 69 | 70 | entropy = (torch.log(policies) * policies).sum(1).sum() 71 | 72 | loss = actor_loss + ciritic_coefficient * critic_loss - entropy_coefficient * entropy 73 | optimizer.zero_grad() 74 | loss.backward() 75 | optimizer.step() 76 | 77 | return loss 78 | 79 | def get_action(self, input): 80 | policy, _ = self.forward(input) 81 | policy = policy[0].data.numpy() 82 | 83 | action = np.random.choice(self.num_outputs, 1, p=policy)[0] 84 | return action 85 | -------------------------------------------------------------------------------- /PG/4-GAE/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import GAE 11 | from tensorboardX import SummaryWriter 12 | 13 | from memory import Memory 14 | from config import env_name, goal_score, log_interval, device, lr, gamma 15 | 16 | 17 | def main(): 18 | env = gym.make(env_name) 19 | env.seed(500) 20 | torch.manual_seed(500) 21 | 22 | num_inputs = env.observation_space.shape[0] 23 | num_actions = env.action_space.n 24 | print('state size:', num_inputs) 25 | print('action size:', num_actions) 26 | 27 | net = GAE(num_inputs, num_actions) 28 | 29 | optimizer = optim.Adam(net.parameters(), lr=lr) 30 | writer = SummaryWriter('logs') 31 | 32 | net.to(device) 33 | net.train() 34 | running_score = 0 35 | steps = 0 36 | loss = 0 37 | 38 | for e in range(30000): 39 | done = False 40 | memory = Memory() 41 | 42 | score = 0 43 | state = env.reset() 44 | state = torch.Tensor(state).to(device) 45 | state = state.unsqueeze(0) 46 | 47 | while not done: 48 | steps += 1 49 | 50 | action = net.get_action(state) 51 | next_state, reward, done, _ = env.step(action) 52 | 53 | next_state = torch.Tensor(next_state) 54 | next_state = next_state.unsqueeze(0) 55 | 56 | mask = 0 if done else 1 57 | reward = reward if not done or score == 499 else -1 58 | 59 | action_one_hot = torch.zeros(2) 60 | action_one_hot[action] = 1 61 | memory.push(state, next_state, action_one_hot, reward, mask) 62 | 63 | score += reward 64 | state = next_state 65 | 66 | loss = GAE.train_model(net, memory.sample(), optimizer) 67 | 68 | score = score if score == 500.0 else score + 1 69 | running_score = 0.99 * running_score + 0.01 * score 70 | if e % log_interval == 0: 71 | print('{} episode | score: {:.2f}'.format( 72 | e, running_score)) 73 | writer.add_scalar('log/score', float(running_score), e) 74 | writer.add_scalar('log/loss', float(loss), e) 75 | 76 | if running_score > goal_score: 77 | break 78 | 79 | 80 | if __name__=="__main__": 81 | main() 82 | -------------------------------------------------------------------------------- /PG/5-TNPG/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | lr = 0.001 6 | goal_score = 200 7 | log_interval = 10 8 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 9 | -------------------------------------------------------------------------------- /PG/5-TNPG/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | 
from collections import namedtuple, deque 3 | 4 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 5 | 6 | class Memory(object): 7 | def __init__(self): 8 | self.memory = deque() 9 | 10 | def push(self, state, next_state, action, reward, mask): 11 | self.memory.append(Transition(state, next_state, action, reward, mask)) 12 | 13 | def sample(self): 14 | memory = self.memory 15 | return Transition(*zip(*memory)) 16 | 17 | def __len__(self): 18 | return len(self.memory) 19 | -------------------------------------------------------------------------------- /PG/5-TNPG/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import TNPG 11 | from tensorboardX import SummaryWriter 12 | 13 | from memory import Memory 14 | from config import env_name, goal_score, log_interval, device, lr, gamma 15 | 16 | 17 | def main(): 18 | env = gym.make(env_name) 19 | env.seed(500) 20 | torch.manual_seed(500) 21 | 22 | num_inputs = env.observation_space.shape[0] 23 | num_actions = env.action_space.n 24 | print('state size:', num_inputs) 25 | print('action size:', num_actions) 26 | 27 | net = TNPG(num_inputs, num_actions) 28 | writer = SummaryWriter('logs') 29 | 30 | net.to(device) 31 | net.train() 32 | running_score = 0 33 | steps = 0 34 | loss = 0 35 | for e in range(30000): 36 | done = False 37 | memory = Memory() 38 | 39 | score = 0 40 | state = env.reset() 41 | state = torch.Tensor(state).to(device) 42 | state = state.unsqueeze(0) 43 | 44 | while not done: 45 | steps += 1 46 | 47 | action = net.get_action(state) 48 | next_state, reward, done, _ = env.step(action) 49 | 50 | next_state = torch.Tensor(next_state) 51 | next_state = next_state.unsqueeze(0) 52 | 53 | mask = 0 if done else 1 54 | reward = reward if not done or score == 499 else -1 55 | 56 | action_one_hot = torch.zeros(2) 57 | action_one_hot[action] = 1 58 | memory.push(state, next_state, action_one_hot, reward, mask) 59 | 60 | score += reward 61 | state = next_state 62 | 63 | loss = TNPG.train_model(net, memory.sample()) 64 | 65 | score = score if score == 500.0 else score + 1 66 | running_score = 0.99 * running_score + 0.01 * score 67 | if e % log_interval == 0: 68 | print('{} episode | score: {:.2f}'.format( 69 | e, running_score)) 70 | writer.add_scalar('log/score', float(running_score), e) 71 | writer.add_scalar('log/loss', float(loss), e) 72 | 73 | if running_score > goal_score: 74 | break 75 | 76 | 77 | if __name__=="__main__": 78 | main() 79 | -------------------------------------------------------------------------------- /PG/6-TRPO/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | goal_score = 200 6 | log_interval = 10 7 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 8 | 9 | max_kl = 0.01 10 | -------------------------------------------------------------------------------- /PG/6-TRPO/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple, deque 3 | 4 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 5 | 6 | class Memory(object): 7 | def __init__(self): 8 | self.memory = deque() 9 | 10 | def push(self, state, 
next_state, action, reward, mask): 11 | self.memory.append(Transition(state, next_state, action, reward, mask)) 12 | 13 | def sample(self): 14 | memory = self.memory 15 | return Transition(*zip(*memory)) 16 | 17 | def __len__(self): 18 | return len(self.memory) 19 | -------------------------------------------------------------------------------- /PG/6-TRPO/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import TRPO 11 | from tensorboardX import SummaryWriter 12 | 13 | from memory import Memory 14 | from config import env_name, goal_score, log_interval, device, gamma 15 | 16 | 17 | def main(): 18 | env = gym.make(env_name) 19 | env.seed(500) 20 | torch.manual_seed(500) 21 | 22 | num_inputs = env.observation_space.shape[0] 23 | num_actions = env.action_space.n 24 | print('state size:', num_inputs) 25 | print('action size:', num_actions) 26 | 27 | net = TRPO(num_inputs, num_actions) 28 | writer = SummaryWriter('logs') 29 | 30 | net.to(device) 31 | net.train() 32 | running_score = 0 33 | steps = 0 34 | loss = 0 35 | for e in range(30000): 36 | done = False 37 | memory = Memory() 38 | 39 | score = 0 40 | state = env.reset() 41 | state = torch.Tensor(state).to(device) 42 | state = state.unsqueeze(0) 43 | 44 | while not done: 45 | steps += 1 46 | 47 | action = net.get_action(state) 48 | next_state, reward, done, _ = env.step(action) 49 | 50 | next_state = torch.Tensor(next_state) 51 | next_state = next_state.unsqueeze(0) 52 | 53 | mask = 0 if done else 1 54 | reward = reward if not done or score == 499 else -1 55 | 56 | action_one_hot = torch.zeros(2) 57 | action_one_hot[action] = 1 58 | memory.push(state, next_state, action_one_hot, reward, mask) 59 | 60 | score += reward 61 | state = next_state 62 | 63 | loss = TRPO.train_model(net, memory.sample()) 64 | 65 | score = score if score == 500.0 else score + 1 66 | running_score = 0.99 * running_score + 0.01 * score 67 | if e % log_interval == 0: 68 | print('{} episode | score: {:.2f}'.format( 69 | e, running_score)) 70 | writer.add_scalar('log/score', float(running_score), e) 71 | writer.add_scalar('log/loss', float(loss), e) 72 | 73 | if running_score > goal_score: 74 | break 75 | 76 | 77 | if __name__=="__main__": 78 | main() 79 | -------------------------------------------------------------------------------- /PG/7-PPO/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | lr = 0.001 6 | goal_score = 200 7 | log_interval = 10 8 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 9 | 10 | lambda_gae = 0.96 11 | epsilon_clip = 0.2 12 | ciritic_coefficient = 0.5 13 | entropy_coefficient = 0.01 14 | batch_size = 8 15 | epoch_k = 10 16 | -------------------------------------------------------------------------------- /PG/7-PPO/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple, deque 3 | from config import batch_size 4 | 5 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 6 | 7 | class Memory(object): 8 | def __init__(self): 9 | self.memory = deque() 10 | 11 | def push(self, state, next_state, action, reward, mask): 12 | self.memory.append(Transition(state, 
next_state, action, reward, mask)) 13 | 14 | def sample(self): 15 | memory = self.memory 16 | return Transition(*zip(*memory)) 17 | 18 | def __len__(self): 19 | return len(self.memory) 20 | 21 | class BatchMaker(): 22 | def __init__(self, states, actions, returns, advantages, old_policies): 23 | self.states = states 24 | self.actions = actions 25 | self.returns = returns 26 | self.advantages = advantages 27 | self.old_policies = old_policies 28 | 29 | def sample(self): 30 | sample_indexes = random.sample(range(len(self.states)), batch_size) 31 | states_sample = self.states[sample_indexes] 32 | actions_sample = self.actions[sample_indexes] 33 | retruns_sample = self.returns[sample_indexes] 34 | advantages_sample = self.advantages[sample_indexes] 35 | old_policies_sample = self.old_policies[sample_indexes] 36 | 37 | return states_sample, actions_sample, retruns_sample, advantages_sample, old_policies_sample 38 | 39 | -------------------------------------------------------------------------------- /PG/7-PPO/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | import random 6 | 7 | from memory import BatchMaker 8 | from config import gamma, lambda_gae, epsilon_clip, ciritic_coefficient, entropy_coefficient, epoch_k, batch_size 9 | 10 | import warnings 11 | 12 | 13 | class PPO(nn.Module): 14 | def __init__(self, num_inputs, num_outputs): 15 | super(PPO, self).__init__() 16 | self.t = 0 17 | self.num_inputs = num_inputs 18 | self.num_outputs = num_outputs 19 | 20 | self.fc = nn.Linear(num_inputs, 128) 21 | self.fc_actor = nn.Linear(128, num_outputs) 22 | self.fc_critic = nn.Linear(128, 1) 23 | 24 | for m in self.modules(): 25 | if isinstance(m, nn.Linear): 26 | nn.init.xavier_uniform(m.weight) 27 | 28 | def forward(self, input): 29 | x = torch.relu(self.fc(input)) 30 | policy = F.softmax(self.fc_actor(x), dim=-1) 31 | value = self.fc_critic(x) 32 | return policy, value 33 | 34 | @classmethod 35 | def get_gae(self, values, rewards, masks): 36 | returns = torch.zeros_like(rewards) 37 | advantages = torch.zeros_like(rewards) 38 | 39 | running_return = 0 40 | previous_value = 0 41 | running_advantage = 0 42 | 43 | for t in reversed(range(len(rewards))): 44 | running_return = rewards[t] + gamma * running_return * masks[t] 45 | running_tderror = rewards[t] + gamma * previous_value * masks[t] - values.data[t] 46 | running_advantage = running_tderror + (gamma * lambda_gae) * running_advantage * masks[t] 47 | 48 | returns[t] = running_return 49 | previous_value = values.data[t] 50 | advantages[t] = running_advantage 51 | 52 | return returns, advantages 53 | 54 | @classmethod 55 | def train_model(cls, net, transitions, optimizer): 56 | states, actions, rewards, masks = transitions.state, transitions.action, transitions.reward, transitions.mask 57 | 58 | states = torch.stack(states) 59 | actions = torch.stack(actions) 60 | rewards = torch.Tensor(rewards) 61 | masks = torch.Tensor(masks) 62 | 63 | old_policies, old_values = net(states) 64 | old_policies = old_policies.view(-1, net.num_outputs).detach() 65 | returns, advantages = net.get_gae(old_values.view(-1).detach(), rewards, masks) 66 | 67 | batch_maker = BatchMaker(states, actions, returns, advantages, old_policies) 68 | for _ in range(epoch_k): 69 | for _ in range(len(states) // batch_size): 70 | states_sample, actions_sample, returns_sample, advantages_sample, old_policies_sample = batch_maker.sample() 71 | 
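# The block below applies the PPO clipped-surrogate update to the sampled minibatch:
#   ratio_t = pi_theta(a_t|s_t) / pi_old(a_t|s_t), recovered with the stored one-hot
#             actions as ((policies / old_policies_sample) * actions_sample).sum(dim=1)
#   actor_loss  = -sum_t min(ratio_t * A_t, clip(ratio_t, 1 - epsilon_clip, 1 + epsilon_clip) * A_t)
#   critic_loss = sum_t (R_t - V(s_t))^2
#   loss = actor_loss + ciritic_coefficient * critic_loss
#          - entropy_coefficient * mean_t sum_a pi(a|s_t) log pi(a|s_t)
# A_t (advantages_sample) and R_t (returns_sample) come from get_gae() over the whole
# rollout, and the update is repeated epoch_k times over freshly sampled minibatches.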
72 | policies, values = net(states_sample) 73 | values = values.view(-1) 74 | policies = policies.view(-1, net.num_outputs) 75 | 76 | ratios = ((policies / old_policies_sample) * actions_sample.detach()).sum(dim=1) 77 | 78 | 79 | clipped_ratios = torch.clamp(ratios, min=1.0-epsilon_clip, max=1.0+epsilon_clip) 80 | 81 | actor_loss = -torch.min(ratios * advantages_sample, 82 | clipped_ratios * advantages_sample).sum() 83 | 84 | critic_loss = (returns_sample.detach() - values).pow(2).sum() 85 | 86 | policy_entropy = (torch.log(policies) * policies).sum(1, keepdim=True).mean() 87 | 88 | loss = actor_loss + ciritic_coefficient * critic_loss - entropy_coefficient * policy_entropy 89 | 90 | optimizer.zero_grad() 91 | loss.backward() 92 | optimizer.step() 93 | 94 | return loss 95 | 96 | def get_action(self, input): 97 | policy, _ = self.forward(input) 98 | 99 | policy = policy[0].data.numpy() 100 | action = np.random.choice(self.num_outputs, 1, p=policy)[0] 101 | 102 | return action 103 | -------------------------------------------------------------------------------- /PG/7-PPO/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import PPO 11 | from tensorboardX import SummaryWriter 12 | 13 | from memory import Memory 14 | from config import env_name, goal_score, log_interval, device, gamma, lr 15 | 16 | 17 | def main(): 18 | env = gym.make(env_name) 19 | env.seed(500) 20 | torch.manual_seed(500) 21 | 22 | num_inputs = env.observation_space.shape[0] 23 | num_actions = env.action_space.n 24 | print('state size:', num_inputs) 25 | print('action size:', num_actions) 26 | 27 | net = PPO(num_inputs, num_actions) 28 | 29 | optimizer = optim.Adam(net.parameters(), lr=lr) 30 | writer = SummaryWriter('logs') 31 | 32 | net.to(device) 33 | net.train() 34 | running_score = 0 35 | steps = 0 36 | loss = 0 37 | 38 | for e in range(30000): 39 | done = False 40 | memory = Memory() 41 | 42 | score = 0 43 | state = env.reset() 44 | state = torch.Tensor(state).to(device) 45 | state = state.unsqueeze(0) 46 | 47 | while not done: 48 | steps += 1 49 | 50 | action = net.get_action(state) 51 | next_state, reward, done, _ = env.step(action) 52 | 53 | next_state = torch.Tensor(next_state) 54 | next_state = next_state.unsqueeze(0) 55 | 56 | mask = 0 if done else 1 57 | reward = reward if not done or score == 499 else -1 58 | 59 | action_one_hot = torch.zeros(2) 60 | action_one_hot[action] = 1 61 | memory.push(state, next_state, action_one_hot, reward, mask) 62 | 63 | score += reward 64 | state = next_state 65 | 66 | loss = PPO.train_model(net, memory.sample(), optimizer) 67 | 68 | score = score if score == 500.0 else score + 1 69 | if running_score == 0: 70 | running_score = score 71 | running_score = 0.99 * running_score + 0.01 * score 72 | if e % log_interval == 0: 73 | print('{} episode | score: {:.2f}'.format( 74 | e, running_score)) 75 | writer.add_scalar('log/score', float(running_score), e) 76 | writer.add_scalar('log/loss', float(loss), e) 77 | 78 | if running_score > goal_score: 79 | break 80 | 81 | 82 | if __name__=="__main__": 83 | main() 84 | -------------------------------------------------------------------------------- /POMDP/0-DQN/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 
0.99 5 | batch_size = 32 6 | lr = 0.0001 7 | initial_exploration = 1000 8 | goal_score = 200 9 | log_interval = 10 10 | update_target = 100 11 | replay_memory_capacity = 1000 12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | 14 | sequence_length = 4 -------------------------------------------------------------------------------- /POMDP/0-DQN/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple, deque 3 | import torch 4 | 5 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 6 | 7 | 8 | class Memory(object): 9 | def __init__(self, capacity): 10 | self.memory = deque(maxlen=capacity) 11 | self.capacity = capacity 12 | 13 | def push(self, state, next_state, action, reward, mask): 14 | self.memory.append(Transition(torch.stack(list(state)), torch.stack(list(next_state)), action, reward, mask)) 15 | 16 | def sample(self, batch_size): 17 | transitions = random.sample(self.memory, batch_size) 18 | batch = Transition(*zip(*transitions)) 19 | return batch 20 | 21 | def __len__(self): 22 | return len(self.memory) 23 | -------------------------------------------------------------------------------- /POMDP/0-DQN/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from config import gamma, sequence_length 6 | class QNet(nn.Module): 7 | def __init__(self, num_inputs, num_outputs): 8 | super(QNet, self).__init__() 9 | self.num_inputs = num_inputs 10 | self.num_outputs = num_outputs 11 | 12 | self.fc1 = nn.Linear(num_inputs * sequence_length, 128) 13 | self.fc2 = nn.Linear(128, num_outputs) 14 | 15 | for m in self.modules(): 16 | if isinstance(m, nn.Linear): 17 | nn.init.xavier_uniform(m.weight) 18 | 19 | def forward(self, x): 20 | x = x.view(-1, self.num_inputs * sequence_length) 21 | x = F.relu(self.fc1(x)) 22 | qvalue = self.fc2(x) 23 | return qvalue 24 | 25 | @classmethod 26 | def train_model(cls, online_net, target_net, optimizer, batch): 27 | states = torch.stack(batch.state) 28 | next_states = torch.stack(batch.next_state) 29 | actions = torch.Tensor(batch.action).float() 30 | rewards = torch.Tensor(batch.reward) 31 | masks = torch.Tensor(batch.mask) 32 | 33 | pred = online_net(states) 34 | next_pred = target_net(next_states) 35 | 36 | pred = torch.sum(pred.mul(actions), dim=1) 37 | 38 | target = rewards + masks * gamma * next_pred.max(1)[0] 39 | 40 | loss = F.mse_loss(pred, target.detach()) 41 | optimizer.zero_grad() 42 | loss.backward() 43 | optimizer.step() 44 | 45 | return loss 46 | 47 | def get_action(self, input): 48 | qvalue = self.forward(input) 49 | _, action = torch.max(qvalue, 1) 50 | return action.numpy()[0] 51 | -------------------------------------------------------------------------------- /POMDP/0-DQN/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import QNet 11 | from memory import Memory 12 | from tensorboardX import SummaryWriter 13 | 14 | from config import env_name, initial_exploration, batch_size, update_target, goal_score, log_interval, device, replay_memory_capacity, lr, sequence_length 15 | from collections import deque 16 | 17 | def 
get_action(state_series, target_net, epsilon, env): 18 | if np.random.rand() <= epsilon or len(state_series) < sequence_length: 19 | return env.action_space.sample() 20 | else: 21 | return target_net.get_action(torch.stack(list(state_series))) 22 | 23 | def update_target_model(online_net, target_net): 24 | # Target <- Net 25 | target_net.load_state_dict(online_net.state_dict()) 26 | 27 | def state_to_partial_observability(state): 28 | state = state[[0, 2]] 29 | return state 30 | 31 | def main(): 32 | env = gym.make(env_name) 33 | env.seed(500) 34 | torch.manual_seed(500) 35 | 36 | num_inputs = 2 37 | num_actions = env.action_space.n 38 | print('state size:', num_inputs) 39 | print('action size:', num_actions) 40 | 41 | online_net = QNet(num_inputs, num_actions) 42 | target_net = QNet(num_inputs, num_actions) 43 | update_target_model(online_net, target_net) 44 | 45 | optimizer = optim.Adam(online_net.parameters(), lr=lr) 46 | writer = SummaryWriter('logs') 47 | 48 | online_net.to(device) 49 | target_net.to(device) 50 | online_net.train() 51 | target_net.train() 52 | memory = Memory(replay_memory_capacity) 53 | running_score = 0 54 | epsilon = 1.0 55 | steps = 0 56 | loss = 0 57 | 58 | for e in range(30000): 59 | done = False 60 | 61 | state_series = deque(maxlen=sequence_length) 62 | next_state_series = deque(maxlen=sequence_length) 63 | score = 0 64 | state = env.reset() 65 | 66 | state = state_to_partial_observability(state) 67 | state = torch.Tensor(state).to(device) 68 | 69 | next_state_series.append(state) 70 | while not done: 71 | steps += 1 72 | state_series.append(state) 73 | action = get_action(state_series, target_net, epsilon, env) 74 | next_state, reward, done, _ = env.step(action) 75 | 76 | next_state = state_to_partial_observability(next_state) 77 | next_state = torch.Tensor(next_state) 78 | 79 | mask = 0 if done else 1 80 | reward = reward if not done or score == 499 else -1 81 | action_one_hot = np.zeros(2) 82 | action_one_hot[action] = 1 83 | if len(state_series) >= sequence_length: 84 | memory.push(state_series, next_state_series, action_one_hot, reward, mask) 85 | 86 | score += reward 87 | state = next_state 88 | 89 | if steps > initial_exploration: 90 | epsilon -= 0.000005 91 | epsilon = max(epsilon, 0.1) 92 | 93 | batch = memory.sample(batch_size) 94 | loss = QNet.train_model(online_net, target_net, optimizer, batch) 95 | 96 | if steps % update_target == 0: 97 | update_target_model(online_net, target_net) 98 | 99 | score = score if score == 500.0 else score + 1 100 | if running_score == 0: 101 | running_score = score 102 | else: 103 | running_score = 0.99 * running_score + 0.01 * score 104 | if e % log_interval == 0: 105 | print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format( 106 | e, running_score, epsilon)) 107 | writer.add_scalar('log/score', float(running_score), e) 108 | writer.add_scalar('log/loss', float(loss), e) 109 | 110 | if running_score > goal_score: 111 | break 112 | 113 | 114 | if __name__=="__main__": 115 | main() 116 | -------------------------------------------------------------------------------- /POMDP/1-DRQN/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | batch_size = 32 6 | lr = 0.001 7 | initial_exploration = 1000 8 | goal_score = 200 9 | log_interval = 10 10 | update_target = 100 11 | replay_memory_capacity = 100 12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | 14 | sequence_length = 8 15 | 
burn_in_length = 4 -------------------------------------------------------------------------------- /POMDP/1-DRQN/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple, deque 3 | from config import sequence_length 4 | import numpy as np 5 | import torch 6 | 7 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 8 | 9 | class Memory(object): 10 | def __init__(self, capacity): 11 | self.memory = deque(maxlen=capacity) 12 | self.local_memory = [] 13 | self.capacity = capacity 14 | 15 | def push(self, state, next_state, action, reward, mask): 16 | self.local_memory.append(Transition(state, next_state, action, reward, mask)) 17 | if mask == 0: 18 | while len(self.local_memory) < sequence_length: 19 | self.local_memory.insert(0, Transition( 20 | torch.Tensor([0, 0]), 21 | torch.Tensor([0, 0]), 22 | 0, 23 | 0, 24 | 0, 25 | )) 26 | self.memory.append(self.local_memory) 27 | self.local_memory = [] 28 | 29 | def sample(self, batch_size): 30 | batch_state, batch_next_state, batch_action, batch_reward, batch_mask = [], [], [], [], [] 31 | p = np.array([len(episode) for episode in self.memory]) 32 | p = p / p.sum() 33 | 34 | batch_indexes = np.random.choice(np.arange(len(self.memory)), batch_size, p=p) 35 | 36 | for batch_idx in batch_indexes: 37 | episode = self.memory[batch_idx] 38 | 39 | start = random.randint(0, len(episode) - sequence_length) 40 | transitions = episode[start:start + sequence_length] 41 | batch = Transition(*zip(*transitions)) 42 | 43 | batch_state.append(torch.stack(list(batch.state))) 44 | batch_next_state.append(torch.stack(list(batch.next_state))) 45 | batch_action.append(torch.Tensor(list(batch.action))) 46 | batch_reward.append(torch.Tensor(list(batch.reward))) 47 | batch_mask.append(torch.Tensor(list(batch.mask))) 48 | 49 | return Transition(batch_state, batch_next_state, batch_action, batch_reward, batch_mask) 50 | 51 | def __len__(self): 52 | return len(self.memory) -------------------------------------------------------------------------------- /POMDP/1-DRQN/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from config import gamma, device, batch_size, sequence_length, burn_in_length 6 | 7 | class DRQN(nn.Module): 8 | def __init__(self, num_inputs, num_outputs): 9 | super(DRQN, self).__init__() 10 | self.num_inputs = num_inputs 11 | self.num_outputs = num_outputs 12 | 13 | self.lstm = nn.LSTM(input_size=num_inputs, hidden_size=128, batch_first=True) 14 | self.fc1 = nn.Linear(128, 256) 15 | self.fc2 = nn.Linear(256, num_outputs) 16 | 17 | for m in self.modules(): 18 | if isinstance(m, nn.Linear): 19 | nn.init.xavier_uniform(m.weight) 20 | 21 | def forward(self, x, hidden=None): 22 | # x [batch_size, sequence_length, num_inputs] 23 | 24 | if hidden is not None: 25 | out, hidden = self.lstm(x, hidden) 26 | else: 27 | out, hidden = self.lstm(x) 28 | out = F.relu(self.fc1(out)) 29 | qvalue = self.fc2(out) 30 | 31 | return qvalue, hidden 32 | 33 | 34 | @classmethod 35 | def train_model(cls, online_net, target_net, optimizer, batch): 36 | def slice_burn_in(item): 37 | return item[:, burn_in_length:, :] 38 | states = torch.stack(batch.state).view(batch_size, sequence_length, online_net.num_inputs) 39 | next_states = torch.stack(batch.next_state).view(batch_size, sequence_length, online_net.num_inputs) 40 | actions = 
torch.stack(batch.action).view(batch_size, sequence_length, -1).long() 41 | rewards = torch.stack(batch.reward).view(batch_size, sequence_length, -1) 42 | masks = torch.stack(batch.mask).view(batch_size, sequence_length, -1) 43 | 44 | pred, _ = online_net(states) 45 | next_pred, _ = target_net(next_states) 46 | 47 | pred = slice_burn_in(pred) 48 | next_pred = slice_burn_in(next_pred) 49 | actions = slice_burn_in(actions) 50 | rewards = slice_burn_in(rewards) 51 | masks = slice_burn_in(masks) 52 | 53 | pred = pred.gather(2, actions) 54 | 55 | target = rewards + masks * gamma * next_pred.max(2, keepdim=True)[0] 56 | 57 | loss = F.mse_loss(pred, target.detach()) 58 | optimizer.zero_grad() 59 | loss.backward() 60 | optimizer.step() 61 | 62 | return loss 63 | 64 | def get_action(self, state, hidden): 65 | state = state.unsqueeze(0).unsqueeze(0) 66 | 67 | qvalue, hidden = self.forward(state, hidden) 68 | 69 | _, action = torch.max(qvalue, 2) 70 | 71 | return action.numpy()[0][0], hidden 72 | -------------------------------------------------------------------------------- /POMDP/1-DRQN/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import DRQN 11 | from memory import Memory 12 | from tensorboardX import SummaryWriter 13 | 14 | from config import env_name, initial_exploration, batch_size, update_target, goal_score, log_interval, device, replay_memory_capacity, lr, sequence_length 15 | 16 | from collections import deque 17 | 18 | def get_action(state, target_net, epsilon, env, hidden): 19 | action, hidden = target_net.get_action(state, hidden) 20 | 21 | if np.random.rand() <= epsilon: 22 | return env.action_space.sample(), hidden 23 | else: 24 | return action, hidden 25 | 26 | def update_target_model(online_net, target_net): 27 | # Target <- Net 28 | target_net.load_state_dict(online_net.state_dict()) 29 | 30 | def state_to_partial_observability(state): 31 | state = state[[0, 2]] 32 | return state 33 | 34 | def main(): 35 | env = gym.make(env_name) 36 | env.seed(500) 37 | torch.manual_seed(500) 38 | 39 | # num_inputs = env.observation_space.shape[0] 40 | num_inputs = 2 41 | num_actions = env.action_space.n 42 | print('state size:', num_inputs) 43 | print('action size:', num_actions) 44 | 45 | online_net = DRQN(num_inputs, num_actions) 46 | target_net = DRQN(num_inputs, num_actions) 47 | update_target_model(online_net, target_net) 48 | 49 | optimizer = optim.Adam(online_net.parameters(), lr=lr) 50 | writer = SummaryWriter('logs') 51 | 52 | online_net.to(device) 53 | target_net.to(device) 54 | online_net.train() 55 | target_net.train() 56 | memory = Memory(replay_memory_capacity) 57 | running_score = 0 58 | epsilon = 1.0 59 | steps = 0 60 | loss = 0 61 | 62 | for e in range(30000): 63 | done = False 64 | 65 | score = 0 66 | state = env.reset() 67 | state = state_to_partial_observability(state) 68 | state = torch.Tensor(state).to(device) 69 | 70 | hidden = None 71 | 72 | while not done: 73 | steps += 1 74 | 75 | action, hidden = get_action(state, target_net, epsilon, env, hidden) 76 | next_state, reward, done, _ = env.step(action) 77 | 78 | next_state = state_to_partial_observability(next_state) 79 | next_state = torch.Tensor(next_state) 80 | 81 | mask = 0 if done else 1 82 | reward = reward if not done or score == 499 else -1 83 | 84 | memory.push(state, next_state, 
action, reward, mask) 85 | 86 | score += reward 87 | state = next_state 88 | 89 | 90 | if steps > initial_exploration and len(memory) > batch_size: 91 | epsilon -= 0.00005 92 | epsilon = max(epsilon, 0.1) 93 | 94 | batch = memory.sample(batch_size) 95 | loss = DRQN.train_model(online_net, target_net, optimizer, batch) 96 | 97 | if steps % update_target == 0: 98 | update_target_model(online_net, target_net) 99 | 100 | score = score if score == 500.0 else score + 1 101 | if running_score == 0: 102 | running_score = score 103 | else: 104 | running_score = 0.99 * running_score + 0.01 * score 105 | if e % log_interval == 0: 106 | print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format( 107 | e, running_score, epsilon)) 108 | writer.add_scalar('log/score', float(running_score), e) 109 | writer.add_scalar('log/loss', float(loss), e) 110 | 111 | if running_score > goal_score: 112 | break 113 | 114 | 115 | if __name__=="__main__": 116 | main() 117 | -------------------------------------------------------------------------------- /POMDP/2-DRQN-Stack/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | batch_size = 32 6 | lr = 0.001 7 | initial_exploration = 1000 8 | goal_score = 200 9 | log_interval = 10 10 | update_target = 100 11 | replay_memory_capacity = 100 12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | 14 | sequence_length = 8 15 | burn_in_length = 4 -------------------------------------------------------------------------------- /POMDP/2-DRQN-Stack/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple, deque 3 | from config import sequence_length 4 | import numpy as np 5 | import torch 6 | 7 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 8 | 9 | class Memory(object): 10 | def __init__(self, capacity): 11 | self.memory = deque(maxlen=capacity) 12 | self.local_memory = [] 13 | self.capacity = capacity 14 | 15 | def push(self, state, next_state, action, reward, mask): 16 | self.local_memory.append(Transition(state, next_state, action, reward, mask)) 17 | if mask == 0: 18 | while len(self.local_memory) < sequence_length: 19 | self.local_memory.insert(0, Transition( 20 | torch.Tensor([0, 0]), 21 | torch.Tensor([0, 0]), 22 | 0, 23 | 0, 24 | 0, 25 | )) 26 | self.memory.append(self.local_memory) 27 | self.local_memory = [] 28 | 29 | def sample(self, batch_size): 30 | batch_state, batch_next_state, batch_action, batch_reward, batch_mask = [], [], [], [], [] 31 | p = np.array([len(episode) for episode in self.memory]) 32 | p = p / p.sum() 33 | 34 | batch_indexes = np.random.choice(np.arange(len(self.memory)), batch_size, p=p) 35 | 36 | for batch_idx in batch_indexes: 37 | episode = self.memory[batch_idx] 38 | 39 | start = random.randint(0, len(episode) - sequence_length) 40 | transitions = episode[start:start + sequence_length] 41 | batch = Transition(*zip(*transitions)) 42 | 43 | batch_state.append(torch.stack(list(batch.state))) 44 | batch_next_state.append(torch.stack(list(batch.next_state))) 45 | batch_action.append(torch.Tensor(list(batch.action))) 46 | batch_reward.append(torch.Tensor(list(batch.reward))) 47 | batch_mask.append(torch.Tensor(list(batch.mask))) 48 | 49 | return Transition(batch_state, batch_next_state, batch_action, batch_reward, batch_mask) 50 | 51 | def __len__(self): 52 | return len(self.memory) 
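A minimal usage sketch of the episodic replay buffer defined above (the same Memory class as in 1-DRQN). The episode data below is made up purely for illustration, and the sketch assumes it is run from this directory so that memory.py and config.py (sequence_length = 8) are importable:

import torch
from memory import Memory

memory = Memory(capacity=100)

# Store a few short fake episodes of 2-dimensional partial observations.
# A transition with mask == 0 closes the episode; Memory then left-pads it
# with zero transitions up to sequence_length before storing it.
for _ in range(5):
    for t in range(3):
        state = torch.rand(2)
        next_state = torch.rand(2)
        action = 0
        reward = 1.0
        mask = 0 if t == 2 else 1
        memory.push(state, next_state, action, reward, mask)

# sample() picks episodes with probability proportional to their length and cuts a
# random window of sequence_length steps from each; every field is a list of tensors.
batch = memory.sample(batch_size=4)
print(torch.stack(batch.state).shape)   # -> torch.Size([4, 8, 2]), i.e. [batch, seq, obs]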
-------------------------------------------------------------------------------- /POMDP/2-DRQN-Stack/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from config import gamma, device, batch_size, sequence_length, burn_in_length 6 | 7 | class DRQN(nn.Module): 8 | def __init__(self, num_inputs, num_outputs): 9 | super(DRQN, self).__init__() 10 | self.num_inputs = num_inputs 11 | self.num_outputs = num_outputs 12 | 13 | self.lstm = nn.LSTM(input_size=num_inputs, hidden_size=128, batch_first=True) 14 | self.fc1 = nn.Linear(128, 256) 15 | self.fc2 = nn.Linear(256, num_outputs) 16 | 17 | for m in self.modules(): 18 | if isinstance(m, nn.Linear): 19 | nn.init.xavier_uniform(m.weight) 20 | 21 | def forward(self, x): 22 | # x [batch_size, sequence_length, num_inputs] 23 | out, hidden = self.lstm(x) 24 | out = F.relu(self.fc1(out)) 25 | qvalue = self.fc2(out) 26 | 27 | return qvalue 28 | 29 | 30 | @classmethod 31 | def train_model(cls, online_net, target_net, optimizer, batch): 32 | def slice_burn_in(item): 33 | return item[:, burn_in_length:, :] 34 | states = torch.stack(batch.state).view(batch_size, sequence_length, online_net.num_inputs) 35 | next_states = torch.stack(batch.next_state).view(batch_size, sequence_length, online_net.num_inputs) 36 | actions = torch.stack(batch.action).view(batch_size, sequence_length, -1).long() 37 | rewards = torch.stack(batch.reward).view(batch_size, sequence_length, -1) 38 | masks = torch.stack(batch.mask).view(batch_size, sequence_length, -1) 39 | 40 | pred = online_net(states) 41 | next_pred = target_net(next_states) 42 | 43 | pred = slice_burn_in(pred) 44 | next_pred = slice_burn_in(next_pred) 45 | actions = slice_burn_in(actions) 46 | rewards = slice_burn_in(rewards) 47 | masks = slice_burn_in(masks) 48 | 49 | pred = pred.gather(2, actions) 50 | 51 | target = rewards + masks * gamma * next_pred.max(2, keepdim=True)[0] 52 | 53 | loss = F.mse_loss(pred, target.detach()) 54 | optimizer.zero_grad() 55 | loss.backward() 56 | optimizer.step() 57 | 58 | return loss 59 | 60 | def get_action(self, state_series): 61 | state_series = torch.stack(list(state_series)) 62 | state_series = state_series.unsqueeze(0) 63 | 64 | qvalue = self.forward(state_series) 65 | 66 | _, action = torch.max(qvalue, 2) 67 | 68 | return action.numpy()[0][-1] 69 | -------------------------------------------------------------------------------- /POMDP/2-DRQN-Stack/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import DRQN 11 | from memory import Memory 12 | from tensorboardX import SummaryWriter 13 | 14 | from config import env_name, initial_exploration, batch_size, update_target, goal_score, log_interval, device, replay_memory_capacity, lr, sequence_length 15 | 16 | from collections import deque 17 | 18 | def get_action(state_series, target_net, epsilon, env): 19 | if np.random.rand() <= epsilon or len(state_series) < sequence_length: 20 | return env.action_space.sample() 21 | else: 22 | return target_net.get_action(state_series) 23 | 24 | def update_target_model(online_net, target_net): 25 | # Target <- Net 26 | target_net.load_state_dict(online_net.state_dict()) 27 | 28 | 29 | def main(): 30 | env = gym.make(env_name) 31 | 
env.seed(500) 32 | torch.manual_seed(500) 33 | 34 | # num_inputs = env.observation_space.shape[0] 35 | num_inputs = 2 36 | num_actions = env.action_space.n 37 | print('state size:', num_inputs) 38 | print('action size:', num_actions) 39 | 40 | online_net = DRQN(num_inputs, num_actions) 41 | target_net = DRQN(num_inputs, num_actions) 42 | update_target_model(online_net, target_net) 43 | 44 | optimizer = optim.Adam(online_net.parameters(), lr=lr) 45 | writer = SummaryWriter('logs') 46 | 47 | online_net.to(device) 48 | target_net.to(device) 49 | online_net.train() 50 | target_net.train() 51 | memory = Memory(replay_memory_capacity) 52 | running_score = 0 53 | epsilon = 1.0 54 | steps = 0 55 | loss = 0 56 | 57 | for e in range(30000): 58 | done = False 59 | 60 | state_series = deque(maxlen=sequence_length) 61 | score = 0 62 | state = env.reset() 63 | state = torch.Tensor(state[[0, 2]]).to(device) 64 | 65 | # state = torch.Tensor(state).to(device) 66 | 67 | while not done: 68 | steps += 1 69 | state_series.append(state) 70 | 71 | action = get_action(state_series, target_net, epsilon, env) 72 | next_state, reward, done, _ = env.step(action) 73 | 74 | next_state = torch.Tensor(next_state[[0, 2]]) 75 | # next_state = torch.Tensor(next_state) 76 | 77 | mask = 0 if done else 1 78 | reward = reward if not done or score == 499 else -1 79 | 80 | memory.push(state, next_state, action, reward, mask) 81 | 82 | score += reward 83 | state = next_state 84 | 85 | 86 | if steps > initial_exploration and len(memory) > batch_size: 87 | epsilon -= 0.00005 88 | epsilon = max(epsilon, 0.1) 89 | 90 | batch = memory.sample(batch_size) 91 | loss = DRQN.train_model(online_net, target_net, optimizer, batch) 92 | 93 | if steps % update_target == 0: 94 | update_target_model(online_net, target_net) 95 | 96 | score = score if score == 500.0 else score + 1 97 | if running_score == 0: 98 | running_score = score 99 | else: 100 | running_score = 0.99 * running_score + 0.01 * score 101 | if e % log_interval == 0: 102 | print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format( 103 | e, running_score, epsilon)) 104 | writer.add_scalar('log/score', float(running_score), e) 105 | writer.add_scalar('log/loss', float(loss), e) 106 | 107 | if running_score > goal_score: 108 | break 109 | 110 | 111 | if __name__=="__main__": 112 | main() 113 | -------------------------------------------------------------------------------- /POMDP/3-DRQN-Store-State/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | batch_size = 32 6 | lr = 0.001 7 | initial_exploration = 1000 8 | goal_score = 200 9 | log_interval = 10 10 | update_target = 100 11 | replay_memory_capacity = 100 12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | 14 | sequence_length = 8 15 | burn_in_length = 4 -------------------------------------------------------------------------------- /POMDP/3-DRQN-Store-State/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple, deque 3 | from config import sequence_length 4 | import numpy as np 5 | import torch 6 | 7 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask', 'rnn_state')) 8 | 9 | class Memory(object): 10 | def __init__(self, capacity): 11 | self.memory = deque(maxlen=capacity) 12 | self.local_memory = [] 13 | self.capacity = capacity 14 | 15 | def push(self, state, 
next_state, action, reward, mask, rnn_state): 16 | self.local_memory.append(Transition(state, next_state, action, reward, mask, torch.stack(rnn_state).view(2, -1))) 17 | if mask == 0: 18 | self.memory.append(self.local_memory) 19 | self.local_memory = [] 20 | 21 | def sample(self, batch_size): 22 | batch_state, batch_next_state, batch_action, batch_reward, batch_mask, batch_rnn_state = [], [], [], [], [], [] 23 | p = np.array([len(episode) for episode in self.memory]) 24 | p = p / p.sum() 25 | 26 | batch_indexes = np.random.choice(np.arange(len(self.memory)), batch_size, p=p) 27 | 28 | for batch_idx in batch_indexes: 29 | episode = self.memory[batch_idx] 30 | 31 | start = random.randint(0, len(episode) - sequence_length) 32 | transitions = episode[start:start + sequence_length] 33 | batch = Transition(*zip(*transitions)) 34 | 35 | batch_state.append(torch.stack(list(batch.state))) 36 | batch_next_state.append(torch.stack(list(batch.next_state))) 37 | batch_action.append(torch.Tensor(list(batch.action))) 38 | batch_reward.append(torch.Tensor(list(batch.reward))) 39 | batch_mask.append(torch.Tensor(list(batch.mask))) 40 | batch_rnn_state.append(torch.stack(list(batch.rnn_state))) 41 | 42 | return Transition(batch_state, batch_next_state, batch_action, batch_reward, batch_mask, batch_rnn_state) 43 | 44 | def __len__(self): 45 | return len(self.memory) -------------------------------------------------------------------------------- /POMDP/3-DRQN-Store-State/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from config import gamma, device, batch_size, sequence_length, burn_in_length 6 | 7 | class DRQN(nn.Module): 8 | def __init__(self, num_inputs, num_outputs): 9 | super(DRQN, self).__init__() 10 | self.num_inputs = num_inputs 11 | self.num_outputs = num_outputs 12 | 13 | self.lstm = nn.LSTM(input_size=num_inputs, hidden_size=16, batch_first=True) 14 | self.fc1 = nn.Linear(16, 128) 15 | self.fc2 = nn.Linear(128, num_outputs) 16 | 17 | for m in self.modules(): 18 | if isinstance(m, nn.Linear): 19 | nn.init.xavier_uniform(m.weight) 20 | 21 | def forward(self, x, hidden=None): 22 | # x [batch_size, sequence_length, num_inputs] 23 | out, hidden = self.lstm(x, hidden) 24 | 25 | out = F.relu(self.fc1(out)) 26 | qvalue = self.fc2(out) 27 | 28 | return qvalue, hidden 29 | 30 | 31 | @classmethod 32 | def train_model(cls, online_net, target_net, optimizer, batch): 33 | def slice_burn_in(item): 34 | return item[:, burn_in_length:, :] 35 | states = torch.stack(batch.state).view(batch_size, sequence_length, online_net.num_inputs) 36 | next_states = torch.stack(batch.next_state).view(batch_size, sequence_length, online_net.num_inputs) 37 | actions = torch.stack(batch.action).view(batch_size, sequence_length, -1).long() 38 | rewards = torch.stack(batch.reward).view(batch_size, sequence_length, -1) 39 | masks = torch.stack(batch.mask).view(batch_size, sequence_length, -1) 40 | rnn_state = torch.stack(batch.rnn_state).view(batch_size, sequence_length, 2, -1) 41 | 42 | 43 | 44 | [h0, c0] = rnn_state[:, 0, :, :].transpose(0, 1) 45 | h0 = h0.unsqueeze(0).detach() 46 | c0 = c0.unsqueeze(0).detach() 47 | 48 | [h1, c1] = rnn_state[:, 1, :, :].transpose(0, 1) 49 | h1 = h1.unsqueeze(0).detach() 50 | c1 = c1.unsqueeze(0).detach() 51 | 52 | pred, _ = online_net(states, (h0, c0)) 53 | next_pred, _ = target_net(next_states, (h1, c1)) 54 | 55 | pred = slice_burn_in(pred) 56 | next_pred = 
slice_burn_in(next_pred) 57 | actions = slice_burn_in(actions) 58 | rewards = slice_burn_in(rewards) 59 | masks = slice_burn_in(masks) 60 | 61 | pred = pred.gather(2, actions) 62 | 63 | target = rewards + masks * gamma * next_pred.max(2, keepdim=True)[0] 64 | 65 | loss = F.mse_loss(pred, target.detach()) 66 | optimizer.zero_grad() 67 | loss.backward() 68 | optimizer.step() 69 | 70 | return loss 71 | 72 | def get_action(self, state, hidden): 73 | state = state.unsqueeze(0).unsqueeze(0) 74 | 75 | qvalue, hidden = self.forward(state, hidden) 76 | 77 | _, action = torch.max(qvalue, 2) 78 | return action.numpy()[0][0], hidden 79 | -------------------------------------------------------------------------------- /POMDP/3-DRQN-Store-State/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import DRQN 11 | from memory import Memory 12 | from tensorboardX import SummaryWriter 13 | 14 | from config import env_name, initial_exploration, batch_size, update_target, goal_score, log_interval, device, replay_memory_capacity, lr, sequence_length 15 | 16 | from collections import deque 17 | 18 | def get_action(state, target_net, epsilon, env, hidden): 19 | action, hidden = target_net.get_action(state, hidden) 20 | 21 | if np.random.rand() <= epsilon: 22 | return env.action_space.sample(), hidden 23 | else: 24 | return action, hidden 25 | 26 | def update_target_model(online_net, target_net): 27 | # Target <- Net 28 | target_net.load_state_dict(online_net.state_dict()) 29 | 30 | def state_to_partial_observability(state): 31 | state = state[[0, 2]] 32 | return state 33 | 34 | 35 | def main(): 36 | env = gym.make(env_name) 37 | env.seed(500) 38 | torch.manual_seed(500) 39 | 40 | # num_inputs = env.observation_space.shape[0] 41 | num_inputs = 2 42 | num_actions = env.action_space.n 43 | print('state size:', num_inputs) 44 | print('action size:', num_actions) 45 | 46 | online_net = DRQN(num_inputs, num_actions) 47 | target_net = DRQN(num_inputs, num_actions) 48 | update_target_model(online_net, target_net) 49 | 50 | optimizer = optim.Adam(online_net.parameters(), lr=lr) 51 | writer = SummaryWriter('logs') 52 | 53 | online_net.to(device) 54 | target_net.to(device) 55 | online_net.train() 56 | target_net.train() 57 | memory = Memory(replay_memory_capacity) 58 | running_score = 0 59 | epsilon = 1.0 60 | steps = 0 61 | loss = 0 62 | 63 | for e in range(30000): 64 | done = False 65 | 66 | score = 0 67 | state = env.reset() 68 | state = state_to_partial_observability(state) 69 | state = torch.Tensor(state).to(device) 70 | 71 | hidden = (torch.Tensor().new_zeros(1, 1, 16), torch.Tensor().new_zeros(1, 1, 16)) 72 | 73 | while not done: 74 | steps += 1 75 | 76 | action, new_hidden = get_action(state, target_net, epsilon, env, hidden) 77 | next_state, reward, done, _ = env.step(action) 78 | 79 | next_state = state_to_partial_observability(next_state) 80 | next_state = torch.Tensor(next_state) 81 | 82 | mask = 0 if done else 1 83 | reward = reward if not done or score == 499 else -1 84 | 85 | memory.push(state, next_state, action, reward, mask, hidden) 86 | hidden = new_hidden 87 | 88 | score += reward 89 | state = next_state 90 | 91 | 92 | if steps > initial_exploration and len(memory) > batch_size: 93 | epsilon -= 0.00005 94 | epsilon = max(epsilon, 0.1) 95 | 96 | batch = memory.sample(batch_size) 
97 | loss = DRQN.train_model(online_net, target_net, optimizer, batch) 98 | 99 | if steps % update_target == 0: 100 | update_target_model(online_net, target_net) 101 | 102 | score = score if score == 500.0 else score + 1 103 | if running_score == 0: 104 | running_score = score 105 | else: 106 | running_score = 0.99 * running_score + 0.01 * score 107 | if e % log_interval == 0: 108 | print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format( 109 | e, running_score, epsilon)) 110 | writer.add_scalar('log/score', float(running_score), e) 111 | writer.add_scalar('log/loss', float(loss), e) 112 | 113 | if running_score > goal_score: 114 | break 115 | 116 | 117 | if __name__=="__main__": 118 | main() 119 | -------------------------------------------------------------------------------- /POMDP/4-R2D2-Single/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | batch_size = 32 6 | lr = 0.001 7 | initial_exploration = 1000 8 | goal_score = 200 9 | log_interval = 10 10 | update_target = 100 11 | replay_memory_capacity = 1000 12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | 14 | sequence_length = 32 15 | burn_in_length = 4 16 | eta = 0.9 17 | local_mini_batch = 8 18 | n_step = 2 19 | over_lapping_length = 16 -------------------------------------------------------------------------------- /POMDP/4-R2D2-Single/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from config import gamma, device, batch_size, sequence_length, burn_in_length 6 | 7 | class R2D2(nn.Module): 8 | def __init__(self, num_inputs, num_outputs): 9 | super(R2D2, self).__init__() 10 | self.num_inputs = num_inputs 11 | self.num_outputs = num_outputs 12 | 13 | self.lstm = nn.LSTM(input_size=num_inputs, hidden_size=16, batch_first=True) 14 | self.fc = nn.Linear(16, 128) 15 | self.fc_adv = nn.Linear(128, num_outputs) 16 | self.fc_val = nn.Linear(128, 1) 17 | 18 | for m in self.modules(): 19 | if isinstance(m, nn.Linear): 20 | nn.init.xavier_uniform(m.weight) 21 | 22 | def forward(self, x, hidden=None): 23 | # x [batch_size, sequence_length, num_inputs] 24 | batch_size = x.size()[0] 25 | sequence_length = x.size()[1] 26 | out, hidden = self.lstm(x, hidden) 27 | 28 | out = F.relu(self.fc(out)) 29 | adv = self.fc_adv(out) 30 | adv = adv.view(batch_size, sequence_length, self.num_outputs) 31 | val = self.fc_val(out) 32 | val = val.view(batch_size, sequence_length, 1) 33 | 34 | qvalue = val + (adv - adv.mean(dim=2, keepdim=True)) 35 | 36 | return qvalue, hidden 37 | 38 | @classmethod 39 | def get_td_error(cls, online_net, target_net, batch, lengths): 40 | def slice_burn_in(item): 41 | return item[:, burn_in_length:, :] 42 | batch_size = torch.stack(batch.state).size()[0] 43 | states = torch.stack(batch.state).view(batch_size, sequence_length, online_net.num_inputs) 44 | next_states = torch.stack(batch.next_state).view(batch_size, sequence_length, online_net.num_inputs) 45 | actions = torch.stack(batch.action).view(batch_size, sequence_length, -1).long() 46 | rewards = torch.stack(batch.reward).view(batch_size, sequence_length, -1) 47 | masks = torch.stack(batch.mask).view(batch_size, sequence_length, -1) 48 | steps = torch.stack(batch.step).view(batch_size, sequence_length, -1) 49 | rnn_state = torch.stack(batch.rnn_state).view(batch_size, sequence_length, 2, -1) 50 | 51 | [h0, 
c0] = rnn_state[:, 0, :, :].transpose(0, 1) 52 | h0 = h0.unsqueeze(0).detach() 53 | c0 = c0.unsqueeze(0).detach() 54 | 55 | [h1, c1] = rnn_state[:, 1, :, :].transpose(0, 1) 56 | h1 = h1.unsqueeze(0).detach() 57 | c1 = c1.unsqueeze(0).detach() 58 | 59 | pred, _ = online_net(states, (h0, c0)) 60 | next_pred, _ = target_net(next_states, (h1, c1)) 61 | 62 | next_pred_online, _ = online_net(next_states, (h1, c1)) 63 | 64 | pred = slice_burn_in(pred) 65 | next_pred = slice_burn_in(next_pred) 66 | actions = slice_burn_in(actions) 67 | rewards = slice_burn_in(rewards) 68 | masks = slice_burn_in(masks) 69 | steps = slice_burn_in(steps) 70 | next_pred_online = slice_burn_in(next_pred_online) 71 | 72 | pred = pred.gather(2, actions) 73 | 74 | _, next_pred_online_action = next_pred_online.max(2) 75 | 76 | target = rewards + masks * pow(gamma, steps) * next_pred.gather(2, next_pred_online_action.unsqueeze(2)) 77 | 78 | td_error = pred - target.detach() 79 | 80 | for idx, length in enumerate(lengths): 81 | td_error[idx][length-burn_in_length:][:] = 0 82 | 83 | return td_error 84 | 85 | @classmethod 86 | def train_model(cls, online_net, target_net, optimizer, batch, lengths): 87 | td_error = cls.get_td_error(online_net, target_net, batch, lengths) 88 | 89 | loss = pow(td_error, 2).mean() 90 | 91 | optimizer.zero_grad() 92 | loss.backward() 93 | optimizer.step() 94 | 95 | return loss, td_error 96 | 97 | def get_action(self, state, hidden): 98 | state = state.unsqueeze(0).unsqueeze(0) 99 | 100 | qvalue, hidden = self.forward(state, hidden) 101 | 102 | _, action = torch.max(qvalue, 2) 103 | return action.numpy()[0][0], hidden 104 | -------------------------------------------------------------------------------- /POMDP/4-R2D2-Single/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import R2D2 11 | from memory import Memory, LocalBuffer 12 | from tensorboardX import SummaryWriter 13 | 14 | from config import env_name, initial_exploration, batch_size, update_target, goal_score, log_interval, device, replay_memory_capacity, lr, sequence_length, local_mini_batch 15 | 16 | from collections import deque 17 | 18 | def get_action(state, target_net, epsilon, env, hidden): 19 | action, hidden = target_net.get_action(state, hidden) 20 | 21 | if np.random.rand() <= epsilon: 22 | return env.action_space.sample(), hidden 23 | else: 24 | return action, hidden 25 | 26 | def update_target_model(online_net, target_net): 27 | # Target <- Net 28 | target_net.load_state_dict(online_net.state_dict()) 29 | 30 | def state_to_partial_observability(state): 31 | state = state[[0, 2]] 32 | return state 33 | 34 | def main(): 35 | env = gym.make(env_name) 36 | env.seed(500) 37 | torch.manual_seed(500) 38 | 39 | # num_inputs = env.observation_space.shape[0] 40 | num_inputs = 2 41 | num_actions = env.action_space.n 42 | print('state size:', num_inputs) 43 | print('action size:', num_actions) 44 | 45 | online_net = R2D2(num_inputs, num_actions) 46 | target_net = R2D2(num_inputs, num_actions) 47 | update_target_model(online_net, target_net) 48 | 49 | optimizer = optim.Adam(online_net.parameters(), lr=lr) 50 | writer = SummaryWriter('logs') 51 | 52 | online_net.to(device) 53 | target_net.to(device) 54 | online_net.train() 55 | target_net.train() 56 | memory = Memory(replay_memory_capacity) 57 | running_score = 0 58 
| epsilon = 1.0 59 | steps = 0 60 | loss = 0 61 | local_buffer = LocalBuffer() 62 | 63 | for e in range(30000): 64 | done = False 65 | 66 | score = 0 67 | state = env.reset() 68 | state = state_to_partial_observability(state) 69 | state = torch.Tensor(state).to(device) 70 | 71 | hidden = (torch.Tensor().new_zeros(1, 1, 16), torch.Tensor().new_zeros(1, 1, 16)) 72 | 73 | while not done: 74 | steps += 1 75 | 76 | action, new_hidden = get_action(state, target_net, epsilon, env, hidden) 77 | 78 | next_state, reward, done, _ = env.step(action) 79 | 80 | next_state = state_to_partial_observability(next_state) 81 | next_state = torch.Tensor(next_state) 82 | 83 | mask = 0 if done else 1 84 | reward = reward if not done or score == 499 else -1 85 | 86 | local_buffer.push(state, next_state, action, reward, mask, hidden) 87 | hidden = new_hidden 88 | if len(local_buffer.memory) == local_mini_batch: 89 | batch, lengths = local_buffer.sample() 90 | td_error = R2D2.get_td_error(online_net, target_net, batch, lengths) 91 | memory.push(td_error, batch, lengths) 92 | 93 | score += reward 94 | state = next_state 95 | 96 | if steps > initial_exploration and len(memory) > batch_size: 97 | epsilon -= 0.00005 98 | epsilon = max(epsilon, 0.1) 99 | 100 | batch, indexes, lengths = memory.sample(batch_size) 101 | loss, td_error = R2D2.train_model(online_net, target_net, optimizer, batch, lengths) 102 | 103 | memory.update_prior(indexes, td_error, lengths) 104 | 105 | if steps % update_target == 0: 106 | update_target_model(online_net, target_net) 107 | 108 | score = score if score == 500.0 else score + 1 109 | if running_score == 0: 110 | running_score = score 111 | else: 112 | running_score = 0.99 * running_score + 0.01 * score 113 | if e % log_interval == 0: 114 | print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format( 115 | e, running_score, epsilon)) 116 | writer.add_scalar('log/score', float(running_score), e) 117 | writer.add_scalar('log/loss', float(loss), e) 118 | 119 | if running_score > goal_score: 120 | break 121 | 122 | 123 | if __name__=="__main__": 124 | main() 125 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyTorch CartPole Example 2 | A simple CartPole example written with PyTorch. 3 | 4 | ## Why Cartpole? 5 | CartPole is a very easy problem and converges very quickly in most cases. 6 | So you can run these examples on your own computer (training usually takes only 1~2 minutes). 
7 | 8 | ## Rainbow 9 | - [x] DQN [[1]](#reference) 10 | - [x] Double [[2]](#reference) 11 | - [x] Duel [[3]](#reference) 12 | - [x] Multi-step [[4]](#reference) 13 | - [x] PER(Prioritized Experience Replay) [[5]](#reference) 14 | - [x] Noisy-Net [[6]](#reference) 15 | - [x] Distributional(C51) [[7]](#reference) 16 | - [x] Rainbow [[8]](#reference) 17 | 18 | ## PG(Policy Gradient) 19 | - [x] REINFORCE [[9]](#reference) 20 | - [x] Actor Critic [[10]](#reference) 21 | - [x] Advantage Actor Critic 22 | - [x] GAE(Generalized Advantage Estimation) [[12]](#reference) 23 | - [x] TNPG [[20]](#reference) 24 | - [x] TRPO [[13]](#reference) 25 | - [x] PPO - Single Version [[14]](#reference) 26 | 27 | ## Parallel 28 | - [x] Asynchronous Q-learning [[11]](#reference) 29 | - [x] A3C (Asynchronous Advantage Actor Critic) [[11]](#reference) 30 | - [x] ACER [[21]](#reference) 31 | - [ ] PPO [[14]](#reference) 32 | - [x] APE-X DQN [[15]](#reference) 33 | - [ ] IMPALA [[23]](#reference) 34 | - [ ] R2D2 [[16]](#reference) 35 | 36 | ## Distributional DQN 37 | - [x] QRDQN [[18]](#reference) 38 | - [x] IQN [[19]](#reference) 39 | 40 | ## Exploration 41 | - [ ] ICM [[22]](#reference) 42 | - [ ] RND [[17]](#reference) 43 | 44 | ## POMDP (With RNN) 45 | - [x] DQN (use state stack) 46 | - [x] DRQN [[24]](#reference) [[25]](#reference) 47 | - [x] DRQN (use state stack) 48 | - [x] DRQN (store RNN state) [[16]](#reference) 49 | - [x] R2D2 - Single Version [[16]](#reference) 50 | 51 | 52 | ## Reference 53 | [1][Playing Atari with Deep Reinforcement Learning](http://arxiv.org/abs/1312.5602) 54 | [2][Deep Reinforcement Learning with Double Q-learning](http://arxiv.org/abs/1509.06461) 55 | [3][Dueling Network Architectures for Deep Reinforcement Learning](http://arxiv.org/abs/1511.06581) 56 | [4][Reinforcement Learning: An Introduction](http://www.incompleteideas.net/sutton/book/ebook/the-book.html) 57 | [5][Prioritized Experience Replay](http://arxiv.org/abs/1511.05952) 58 | [6][Noisy Networks for Exploration](https://arxiv.org/abs/1706.10295) 59 | [7][A Distributional Perspective on Reinforcement Learning](https://arxiv.org/abs/1707.06887) 60 | [8][Rainbow: Combining Improvements in Deep Reinforcement Learning](https://arxiv.org/abs/1710.02298) 61 | [9][Policy Gradient Methods for Reinforcement Learning with Function Approximation](https://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf) 62 | [10][Actor-Critic Algorithms](https://papers.nips.cc/paper/1786-actor-critic-algorithms.pdf) 63 | [11][Asynchronous Methods for Deep Reinforcement Learning](https://arxiv.org/pdf/1602.01783.pdf) 64 | [12][HIGH-DIMENSIONAL CONTINUOUS CONTROL USING GENERALIZED ADVANTAGE ESTIMATION](https://arxiv.org/pdf/1506.02438.pdf) 65 | [13][Trust Region Policy Optimization](https://arxiv.org/pdf/1502.05477.pdf) 66 | [14][Proximal Policy Optimization](https://arxiv.org/pdf/1707.06347.pdf) 67 | [15][DISTRIBUTED PRIORITIZED EXPERIENCE REPLAY](https://arxiv.org/pdf/1803.00933.pdf) 68 | [16][RECURRENT EXPERIENCE REPLAY IN DISTRIBUTED REINFORCEMENT LEARNING](https://openreview.net/pdf?id=r1lyTjAqYX) 69 | [17][EXPLORATION BY RANDOM NETWORK DISTILLATION](https://openreview.net/pdf?id=H1lJJnR5Ym) 70 | [18][Distributional Reinforcement Learning with Quantile Regression](https://arxiv.org/pdf/1710.10044.pdf) 71 | [19][Implicit Quantile Networks for Distributional Reinforcement Learning](https://arxiv.org/pdf/1806.06923.pdf) 72 | [20][A Natural Policy 
Gradient](https://papers.nips.cc/paper/2073-a-natural-policy-gradient.pdf) 73 | [21][SAMPLE EFFICIENT ACTOR-CRITIC WITH EXPERIENCE REPLAY](https://arxiv.org/pdf/1611.01224.pdf) 74 | [22][Curiosity-driven Exploration by Self-supervised Prediction](https://arxiv.org/pdf/1705.05363.pdf) 75 | [23][IMPALA: Scalable Distributed Deep-RL with Importance Weighted Actor-Learner Architectures](https://arxiv.org/pdf/1802.01561.pdf) 76 | [24][Deep Recurrent Q-Learning for Partially Observable MDPs](https://arxiv.org/pdf/1507.06527.pdf) 77 | [25][Playing FPS Games with Deep Reinforcement Learning](https://arxiv.org/pdf/1609.05521.pdf) 78 | 79 | ## Acknowledgements 80 | - https://github.com/openai/baselines 81 | - https://github.com/reinforcement-learning-kr/pg_travel 82 | - https://github.com/reinforcement-learning-kr/distributional_rl 83 | - https://github.com/Kaixhin/Rainbow 84 | - https://github.com/Kaixhin/ACER 85 | - https://github.com/higgsfield/RL-Adventure-2 86 | 87 | ## Use Cuda 88 | check this issue. https://github.com/g6ling/Reinforcement-Learning-Pytorch-Cartpole/issues/1 89 | -------------------------------------------------------------------------------- /distributional/1-QR-DQN/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | batch_size = 32 6 | lr = 0.001 7 | initial_exploration = 1000 8 | goal_score = 200 9 | log_interval = 10 10 | update_target = 100 11 | replay_memory_capacity = 1000 12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | 14 | 15 | num_support = 8 16 | -------------------------------------------------------------------------------- /distributional/1-QR-DQN/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple, deque 3 | 4 | 5 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 6 | 7 | 8 | class Memory(object): 9 | def __init__(self, capacity): 10 | self.memory = deque(maxlen=capacity) 11 | self.capacity = capacity 12 | 13 | def push(self, state, next_state, action, reward, mask): 14 | self.memory.append(Transition(state, next_state, action, reward, mask)) 15 | 16 | def sample(self, batch_size): 17 | transitions = random.sample(self.memory, batch_size) 18 | batch = Transition(*zip(*transitions)) 19 | return batch 20 | 21 | def __len__(self): 22 | return len(self.memory) 23 | -------------------------------------------------------------------------------- /distributional/1-QR-DQN/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | from config import num_support, batch_size, gamma 7 | 8 | class QRDQN(nn.Module): 9 | def __init__(self, num_inputs, num_outputs): 10 | super(QRDQN, self).__init__() 11 | self.num_inputs = num_inputs 12 | self.num_outputs = num_outputs 13 | 14 | self.num_support = num_support 15 | 16 | self.fc1 = nn.Linear(num_inputs, 128) 17 | self.fc2 = nn.Linear(128, num_outputs * num_support) 18 | 19 | for m in self.modules(): 20 | if isinstance(m, nn.Linear): 21 | nn.init.xavier_uniform(m.weight) 22 | 23 | def forward(self, state): 24 | x = F.relu(self.fc1(state)) 25 | x = self.fc2(x) 26 | theta = x.view(-1, self.num_outputs, self.num_support) 27 | 28 | return theta 29 | 30 | def get_action(self, state): 31 | theta = self.forward(state) 32 
| Q = theta.mean(dim=2, keepdim=True) 33 | action = torch.argmax(Q) 34 | return action.item() 35 | 36 | @classmethod 37 | def train_model(cls, online_net, target_net, optimizer, batch): 38 | states = torch.stack(batch.state) 39 | next_states = torch.stack(batch.next_state) 40 | actions = torch.Tensor(batch.action).long() 41 | rewards = torch.Tensor(batch.reward) 42 | masks = torch.Tensor(batch.mask) 43 | 44 | theta = online_net(states) 45 | action = actions.unsqueeze(1).unsqueeze(1).expand(-1, 1, num_support) 46 | theta_a = theta.gather(1, action).squeeze(1) 47 | 48 | next_theta = target_net(next_states) # batch_size * action * num_support 49 | next_action = next_theta.mean(dim=2).max(1)[1] # batch_size 50 | next_action = next_action.unsqueeze(1).unsqueeze(1).expand(batch_size, 1, num_support) 51 | next_theta_a = next_theta.gather(1, next_action).squeeze(1) # batch_size * num_support 52 | 53 | T_theta = rewards.unsqueeze(1) + gamma * next_theta_a * masks.unsqueeze(1) 54 | 55 | T_theta_tile = T_theta.view(-1, num_support, 1).expand(-1, num_support, num_support) 56 | theta_a_tile = theta_a.view(-1, 1, num_support).expand(-1, num_support, num_support) 57 | 58 | error_loss = T_theta_tile - theta_a_tile 59 | huber_loss = F.smooth_l1_loss(theta_a_tile, T_theta_tile.detach(), reduction='none') 60 | tau = torch.arange(0.5 * (1 / num_support), 1, 1 / num_support).view(1, num_support) 61 | 62 | loss = (tau - (error_loss < 0).float()).abs() * huber_loss 63 | loss = loss.mean(dim=2).sum(dim=1).mean() 64 | 65 | optimizer.zero_grad() 66 | loss.backward() 67 | optimizer.step() 68 | 69 | return loss -------------------------------------------------------------------------------- /distributional/1-QR-DQN/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import argparse 6 | import numpy as np 7 | 8 | import torch 9 | import torch.optim as optim 10 | import torch.nn.functional as F 11 | from tensorboardX import SummaryWriter 12 | 13 | from model import QRDQN 14 | from memory import Memory 15 | 16 | from config import env_name, initial_exploration, batch_size, update_target, goal_score, log_interval, device, replay_memory_capacity, lr 17 | 18 | 19 | def get_action(state, target_net, epsilon, env): 20 | if np.random.rand() <= epsilon: 21 | return env.action_space.sample() 22 | else: 23 | return target_net.get_action(state) 24 | 25 | def update_target_model(online_net, target_net): 26 | # Target <- Net 27 | target_net.load_state_dict(online_net.state_dict()) 28 | 29 | 30 | 31 | def main(): 32 | env = gym.make(env_name) 33 | env.seed(500) 34 | torch.manual_seed(500) 35 | 36 | num_inputs = env.observation_space.shape[0] 37 | num_actions = env.action_space.n 38 | print('state size:', num_inputs) 39 | print('action size:', num_actions) 40 | 41 | online_net = QRDQN(num_inputs, num_actions) 42 | target_net = QRDQN(num_inputs, num_actions) 43 | update_target_model(online_net, target_net) 44 | 45 | optimizer = optim.Adam(online_net.parameters(), lr=lr) 46 | writer = SummaryWriter('logs') 47 | 48 | online_net.to(device) 49 | target_net.to(device) 50 | online_net.train() 51 | target_net.train() 52 | memory = Memory(replay_memory_capacity) 53 | running_score = 0 54 | epsilon = 1.0 55 | steps = 0 56 | loss = 0 57 | 58 | for e in range(3000): 59 | done = False 60 | 61 | score = 0 62 | state = env.reset() 63 | state = torch.Tensor(state) 64 | state = state.unsqueeze(0) 65 | 66 | while not done: 67 | steps += 1 68 | 
action = get_action(state, target_net, epsilon, env) 69 | next_state, reward, done, _ = env.step(action) 70 | 71 | next_state = torch.Tensor(next_state) 72 | next_state = next_state.unsqueeze(0) 73 | 74 | mask = 0 if done else 1 75 | reward = reward if not done or score == 499 else -1 76 | memory.push(state, next_state, action, reward, mask) 77 | 78 | score += reward 79 | state = next_state 80 | 81 | if steps > initial_exploration: 82 | epsilon -= 0.00005 83 | epsilon = max(epsilon, 0.1) 84 | 85 | batch = memory.sample(batch_size) 86 | loss = QRDQN.train_model(online_net, target_net, optimizer, batch) 87 | 88 | if steps % update_target == 0: 89 | update_target_model(online_net, target_net) 90 | 91 | score = score if score == 500.0 else score + 1 92 | running_score = 0.99 * running_score + 0.01 * score 93 | if e % log_interval == 0: 94 | print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format( 95 | e, running_score, epsilon)) 96 | writer.add_scalar('log/score', float(running_score), e) 97 | writer.add_scalar('log/loss', float(loss), e) 98 | 99 | if running_score > goal_score: 100 | break 101 | 102 | if __name__=="__main__": 103 | main() 104 | -------------------------------------------------------------------------------- /distributional/2-IQN/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | batch_size = 32 6 | lr = 0.001 7 | initial_exploration = 1000 8 | goal_score = 200 9 | log_interval = 10 10 | update_target = 100 11 | replay_memory_capacity = 1000 12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | 14 | 15 | num_quantile_sample = 32 16 | num_tau_sample = 16 17 | num_tau_prime_sample = 8 18 | quantile_embedding_dim = 64 -------------------------------------------------------------------------------- /distributional/2-IQN/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple, deque 3 | 4 | 5 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 6 | 7 | 8 | class Memory(object): 9 | def __init__(self, capacity): 10 | self.memory = deque(maxlen=capacity) 11 | self.capacity = capacity 12 | 13 | def push(self, state, next_state, action, reward, mask): 14 | self.memory.append(Transition(state, next_state, action, reward, mask)) 15 | 16 | def sample(self, batch_size): 17 | transitions = random.sample(self.memory, batch_size) 18 | batch = Transition(*zip(*transitions)) 19 | return batch 20 | 21 | def __len__(self): 22 | return len(self.memory) 23 | -------------------------------------------------------------------------------- /distributional/2-IQN/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | from config import batch_size, gamma, quantile_embedding_dim, num_tau_sample, num_tau_prime_sample, num_quantile_sample 7 | 8 | class IQN(nn.Module): 9 | def __init__(self, num_inputs, num_outputs): 10 | super(IQN, self).__init__() 11 | self.num_inputs = num_inputs 12 | self.num_outputs = num_outputs 13 | 14 | self.fc1 = nn.Linear(num_inputs, 128) 15 | self.fc2 = nn.Linear(128, num_outputs) 16 | self.phi = nn.Linear(quantile_embedding_dim, 128) 17 | 18 | for m in self.modules(): 19 | if isinstance(m, nn.Linear): 20 | nn.init.xavier_uniform(m.weight) 21 | 22 | def forward(self, 
state, tau, num_quantiles): 23 | input_size = state.size()[0] # batch_size(train) or 1(get_action) 24 | tau = tau.expand(input_size * num_quantiles, quantile_embedding_dim) 25 | pi_mtx = torch.Tensor(np.pi * np.arange(0, quantile_embedding_dim)).expand(input_size * num_quantiles, quantile_embedding_dim) 26 | cos_tau = torch.cos(tau * pi_mtx) 27 | 28 | phi = self.phi(cos_tau) 29 | phi = F.relu(phi) 30 | 31 | state_tile = state.expand(input_size, num_quantiles, self.num_inputs) 32 | state_tile = state_tile.flatten().view(-1, self.num_inputs) 33 | 34 | x = F.relu(self.fc1(state_tile)) 35 | x = self.fc2(x * phi) 36 | z = x.view(-1, num_quantiles, self.num_outputs) 37 | 38 | z = z.transpose(1, 2) # [input_size, num_output, num_quantile] 39 | return z 40 | 41 | def get_action(self, state): 42 | tau = torch.Tensor(np.random.rand(num_quantile_sample, 1) * 0.5) # CVaR 43 | z = self.forward(state, tau, num_quantile_sample) 44 | q = z.mean(dim=2, keepdim=True) 45 | action = torch.argmax(q) 46 | return action.item() 47 | 48 | @classmethod 49 | def train_model(cls, online_net, target_net, optimizer, batch): 50 | states = torch.stack(batch.state) 51 | next_states = torch.stack(batch.next_state) 52 | actions = torch.Tensor(batch.action).long() 53 | rewards = torch.Tensor(batch.reward) 54 | masks = torch.Tensor(batch.mask) 55 | 56 | tau = torch.Tensor(np.random.rand(batch_size * num_tau_sample, 1)) 57 | z = online_net(states, tau, num_tau_sample) 58 | action = actions.unsqueeze(1).unsqueeze(1).expand(-1, 1, num_tau_sample) 59 | z_a = z.gather(1, action).squeeze(1) 60 | 61 | tau_prime = torch.Tensor(np.random.rand(batch_size * num_tau_prime_sample, 1)) 62 | next_z = target_net(next_states, tau_prime, num_tau_prime_sample) 63 | next_action = next_z.mean(dim=2).max(1)[1] 64 | next_action = next_action.unsqueeze(1).unsqueeze(1).expand(batch_size, 1, num_tau_prime_sample) 65 | next_z_a = next_z.gather(1, next_action).squeeze(1) 66 | 67 | T_z = rewards.unsqueeze(1) + gamma * next_z_a * masks.unsqueeze(1) 68 | 69 | T_z_tile = T_z.view(-1, num_tau_prime_sample, 1).expand(-1, num_tau_prime_sample, num_tau_sample) 70 | z_a_tile = z_a.view(-1, 1, num_tau_sample).expand(-1, num_tau_prime_sample, num_tau_sample) 71 | 72 | error_loss = T_z_tile - z_a_tile 73 | huber_loss = F.smooth_l1_loss(z_a_tile, T_z_tile.detach(), reduction='none') 74 | tau = torch.arange(0, 1, 1 / num_tau_sample).view(1, num_tau_sample) 75 | 76 | loss = (tau - (error_loss < 0).float()).abs() * huber_loss 77 | loss = loss.mean(dim=2).sum(dim=1).mean() 78 | 79 | optimizer.zero_grad() 80 | loss.backward() 81 | optimizer.step() 82 | 83 | return loss -------------------------------------------------------------------------------- /distributional/2-IQN/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import argparse 6 | import numpy as np 7 | 8 | import torch 9 | import torch.optim as optim 10 | import torch.nn.functional as F 11 | from tensorboardX import SummaryWriter 12 | 13 | from model import IQN 14 | from memory import Memory 15 | 16 | from config import env_name, initial_exploration, batch_size, update_target, goal_score, log_interval, device, replay_memory_capacity, lr 17 | 18 | 19 | def get_action(state, target_net, epsilon, env): 20 | if np.random.rand() <= epsilon: 21 | return env.action_space.sample() 22 | else: 23 | return target_net.get_action(state) 24 | 25 | def update_target_model(online_net, target_net): 26 | # Target <- Net 27 | 
target_net.load_state_dict(online_net.state_dict()) 28 | 29 | 30 | def main(): 31 | env = gym.make(env_name) 32 | env.seed(500) 33 | torch.manual_seed(500) 34 | 35 | num_inputs = env.observation_space.shape[0] 36 | num_actions = env.action_space.n 37 | print('state size:', num_inputs) 38 | print('action size:', num_actions) 39 | 40 | online_net = IQN(num_inputs, num_actions) 41 | target_net = IQN(num_inputs, num_actions) 42 | update_target_model(online_net, target_net) 43 | 44 | optimizer = optim.Adam(online_net.parameters(), lr=lr) 45 | writer = SummaryWriter('logs') 46 | 47 | online_net.to(device) 48 | target_net.to(device) 49 | online_net.train() 50 | target_net.train() 51 | memory = Memory(replay_memory_capacity) 52 | running_score = 0 53 | epsilon = 1.0 54 | steps = 0 55 | loss = 0 56 | 57 | for e in range(3000): 58 | done = False 59 | 60 | score = 0 61 | state = env.reset() 62 | state = torch.Tensor(state) 63 | state = state.unsqueeze(0) 64 | 65 | while not done: 66 | steps += 1 67 | action = get_action(state, target_net, epsilon, env) 68 | next_state, reward, done, _ = env.step(action) 69 | 70 | next_state = torch.Tensor(next_state) 71 | next_state = next_state.unsqueeze(0) 72 | 73 | mask = 0 if done else 1 74 | reward = reward if not done or score == 499 else -1 75 | memory.push(state, next_state, action, reward, mask) 76 | 77 | score += reward 78 | state = next_state 79 | 80 | if steps > initial_exploration: 81 | epsilon -= 0.00005 82 | epsilon = max(epsilon, 0.1) 83 | 84 | batch = memory.sample(batch_size) 85 | loss = IQN.train_model(online_net, target_net, optimizer, batch) 86 | 87 | if steps % update_target == 0: 88 | update_target_model(online_net, target_net) 89 | 90 | score = score if score == 500.0 else score + 1 91 | running_score = 0.99 * running_score + 0.01 * score 92 | if e % log_interval == 0: 93 | print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format( 94 | e, running_score, epsilon)) 95 | writer.add_scalar('log/score', float(running_score), e) 96 | writer.add_scalar('log/loss', float(loss), e) 97 | 98 | if running_score > goal_score: 99 | break 100 | 101 | if __name__=="__main__": 102 | main() 103 | -------------------------------------------------------------------------------- /parallel/1-Async-Q-Learning/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | lr = 0.001 6 | goal_score = 200 7 | log_interval = 10 8 | update_target = 100 9 | async_update_step = 10 10 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 11 | max_episode = 30000 12 | -------------------------------------------------------------------------------- /parallel/1-Async-Q-Learning/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple, deque 3 | 4 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 5 | 6 | 7 | class Memory(object): 8 | def __init__(self, capacity): 9 | self.memory = deque(maxlen=capacity) 10 | self.capacity = capacity 11 | 12 | def push(self, state, next_state, action, reward, mask): 13 | self.memory.append(Transition(state, next_state, action, reward, mask)) 14 | 15 | def sample(self): 16 | batch = Transition(*zip(*self.memory)) 17 | return batch 18 | 19 | def __len__(self): 20 | return len(self.memory) 21 | -------------------------------------------------------------------------------- 
/parallel/1-Async-Q-Learning/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from config import gamma 6 | 7 | class QNet(nn.Module): 8 | def __init__(self, num_inputs, num_outputs): 9 | super(QNet, self).__init__() 10 | self.num_inputs = num_inputs 11 | self.num_outputs = num_outputs 12 | 13 | self.fc1 = nn.Linear(num_inputs, 128) 14 | self.fc2 = nn.Linear(128, num_outputs) 15 | 16 | for m in self.modules(): 17 | if isinstance(m, nn.Linear): 18 | nn.init.xavier_uniform(m.weight) 19 | 20 | def forward(self, x): 21 | x = F.relu(self.fc1(x)) 22 | qvalue = self.fc2(x) 23 | return qvalue 24 | 25 | @classmethod 26 | def train_model(cls, online_net, target_net, optimizer, batch): 27 | states = torch.stack(batch.state) 28 | next_states = torch.stack(batch.next_state) 29 | actions = torch.Tensor(batch.action).float() 30 | rewards = torch.Tensor(batch.reward) 31 | masks = torch.Tensor(batch.mask) 32 | 33 | pred = online_net(states).squeeze(1) 34 | next_pred = target_net(next_states).squeeze(1) 35 | 36 | pred = torch.sum(pred.mul(actions), dim=1) 37 | 38 | target = rewards + masks * gamma * next_pred.max(1)[0] 39 | 40 | loss = torch.sum((pred - target.detach()) ** 2) 41 | optimizer.zero_grad() 42 | loss.backward() 43 | optimizer.step() 44 | 45 | return loss 46 | 47 | def get_action(self, input): 48 | qvalue = self.forward(input) 49 | _, action = torch.max(qvalue, 1) 50 | return action.numpy()[0] 51 | -------------------------------------------------------------------------------- /parallel/1-Async-Q-Learning/shared_adam.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class SharedAdam(torch.optim.Adam): # extend a pytorch optimizer so it shares grads across processes 4 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): 5 | super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay) 6 | for group in self.param_groups: 7 | for p in group['params']: 8 | state = self.state[p] 9 | state['shared_steps'], state['step'] = torch.zeros(1).share_memory_(), 0 10 | state['exp_avg'] = p.data.new().resize_as_(p.data).zero_().share_memory_() 11 | state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_().share_memory_() 12 | 13 | def step(self, closure=None): 14 | for group in self.param_groups: 15 | for p in group['params']: 16 | if p.grad is None: continue 17 | self.state[p]['shared_steps'] += 1 18 | self.state[p]['step'] = self.state[p]['shared_steps'][0] - 1 # a "step += 1" comes later 19 | super(SharedAdam, self).step(closure)  # call the parent Adam step (bare `super.step` would raise AttributeError) 20 | -------------------------------------------------------------------------------- /parallel/1-Async-Q-Learning/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import QNet 11 | from worker import Worker 12 | from tensorboardX import SummaryWriter 13 | import torch.multiprocessing as mp 14 | from shared_adam import SharedAdam 15 | 16 | from config import env_name, lr, device 17 | 18 | 19 | def main(): 20 | env = gym.make(env_name) 21 | env.seed(500) 22 | torch.manual_seed(500) 23 | 24 | num_inputs = env.observation_space.shape[0] 25 | num_actions = env.action_space.n 26 | print('state size:', num_inputs) 27 | print('action size:', 
num_actions) 28 | 29 | online_net = QNet(num_inputs, num_actions) 30 | target_net = QNet(num_inputs, num_actions) 31 | target_net.load_state_dict(online_net.state_dict()) 32 | online_net.share_memory() 33 | target_net.share_memory() 34 | 35 | optimizer = SharedAdam(online_net.parameters(), lr=lr) 36 | global_ep, global_ep_r, res_queue = mp.Value('i', 0), mp.Value('d', 0.), mp.Queue() 37 | 38 | writer = SummaryWriter('logs') 39 | 40 | online_net.to(device) 41 | target_net.to(device) 42 | online_net.train() 43 | target_net.train() 44 | 45 | workers = [Worker(online_net, target_net, optimizer, global_ep, global_ep_r, res_queue, i) for i in range(mp.cpu_count())] 46 | [w.start() for w in workers] 47 | res = [] 48 | while True: 49 | r = res_queue.get() 50 | if r is not None: 51 | res.append(r) 52 | [ep, ep_r, loss] = r 53 | writer.add_scalar('log/score', float(ep_r), ep) 54 | writer.add_scalar('log/loss', float(loss), ep) 55 | else: 56 | break 57 | [w.join() for w in workers] 58 | 59 | 60 | if __name__=="__main__": 61 | main() 62 | -------------------------------------------------------------------------------- /parallel/1-Async-Q-Learning/worker.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.multiprocessing as mp 4 | import numpy as np 5 | from model import QNet 6 | from memory import Memory 7 | 8 | from config import env_name, async_update_step, update_target, max_episode, device, log_interval, goal_score 9 | 10 | class Worker(mp.Process): 11 | def __init__(self, online_net, target_net, optimizer, global_ep, global_ep_r, res_queue, name): 12 | super(Worker, self).__init__() 13 | 14 | self.env = gym.make(env_name) 15 | self.env.seed(500) 16 | 17 | self.name = 'w%i' % name 18 | self.global_ep, self.global_ep_r, self.res_queue = global_ep, global_ep_r, res_queue 19 | self.online_net, self.target_net, self.optimizer = online_net, target_net, optimizer 20 | 21 | def record(self, score, epsilon, loss): 22 | with self.global_ep.get_lock(): 23 | self.global_ep.value += 1 24 | with self.global_ep_r.get_lock(): 25 | if self.global_ep_r.value == 0.: 26 | self.global_ep_r.value = score 27 | else: 28 | self.global_ep_r.value = 0.99 * self.global_ep_r.value + 0.01 * score 29 | if self.global_ep.value % log_interval == 0: 30 | print('{} , {} episode | score: {:.2f}, | epsilon: {:.2f}'.format( 31 | self.name, self.global_ep.value, self.global_ep_r.value, epsilon)) 32 | 33 | self.res_queue.put([self.global_ep.value, self.global_ep_r.value, loss]) 34 | 35 | 36 | def update_target_model(self): 37 | self.target_net.load_state_dict(self.online_net.state_dict()) 38 | 39 | def get_action(self, state, epsilon): 40 | if np.random.rand() <= epsilon: 41 | return self.env.action_space.sample() 42 | else: 43 | return self.target_net.get_action(state) 44 | 45 | def run(self): 46 | epsilon = 1.0 47 | steps = 0 48 | while self.global_ep.value < max_episode: 49 | if self.global_ep_r.value > goal_score: 50 | break 51 | done = False 52 | 53 | score = 0 54 | state = self.env.reset() 55 | state = torch.Tensor(state).to(device) 56 | state = state.unsqueeze(0) 57 | 58 | memory = Memory(async_update_step) 59 | 60 | while not done: 61 | steps += 1 62 | 63 | action = self.get_action(state, epsilon) 64 | next_state, reward, done, _ = self.env.step(action) 65 | 66 | next_state = torch.Tensor(next_state) 67 | next_state = next_state.unsqueeze(0) 68 | 69 | mask = 0 if done else 1 70 | reward = reward if not done or score == 499 else -1 71 | 
action_one_hot = np.zeros(2) 72 | action_one_hot[action] = 1 73 | memory.push(state, next_state, action_one_hot, reward, mask) 74 | 75 | score += reward 76 | state = next_state 77 | 78 | epsilon -= 0.00001 79 | epsilon = max(epsilon, 0.1) 80 | 81 | if len(memory) == async_update_step or done: 82 | batch = memory.sample() 83 | loss = QNet.train_model(self.online_net, self.target_net, self.optimizer, batch) 84 | memory = Memory(async_update_step) 85 | if done: 86 | self.record(score, epsilon, loss) 87 | break 88 | if steps % update_target == 0: 89 | self.update_target_model() 90 | 91 | score = score if score == 500.0 else score + 1 92 | 93 | self.res_queue.put(None) 94 | -------------------------------------------------------------------------------- /parallel/2-A3C/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | lr = 0.0001 6 | goal_score = 200 7 | log_interval = 10 8 | n_step = 10 9 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 10 | max_episode = 30000 11 | -------------------------------------------------------------------------------- /parallel/2-A3C/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple 3 | 4 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 5 | 6 | 7 | class Memory(object): 8 | def __init__(self, capacity): 9 | self.memory = [] 10 | self.capacity = capacity 11 | self.position = 0 12 | 13 | def push(self, state, next_state, action, reward, mask): 14 | """Saves a transition.""" 15 | if len(self.memory) < self.capacity: 16 | self.memory.append(Transition(state, next_state, action, reward, mask)) 17 | self.memory[self.position] = Transition(state, next_state, action, reward, mask) 18 | self.position = (self.position + 1) % self.capacity 19 | 20 | def sample(self): 21 | transitions = self.memory 22 | batch = Transition(*zip(*transitions)) 23 | return batch 24 | 25 | def __len__(self): 26 | return len(self.memory) 27 | -------------------------------------------------------------------------------- /parallel/2-A3C/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from config import gamma 6 | 7 | class Model(nn.Module): 8 | def __init__(self, num_inputs, num_outputs): 9 | super(Model, self).__init__() 10 | self.num_inputs = num_inputs 11 | self.num_outputs = num_outputs 12 | 13 | self.fc = nn.Linear(num_inputs, 128) 14 | self.fc_actor = nn.Linear(128, num_outputs) 15 | self.fc_critic = nn.Linear(128, 1) 16 | 17 | for m in self.modules(): 18 | if isinstance(m, nn.Linear): 19 | nn.init.xavier_uniform(m.weight) 20 | 21 | def forward(self, input): 22 | x = F.relu(self.fc(input)) 23 | policy = F.softmax(self.fc_actor(x)) 24 | value = self.fc_critic(x) 25 | return policy, value 26 | 27 | def get_action(self, input): 28 | policy, _ = self.forward(input) 29 | policy = policy[0].data.numpy() 30 | 31 | action = np.random.choice(self.num_outputs, 1, p=policy)[0] 32 | return action 33 | 34 | 35 | class GlobalModel(Model): 36 | def __init__(self, num_inputs, num_outputs): 37 | super(GlobalModel, self).__init__(num_inputs, num_outputs) 38 | 39 | 40 | class LocalModel(Model): 41 | def __init__(self, num_inputs, num_outputs): 42 | super(LocalModel, self).__init__(num_inputs, num_outputs) 43 | 
44 | def push_to_global_model(self, batch, global_model, global_optimizer): 45 | states = torch.stack(batch.state) 46 | next_states = torch.stack(batch.next_state) 47 | actions = torch.stack(batch.action) 48 | rewards = torch.Tensor(batch.reward) 49 | masks = torch.Tensor(batch.mask) 50 | 51 | policy, value = self.forward(states) 52 | policy = policy.view(-1, self.num_outputs) 53 | value = value.view(-1) 54 | 55 | _, last_value = self.forward(next_states[-1]) 56 | 57 | running_return = last_value[0].data 58 | running_returns = torch.zeros(rewards.size()) 59 | for t in reversed(range(0, len(rewards))): 60 | running_return = rewards[t] + gamma * running_return * masks[t] 61 | running_returns[t] = running_return 62 | 63 | 64 | td_error = running_returns - value.detach() 65 | log_policy = (torch.log(policy + 1e-10) * actions).sum(dim=1, keepdim=True) 66 | loss_policy = - log_policy * td_error 67 | loss_value = torch.pow(td_error, 2) 68 | entropy = (torch.log(policy + 1e-10) * policy).sum(dim=1, keepdim=True) 69 | 70 | loss = (loss_policy + loss_value - 0.01 * entropy).mean() 71 | 72 | global_optimizer.zero_grad() 73 | loss.backward() 74 | for lp, gp in zip(self.parameters(), global_model.parameters()): 75 | gp._grad = lp.grad 76 | global_optimizer.step() 77 | 78 | return loss 79 | 80 | def pull_from_global_model(self, global_model): 81 | self.load_state_dict(global_model.state_dict()) 82 | -------------------------------------------------------------------------------- /parallel/2-A3C/shared_adam.py: -------------------------------------------------------------------------------- 1 | import torch 2 | class SharedAdam(torch.optim.Adam): # extend a pytorch optimizer so it shares grads across processes 3 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): 4 | super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay) 5 | for group in self.param_groups: 6 | for p in group['params']: 7 | state = self.state[p] 8 | state['shared_steps'], state['step'] = torch.zeros(1).share_memory_(), 0 9 | state['exp_avg'] = p.data.new().resize_as_(p.data).zero_().share_memory_() 10 | state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_().share_memory_() 11 | 12 | def step(self, closure=None): 13 | for group in self.param_groups: 14 | for p in group['params']: 15 | if p.grad is None: continue 16 | self.state[p]['shared_steps'] += 1 17 | self.state[p]['step'] = self.state[p]['shared_steps'][0] - 1 # a "step += 1" comes later 18 | super(SharedAdam, self).step(closure)  # call the parent Adam step (bare `super.step` would raise AttributeError) 19 | 20 | # class SharedAdam(torch.optim.Adam): 21 | # def __init__(self, params, lr=1e-3, betas=(0.9, 0.9), eps=1e-8, 22 | # weight_decay=0): 23 | # super(SharedAdam, self).__init__(params, lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 24 | # # State initialization 25 | # for group in self.param_groups: 26 | # for p in group['params']: 27 | # state = self.state[p] 28 | # state['step'] = 0 29 | # state['exp_avg'] = torch.zeros_like(p.data) 30 | # state['exp_avg_sq'] = torch.zeros_like(p.data) 31 | # 32 | # # share in memory 33 | # state['exp_avg'].share_memory_() 34 | # state['exp_avg_sq'].share_memory_() 35 | -------------------------------------------------------------------------------- /parallel/2-A3C/train.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | 4 | from model import Model 5 | from worker import Worker 6 | from shared_adam import SharedAdam 7 | from tensorboardX import SummaryWriter 8 | import torch.multiprocessing as mp 9 | 
10 | from config import env_name, lr 11 | 12 | def main(): 13 | env = gym.make(env_name) 14 | env.seed(500) 15 | torch.manual_seed(500) 16 | 17 | num_inputs = env.observation_space.shape[0] 18 | num_actions = env.action_space.n 19 | global_model = Model(num_inputs, num_actions) 20 | global_model.share_memory() 21 | global_optimizer = SharedAdam(global_model.parameters(), lr=lr) 22 | global_ep, global_ep_r, res_queue = mp.Value('i', 0), mp.Value('d', 0.), mp.Queue() 23 | 24 | writer = SummaryWriter('logs') 25 | 26 | workers = [Worker(global_model, global_optimizer, global_ep, global_ep_r, res_queue, i) for i in range(mp.cpu_count())] 27 | [w.start() for w in workers] 28 | res = [] 29 | while True: 30 | r = res_queue.get() 31 | if r is not None: 32 | res.append(r) 33 | [ep, ep_r, loss] = r 34 | writer.add_scalar('log/score', float(ep_r), ep) 35 | writer.add_scalar('log/loss', float(loss), ep) 36 | else: 37 | break 38 | [w.join() for w in workers] 39 | 40 | if __name__=="__main__": 41 | main() 42 | -------------------------------------------------------------------------------- /parallel/2-A3C/worker.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.multiprocessing as mp 4 | import numpy as np 5 | from model import LocalModel 6 | from memory import Memory 7 | from config import env_name, n_step, max_episode, log_interval 8 | 9 | class Worker(mp.Process): 10 | def __init__(self, global_model, global_optimizer, global_ep, global_ep_r, res_queue, name): 11 | super(Worker, self).__init__() 12 | 13 | self.env = gym.make(env_name) 14 | self.env.seed(500) 15 | 16 | self.name = 'w%i' % name 17 | self.global_ep, self.global_ep_r, self.res_queue = global_ep, global_ep_r, res_queue 18 | self.global_model, self.global_optimizer = global_model, global_optimizer 19 | self.local_model = LocalModel(self.env.observation_space.shape[0], self.env.action_space.n) 20 | self.num_actions = self.env.action_space.n 21 | 22 | def record(self, score, loss): 23 | with self.global_ep.get_lock(): 24 | self.global_ep.value += 1 25 | with self.global_ep_r.get_lock(): 26 | if self.global_ep_r.value == 0.: 27 | self.global_ep_r.value = score 28 | else: 29 | self.global_ep_r.value = 0.99 * self.global_ep_r.value + 0.01 * score 30 | if self.global_ep.value % log_interval == 0: 31 | print('{} , {} episode | score: {:.2f}'.format( 32 | self.name, self.global_ep.value, self.global_ep_r.value)) 33 | 34 | self.res_queue.put([self.global_ep.value, self.global_ep_r.value, loss]) 35 | 36 | def get_action(self, policy, num_actions): 37 | policy = policy.data.numpy()[0] 38 | action = np.random.choice(num_actions, 1, p=policy)[0] 39 | return action 40 | 41 | def run(self): 42 | 43 | while self.global_ep.value < max_episode: 44 | self.local_model.pull_from_global_model(self.global_model) 45 | done = False 46 | score = 0 47 | steps = 0 48 | 49 | state = self.env.reset() 50 | state = torch.Tensor(state) 51 | state = state.unsqueeze(0) 52 | memory = Memory(n_step) 53 | 54 | while True: 55 | policy, value = self.local_model(state) 56 | action = self.get_action(policy, self.num_actions) 57 | 58 | next_state, reward, done, _ = self.env.step(action) 59 | next_state = torch.Tensor(next_state) 60 | next_state = next_state.unsqueeze(0) 61 | 62 | mask = 0 if done else 1 63 | reward = reward if not done or score == 499 else -1 64 | action_one_hot = torch.zeros(2) 65 | action_one_hot[action] = 1 66 | memory.push(state, next_state, action_one_hot, reward, mask) 67 | 68 | 
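                # Every transition goes into the local n-step buffer. Once n_step
                # transitions have accumulated (or the episode ends), the worker
                # computes gradients from this rollout via push_to_global_model,
                # applies them to the global network through the shared optimizer,
                # pulls the fresh global weights back, and starts an empty buffer.
                # Reward shaping: the step reward is replaced with -1 when the
                # episode terminates before reaching the 500-step cap.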
score += reward 69 | state = next_state 70 | 71 | if len(memory) == n_step or done: 72 | batch = memory.sample() 73 | loss = self.local_model.push_to_global_model(batch, self.global_model, self.global_optimizer) 74 | self.local_model.pull_from_global_model(self.global_model) 75 | memory = Memory(n_step) 76 | 77 | if done: 78 | running_score = self.record(score, loss) 79 | break 80 | 81 | 82 | self.res_queue.put(None) 83 | -------------------------------------------------------------------------------- /parallel/3-ACER/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | lr = 0.001 6 | goal_score = 200 7 | log_interval = 10 8 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 9 | max_episode = 30000 10 | 11 | 12 | replay_memory_capacity = 1000 13 | truncation_clip = 10 14 | delta = 1 15 | trust_region_decay = 0.99 16 | replay_ratio = 4 17 | max_gradient_norm = 40 18 | -------------------------------------------------------------------------------- /parallel/3-ACER/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple, deque 3 | 4 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask', 'policy')) 5 | 6 | 7 | class Memory(object): 8 | def __init__(self, capacity): 9 | self.memory = deque(maxlen=capacity) 10 | self.capacity = capacity 11 | 12 | def push(self, trajectory): 13 | self.memory.append(trajectory.trajectory) 14 | 15 | def sample(self): 16 | trajectory = self.memory[random.randrange(len(self.memory))] 17 | return Transition(*zip(*trajectory)) 18 | 19 | def __len__(self): 20 | return len(self.memory) 21 | 22 | class Trajectory(object): 23 | def __init__(self): 24 | self.trajectory = [] 25 | 26 | def push(self, state, next_state, action, reward, mask, policy): 27 | self.trajectory.append(Transition(state, next_state, action, reward, mask, policy)) 28 | 29 | def sample(self): 30 | trajectory = self.trajectory 31 | return Transition(*zip(*trajectory)) 32 | 33 | def __len__(self): 34 | return len(self.trajectory) 35 | -------------------------------------------------------------------------------- /parallel/3-ACER/shared_adam.py: -------------------------------------------------------------------------------- 1 | import torch 2 | class SharedAdam(torch.optim.Adam): # extend a pytorch optimizer so it shares grads across processes 3 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): 4 | super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay) 5 | for group in self.param_groups: 6 | for p in group['params']: 7 | state = self.state[p] 8 | state['shared_steps'], state['step'] = torch.zeros(1).share_memory_(), 0 9 | state['exp_avg'] = p.data.new().resize_as_(p.data).zero_().share_memory_() 10 | state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_().share_memory_() 11 | 12 | def step(self, closure=None): 13 | for group in self.param_groups: 14 | for p in group['params']: 15 | if p.grad is None: continue 16 | self.state[p]['shared_steps'] += 1 17 | self.state[p]['step'] = self.state[p]['shared_steps'][0] - 1 # a "step += 1" comes later 18 | super.step(closure) 19 | 20 | # class SharedAdam(torch.optim.Adam): 21 | # def __init__(self, params, lr=1e-3, betas=(0.9, 0.9), eps=1e-8, 22 | # weight_decay=0): 23 | # super(SharedAdam, self).__init__(params, lr=lr, betas=betas, eps=eps, 
weight_decay=weight_decay) 24 | # # State initialization 25 | # for group in self.param_groups: 26 | # for p in group['params']: 27 | # state = self.state[p] 28 | # state['step'] = 0 29 | # state['exp_avg'] = torch.zeros_like(p.data) 30 | # state['exp_avg_sq'] = torch.zeros_like(p.data) 31 | # 32 | # # share in memory 33 | # state['exp_avg'].share_memory_() 34 | # state['exp_avg_sq'].share_memory_() 35 | -------------------------------------------------------------------------------- /parallel/3-ACER/train.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | 4 | from model import Model 5 | from worker import Worker 6 | from shared_adam import SharedAdam 7 | from tensorboardX import SummaryWriter 8 | import torch.multiprocessing as mp 9 | 10 | from config import env_name, lr 11 | 12 | def main(): 13 | env = gym.make(env_name) 14 | env.seed(500) 15 | torch.manual_seed(500) 16 | 17 | num_inputs = env.observation_space.shape[0] 18 | num_actions = env.action_space.n 19 | env.close() 20 | 21 | global_model = Model(num_inputs, num_actions) 22 | global_average_model = Model(num_inputs, num_actions) 23 | global_model.share_memory() 24 | global_average_model.share_memory() 25 | global_optimizer = SharedAdam(global_model.parameters(), lr=lr) 26 | global_ep, global_ep_r, res_queue = mp.Value('i', 0), mp.Value('d', 0.), mp.Queue() 27 | 28 | writer = SummaryWriter('logs') 29 | 30 | n = mp.cpu_count() 31 | workers = [Worker(global_model, global_average_model, global_optimizer, global_ep, global_ep_r, res_queue, i) for i in range(n)] 32 | [w.start() for w in workers] 33 | res = [] 34 | while True: 35 | r = res_queue.get() 36 | if r is not None: 37 | res.append(r) 38 | [ep, ep_r, loss] = r 39 | writer.add_scalar('log/score', float(ep_r), ep) 40 | writer.add_scalar('log/loss', float(loss), ep) 41 | else: 42 | break 43 | [w.join() for w in workers] 44 | 45 | if __name__=="__main__": 46 | main() 47 | -------------------------------------------------------------------------------- /parallel/3-ACER/worker.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.multiprocessing as mp 4 | import numpy as np 5 | from model import LocalModel 6 | from memory import Memory, Trajectory 7 | from config import env_name, max_episode, log_interval, replay_memory_capacity, replay_ratio 8 | 9 | class Worker(mp.Process): 10 | def __init__(self, global_model, global_average_model, global_optimizer, global_ep, global_ep_r, res_queue, name): 11 | super(Worker, self).__init__() 12 | 13 | self.env = gym.make(env_name) 14 | self.env.seed(500) 15 | 16 | self.name = 'w%i' % name 17 | self.global_ep, self.global_ep_r, self.res_queue = global_ep, global_ep_r, res_queue 18 | self.global_model, self.global_average_model, self.global_optimizer = global_model, global_average_model, global_optimizer 19 | self.local_model = LocalModel(self.env.observation_space.shape[0], self.env.action_space.n) 20 | self.num_actions = self.env.action_space.n 21 | 22 | self.memory = Memory(replay_memory_capacity) 23 | 24 | def record(self, score, loss): 25 | with self.global_ep.get_lock(): 26 | self.global_ep.value += 1 27 | with self.global_ep_r.get_lock(): 28 | if self.global_ep_r.value == 0.: 29 | self.global_ep_r.value = score 30 | else: 31 | self.global_ep_r.value = 0.99 * self.global_ep_r.value + 0.01 * score 32 | if self.global_ep.value % log_interval == 0: 33 | print('{} , {} episode | score: 
{:.2f}'.format( 34 | self.name, self.global_ep.value, self.global_ep_r.value)) 35 | 36 | self.res_queue.put([self.global_ep.value, self.global_ep_r.value, loss]) 37 | 38 | def run(self): 39 | while self.global_ep.value < max_episode: 40 | self.algorithm(True) 41 | n = np.random.poisson(replay_ratio) 42 | for _ in range(n): 43 | self.algorithm(False) 44 | 45 | def algorithm(self, on_policy): 46 | self.local_model.pull_from_global_model(self.global_model) 47 | if not on_policy and len(self.memory) > 100: 48 | trajectory = self.memory.sample() 49 | else: 50 | trajectory, score = self.run_env() 51 | loss = self.local_model.train(on_policy, trajectory, self.global_average_model, self.global_optimizer, self.global_model, self.global_average_model) 52 | if on_policy: 53 | self.record(score, loss) 54 | 55 | 56 | def run_env(self): 57 | done = False 58 | score = 0 59 | steps = 0 60 | 61 | state = self.env.reset() 62 | state = torch.Tensor(state) 63 | state = state.unsqueeze(0) 64 | trajectory = Trajectory() 65 | 66 | while True: 67 | action, policy = self.local_model.get_action(state) 68 | policy = torch.Tensor(policy) 69 | 70 | next_state, reward, done, _ = self.env.step(action) 71 | next_state = torch.Tensor(next_state) 72 | next_state = next_state.unsqueeze(0) 73 | 74 | mask = 0 if done else 1 75 | reward = reward if not done or score == 499 else -1 76 | trajectory.push(state, next_state, action, reward, mask, policy) 77 | 78 | score += reward 79 | state = next_state 80 | 81 | if done: 82 | break 83 | 84 | self.memory.push(trajectory) 85 | trajectory = trajectory.sample() 86 | return trajectory, score 87 | -------------------------------------------------------------------------------- /parallel/5-ApeX/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | lr = 0.002 6 | goal_score = 200 7 | log_interval = 10 8 | max_episode = 30000 9 | 10 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 11 | 12 | 13 | 14 | replay_memory_capacity = 10000 15 | n_step = 3 16 | local_mini_batch = 32 17 | batch_size = 32 18 | alpha = 0.5 19 | beta = 0.4 -------------------------------------------------------------------------------- /parallel/5-ApeX/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import torch 4 | from collections import namedtuple, deque 5 | 6 | from config import gamma, batch_size, alpha, beta 7 | 8 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask', 'step')) 9 | 10 | class N_Step_Buffer(object): 11 | def __init__(self): 12 | self.memory = [] 13 | self.step = 0 14 | 15 | def push(self, state, next_state, action, reward, mask): 16 | self.step += 1 17 | self.memory.append([state, next_state, action, reward, mask]) 18 | 19 | def sample(self): 20 | [state, _, action, _, _] = self.memory[0] 21 | [_, next_state, _, _, mask] = self.memory[-1] 22 | 23 | sum_reward = 0 24 | for t in reversed(range(len(self.memory))): 25 | [_, _, _, reward, _] = self.memory[t] 26 | sum_reward += reward + gamma * sum_reward 27 | reward = sum_reward 28 | step = self.step 29 | self.reset() 30 | 31 | return [state, next_state, action, reward, mask, step] 32 | 33 | def reset(self): 34 | self.memory = [] 35 | self.step = 0 36 | 37 | def __len__(self): 38 | return len(self.memory) 39 | 40 | 41 | class LocalBuffer(object): 42 | def __init__(self): 43 | self.memory = [] 44 | 45 
| def push(self, state, next_state, action, reward, mask, step): 46 | self.memory.append(Transition(state, next_state, action, reward, mask, step)) 47 | 48 | def sample(self): 49 | transitions = self.memory 50 | batch = Transition(*zip(*transitions)) 51 | return batch 52 | 53 | def reset(self): 54 | self.memory = [] 55 | 56 | def __len__(self): 57 | return len(self.memory) 58 | 59 | class Memory(object): 60 | def __init__(self, capacity): 61 | self.capacity = capacity 62 | self.memory = deque(maxlen=capacity) 63 | self.memory_probability = deque(maxlen=capacity) 64 | 65 | def push(self, state, next_state, action, reward, mask, step, prior): 66 | self.memory.append(Transition(state, next_state, action, reward, mask, step)) 67 | self.memory_probability.append(prior) 68 | 69 | def sample(self): 70 | probaility = torch.Tensor(self.memory_probability) 71 | probaility = probaility.pow(alpha) 72 | probaility = probaility / probaility.sum() 73 | 74 | p = probaility.numpy() 75 | 76 | indexes = np.random.choice(range(len(self.memory_probability)), batch_size, p=p) 77 | 78 | transitions = [self.memory[idx] for idx in indexes] 79 | transitions_p = torch.Tensor([self.memory_probability[idx] for idx in indexes]) 80 | 81 | batch = Transition(*zip(*transitions)) 82 | 83 | weights = (self.capacity * transitions_p).pow(-beta) 84 | weights = weights / weights.max() 85 | 86 | return indexes, batch, weights 87 | 88 | def update_prior(self, indexes, priors): 89 | priors_idx = 0 90 | for idx in indexes: 91 | self.memory_probability[idx] = priors[priors_idx] 92 | priors_idx += 1 93 | 94 | def __len__(self): 95 | return len(self.memory) 96 | 97 | 98 | -------------------------------------------------------------------------------- /parallel/5-ApeX/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | class Model(nn.Module): 7 | def __init__(self, num_inputs, num_outputs): 8 | super(Model, self).__init__() 9 | self.num_inputs = num_inputs 10 | self.num_outputs = num_outputs 11 | 12 | self.fc = nn.Linear(num_inputs, 128) 13 | self.fc_adv = nn.Linear(128, num_outputs) 14 | self.fc_val = nn.Linear(128, 1) 15 | 16 | for m in self.modules(): 17 | if isinstance(m, nn.Linear): 18 | nn.init.xavier_uniform(m.weight) 19 | 20 | def forward(self, x): 21 | x = F.relu(self.fc(x)) 22 | adv = self.fc_adv(x) 23 | adv = adv.view(-1, self.num_outputs) 24 | val = self.fc_val(x) 25 | val = val.view(-1, 1) 26 | 27 | qvalue = val + (adv - adv.mean(dim=1, keepdim=True)) 28 | return qvalue 29 | 30 | class LocalModel(Model): 31 | def __init__(self, num_inputs, num_outputs): 32 | super(LocalModel, self).__init__(num_inputs, num_outputs) 33 | 34 | def pull_from_global_model(self, global_model): 35 | self.load_state_dict(global_model.state_dict()) 36 | 37 | def get_action(self, input): 38 | qvalue = self.forward(input) 39 | _, action = torch.max(qvalue, 1) 40 | return action.numpy()[0] -------------------------------------------------------------------------------- /parallel/5-ApeX/train.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | 4 | from model import Model 5 | from worker import Actor, Learner 6 | import torch.multiprocessing as mp 7 | from tensorboardX import SummaryWriter 8 | 9 | from memory import Memory 10 | from config import env_name, lr, replay_memory_capacity 11 | 12 | def main(): 13 | env = gym.make(env_name) 14 
| env.seed(500) 15 | torch.manual_seed(500) 16 | 17 | num_inputs = env.observation_space.shape[0] 18 | num_actions = env.action_space.n 19 | env.close() 20 | 21 | global_target_model = Model(num_inputs, num_actions) 22 | global_online_model = Model(num_inputs, num_actions) 23 | global_target_model.train() 24 | global_online_model.train() 25 | 26 | global_target_model.load_state_dict(global_online_model.state_dict()) 27 | global_target_model.share_memory() 28 | global_online_model.share_memory() 29 | 30 | global_memory = Memory(replay_memory_capacity) 31 | 32 | 33 | global_ep, global_ep_r, res_queue, global_memory_pipe = mp.Value('i', 0), mp.Value('d', 0.), mp.Queue(), mp.Queue() 34 | 35 | writer = SummaryWriter('logs') 36 | 37 | n = 2 38 | epsilons = [(i * 0.05 + 0.1) for i in range(n)] 39 | 40 | actors = [Actor(global_target_model, global_memory_pipe, global_ep, global_ep_r, epsilons[i], i) for i in range(n)] 41 | [w.start() for w in actors] 42 | learner = Learner(global_online_model, global_target_model, global_memory, global_memory_pipe, res_queue) 43 | learner.start() 44 | 45 | res = [] 46 | while True: 47 | r = res_queue.get() 48 | if r is not None: 49 | res.append(r) 50 | [ep, loss] = r 51 | # writer.add_scalar('log/score', float(ep_r), ep) 52 | writer.add_scalar('log/loss', float(loss), ep) 53 | else: 54 | break 55 | [w.join() for w in actors] 56 | 57 | if __name__=="__main__": 58 | main() 59 | -------------------------------------------------------------------------------- /rainbow/1-dqn/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | batch_size = 32 6 | lr = 0.001 7 | initial_exploration = 1000 8 | goal_score = 200 9 | log_interval = 10 10 | update_target = 100 11 | replay_memory_capacity = 1000 12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | -------------------------------------------------------------------------------- /rainbow/1-dqn/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple, deque 3 | 4 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 5 | 6 | 7 | class Memory(object): 8 | def __init__(self, capacity): 9 | self.memory = deque(maxlen=capacity) 10 | self.capacity = capacity 11 | 12 | def push(self, state, next_state, action, reward, mask): 13 | self.memory.append(Transition(state, next_state, action, reward, mask)) 14 | 15 | def sample(self, batch_size): 16 | transitions = random.sample(self.memory, batch_size) 17 | batch = Transition(*zip(*transitions)) 18 | return batch 19 | 20 | def __len__(self): 21 | return len(self.memory) 22 | -------------------------------------------------------------------------------- /rainbow/1-dqn/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from config import gamma 6 | class QNet(nn.Module): 7 | def __init__(self, num_inputs, num_outputs): 8 | super(QNet, self).__init__() 9 | self.num_inputs = num_inputs 10 | self.num_outputs = num_outputs 11 | 12 | self.fc1 = nn.Linear(num_inputs, 128) 13 | self.fc2 = nn.Linear(128, num_outputs) 14 | 15 | for m in self.modules(): 16 | if isinstance(m, nn.Linear): 17 | nn.init.xavier_uniform(m.weight) 18 | 19 | def forward(self, x): 20 | x = F.relu(self.fc1(x)) 21 | qvalue = self.fc2(x) 22 | 
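        # qvalue holds one Q estimate per action. train_model below regresses
        # Q(s, a) for the taken action onto the TD target
        # r + mask * gamma * max_a' Q_target(s', a') with an MSE loss.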
return qvalue 23 | 24 | @classmethod 25 | def train_model(cls, online_net, target_net, optimizer, batch): 26 | states = torch.stack(batch.state) 27 | next_states = torch.stack(batch.next_state) 28 | actions = torch.Tensor(batch.action).float() 29 | rewards = torch.Tensor(batch.reward) 30 | masks = torch.Tensor(batch.mask) 31 | 32 | pred = online_net(states).squeeze(1) 33 | next_pred = target_net(next_states).squeeze(1) 34 | 35 | pred = torch.sum(pred.mul(actions), dim=1) 36 | 37 | target = rewards + masks * gamma * next_pred.max(1)[0] 38 | 39 | loss = F.mse_loss(pred, target.detach()) 40 | optimizer.zero_grad() 41 | loss.backward() 42 | optimizer.step() 43 | 44 | return loss 45 | 46 | def get_action(self, input): 47 | qvalue = self.forward(input) 48 | _, action = torch.max(qvalue, 1) 49 | return action.numpy()[0] 50 | -------------------------------------------------------------------------------- /rainbow/1-dqn/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import QNet 11 | from memory import Memory 12 | from tensorboardX import SummaryWriter 13 | 14 | from config import env_name, initial_exploration, batch_size, update_target, goal_score, log_interval, device, replay_memory_capacity, lr 15 | 16 | 17 | def get_action(state, target_net, epsilon, env): 18 | if np.random.rand() <= epsilon: 19 | return env.action_space.sample() 20 | else: 21 | return target_net.get_action(state) 22 | 23 | def update_target_model(online_net, target_net): 24 | # Target <- Net 25 | target_net.load_state_dict(online_net.state_dict()) 26 | 27 | 28 | def main(): 29 | env = gym.make(env_name) 30 | env.seed(500) 31 | torch.manual_seed(500) 32 | 33 | num_inputs = env.observation_space.shape[0] 34 | num_actions = env.action_space.n 35 | print('state size:', num_inputs) 36 | print('action size:', num_actions) 37 | 38 | online_net = QNet(num_inputs, num_actions) 39 | target_net = QNet(num_inputs, num_actions) 40 | update_target_model(online_net, target_net) 41 | 42 | optimizer = optim.Adam(online_net.parameters(), lr=lr) 43 | writer = SummaryWriter('logs') 44 | 45 | online_net.to(device) 46 | target_net.to(device) 47 | online_net.train() 48 | target_net.train() 49 | memory = Memory(replay_memory_capacity) 50 | running_score = 0 51 | epsilon = 1.0 52 | steps = 0 53 | loss = 0 54 | 55 | for e in range(3000): 56 | done = False 57 | 58 | score = 0 59 | state = env.reset() 60 | state = torch.Tensor(state).to(device) 61 | state = state.unsqueeze(0) 62 | 63 | while not done: 64 | steps += 1 65 | 66 | action = get_action(state, target_net, epsilon, env) 67 | next_state, reward, done, _ = env.step(action) 68 | 69 | next_state = torch.Tensor(next_state) 70 | next_state = next_state.unsqueeze(0) 71 | 72 | mask = 0 if done else 1 73 | reward = reward if not done or score == 499 else -1 74 | action_one_hot = np.zeros(2) 75 | action_one_hot[action] = 1 76 | memory.push(state, next_state, action_one_hot, reward, mask) 77 | 78 | score += reward 79 | state = next_state 80 | 81 | if steps > initial_exploration: 82 | epsilon -= 0.00005 83 | epsilon = max(epsilon, 0.1) 84 | 85 | batch = memory.sample(batch_size) 86 | loss = QNet.train_model(online_net, target_net, optimizer, batch) 87 | 88 | if steps % update_target == 0: 89 | update_target_model(online_net, target_net) 90 | 91 | score = score if score == 500.0 
else score + 1 92 | running_score = 0.99 * running_score + 0.01 * score 93 | if e % log_interval == 0: 94 | print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format( 95 | e, running_score, epsilon)) 96 | writer.add_scalar('log/score', float(running_score), e) 97 | writer.add_scalar('log/loss', float(loss), e) 98 | 99 | if running_score > goal_score: 100 | break 101 | 102 | 103 | if __name__=="__main__": 104 | main() 105 | -------------------------------------------------------------------------------- /rainbow/2-DoubleDQN/README-KR.md: -------------------------------------------------------------------------------- 1 | # Double DQN 2 | 3 | Last Edited: Nov 19, 2018 6:06 PM 4 | Tags: RL 5 | 6 | ## 논문 7 | 8 | double: [https://arxiv.org/pdf/1509.06461.pdf](https://arxiv.org/pdf/1509.06461.pdf) 9 | 10 | duel: [https://arxiv.org/pdf/1511.06581.pdf](https://arxiv.org/pdf/1511.06581.pdf) 11 | 12 | ## Double 13 | 14 | 그냥 DQN 식 15 | 16 | $$loss = (Q(s,a) - r + \gamma Q'(s, argmax_{a'}Q'(s,a'))^2$$ 17 | 18 | Double DQN 식 19 | 20 | $$loss = (Q(s,a) - r + \gamma Q'(s, argma_{a'}Q(s,a'))^2$$ 21 | 22 | Action 선택을 `target_net` 으로 하는지 `main_net` 으로 하는지의 차이만 있을 뿐이다. 23 | 24 | DQN에서는 단순하게 `target_net`으로 Action을 선택했는데 이 경우에는 만약 `target_net`이 가장 큰 `qvalue`을 가지고 있는 `action`을 선택하면 그 `action`이 다시 `Q-value`을 증가 시키고 다시 그 `action`이 선택 되는 순환이 발생 할 수 있기 때문에 `action` 을 선택하는 `net` 과 `value` 을 평가하는 `net` 을 분리시킨다. 25 | 26 | ## 구현 27 | 28 | ```python 29 | def train_model(cls, net, target_net, optimizer, batch, batch_size): 30 | states = torch.stack(batch.state) 31 | next_states = torch.stack(batch.next_state) 32 | actions = torch.Tensor(batch.action).float() 33 | rewards = torch.Tensor(batch.reward) 34 | masks = torch.Tensor(batch.mask) 35 | 36 | pred = net(states).squeeze(1) 37 | _, action_from_net = net(next_states).squeeze(1).max(1) 38 | next_pred = target_net(next_states).squeeze(1) 39 | 40 | pred = torch.sum(pred.mul(actions), dim=1) 41 | 42 | target = rewards + masks * gamma * next_pred.gather(1, action_from_net.unsqueeze(1)).squeeze(1) 43 | ``` 44 | 45 | 46 | ​ 47 | ```python 48 | loss = F.mse_loss(pred, target.detach()) 49 | optimizer.zero_grad() 50 | loss.backward() 51 | optimizer.step() 52 | ``` 53 | 54 | 일단 `action` 을 `net` 의 `max` 로 구한다. 그 뒤 `target_net` 에서 그 `action` 값에 맞는 `Q-value`을 사용한다. 
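
The difference is easiest to see with the two targets written side by side. The sketch below is illustrative only; `online_net`, `target_net`, `next_states` (assumed to be a plain `[batch, num_inputs]` tensor), `rewards`, `masks` and `gamma` are taken to exist as in `train_model` above.

```python
import torch

def dqn_and_double_dqn_targets(online_net, target_net, next_states, rewards, masks, gamma):
    with torch.no_grad():
        # DQN: the target network both selects and evaluates the next action.
        dqn_target = rewards + masks * gamma * target_net(next_states).max(1)[0]

        # Double DQN: the online network selects the action,
        # the target network only evaluates it.
        best_actions = online_net(next_states).argmax(dim=1, keepdim=True)
        double_dqn_target = rewards + masks * gamma * \
            target_net(next_states).gather(1, best_actions).squeeze(1)
    return dqn_target, double_dqn_target
```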
-------------------------------------------------------------------------------- /rainbow/2-DoubleDQN/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | batch_size = 32 6 | lr = 0.001 7 | initial_exploration = 1000 8 | goal_score = 200 9 | log_interval = 10 10 | update_target = 100 11 | replay_memory_capacity = 1000 12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | -------------------------------------------------------------------------------- /rainbow/2-DoubleDQN/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple 3 | 4 | 5 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 6 | 7 | 8 | class Memory(object): 9 | def __init__(self, capacity): 10 | self.memory = [] 11 | self.capacity = capacity 12 | self.position = 0 13 | 14 | def push(self, state, next_state, action, reward, mask): 15 | """Saves a transition.""" 16 | if len(self.memory) < self.capacity: 17 | self.memory.append(Transition(state, next_state, action, reward, mask)) 18 | self.memory[self.position] = Transition(state, next_state, action, reward, mask) 19 | self.position = (self.position + 1) % self.capacity 20 | 21 | def sample(self, batch_size): 22 | transitions = random.sample(self.memory, batch_size) 23 | batch = Transition(*zip(*transitions)) 24 | return batch 25 | 26 | def __len__(self): 27 | return len(self.memory) 28 | -------------------------------------------------------------------------------- /rainbow/2-DoubleDQN/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from config import gamma 5 | 6 | class DoubleDQNet(nn.Module): 7 | def __init__(self, num_inputs, num_outputs): 8 | super(DoubleDQNet, self).__init__() 9 | self.num_inputs = num_inputs 10 | self.num_outputs = num_outputs 11 | 12 | self.fc1 = nn.Linear(num_inputs, 128) 13 | self.fc2 = nn.Linear(128, num_outputs) 14 | 15 | for m in self.modules(): 16 | if isinstance(m, nn.Linear): 17 | nn.init.xavier_uniform(m.weight) 18 | 19 | def forward(self, x): 20 | x = F.relu(self.fc1(x)) 21 | qvalue = self.fc2(x) 22 | return qvalue 23 | 24 | @classmethod 25 | def train_model(cls, online_net, target_net, optimizer, batch): 26 | states = torch.stack(batch.state) 27 | next_states = torch.stack(batch.next_state) 28 | actions = torch.Tensor(batch.action).float() 29 | rewards = torch.Tensor(batch.reward) 30 | masks = torch.Tensor(batch.mask) 31 | 32 | pred = online_net(states).squeeze(1) 33 | _, action_from_online_net = online_net(next_states).squeeze(1).max(1) 34 | next_pred = target_net(next_states).squeeze(1) 35 | 36 | pred = torch.sum(pred.mul(actions), dim=1) 37 | 38 | target = rewards + masks * gamma * next_pred.gather(1, action_from_online_net.unsqueeze(1)).squeeze(1) 39 | 40 | 41 | loss = F.mse_loss(pred, target.detach()) 42 | optimizer.zero_grad() 43 | loss.backward() 44 | optimizer.step() 45 | 46 | return loss 47 | 48 | def get_action(self, input): 49 | qvalue = self.forward(input) 50 | _, action = torch.max(qvalue, 1) 51 | return action.numpy()[0] 52 | -------------------------------------------------------------------------------- /rainbow/2-DoubleDQN/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import 
sys 3 | import gym 4 | import random 5 | import argparse 6 | import numpy as np 7 | 8 | import torch 9 | import torch.optim as optim 10 | import torch.nn.functional as F 11 | from model import DoubleDQNet 12 | from memory import Memory 13 | from tensorboardX import SummaryWriter 14 | from config import env_name, gamma, initial_exploration, batch_size, update_target, log_interval, goal_score, device, replay_memory_capacity, lr 15 | 16 | 17 | def get_action(state, target_net, epsilon, env): 18 | if np.random.rand() <= epsilon: 19 | return env.action_space.sample() 20 | else: 21 | return target_net.get_action(state) 22 | 23 | 24 | def update_target_model(online_net, target_net): 25 | # Target -> Net 26 | target_net.load_state_dict(online_net.state_dict()) 27 | 28 | 29 | def main(): 30 | env = gym.make(env_name) 31 | env.seed(500) 32 | torch.manual_seed(500) 33 | 34 | num_inputs = env.observation_space.shape[0] 35 | num_actions = env.action_space.n 36 | print('state size:', num_inputs) 37 | print('action size:', num_actions) 38 | 39 | online_net = DoubleDQNet(num_inputs, num_actions) 40 | target_net = DoubleDQNet(num_inputs, num_actions) 41 | update_target_model(online_net, target_net) 42 | 43 | optimizer = optim.Adam(online_net.parameters(), lr=lr) 44 | writer = SummaryWriter('logs') 45 | 46 | online_net.to(device) 47 | target_net.to(device) 48 | online_net.train() 49 | target_net.train() 50 | memory = Memory(replay_memory_capacity) 51 | running_score = 0 52 | epsilon = 1.0 53 | steps = 0 54 | loss = 0 55 | 56 | for e in range(3000): 57 | done = False 58 | 59 | score = 0 60 | state = env.reset() 61 | state = torch.Tensor(state).to(device) 62 | state = state.unsqueeze(0) 63 | 64 | while not done: 65 | steps += 1 66 | action = get_action(state, target_net, epsilon, env) 67 | next_state, reward, done, _ = env.step(action) 68 | 69 | next_state = torch.Tensor(next_state) 70 | next_state = next_state.unsqueeze(0) 71 | 72 | mask = 0 if done else 1 73 | reward = reward if not done or score == 499 else -1 74 | action_one_hot = np.zeros(2) 75 | action_one_hot[action] = 1 76 | memory.push(state, next_state, action_one_hot, reward, mask) 77 | 78 | score += reward 79 | state = next_state 80 | 81 | if steps > initial_exploration: 82 | epsilon -= 0.00005 83 | epsilon = max(epsilon, 0.1) 84 | 85 | batch = memory.sample(batch_size) 86 | loss = DoubleDQNet.train_model(online_net, target_net, optimizer, batch) 87 | 88 | if steps % update_target == 0: 89 | update_target_model(online_net, target_net) 90 | 91 | score = score if score == 500.0 else score + 1 92 | running_score = 0.99 * running_score + 0.01 * score 93 | if e % log_interval == 0: 94 | print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format( 95 | e, running_score, epsilon)) 96 | writer.add_scalar('log/score', float(running_score), e) 97 | writer.add_scalar('log/loss', float(loss), e) 98 | 99 | if running_score > goal_score: 100 | break 101 | 102 | 103 | if __name__=="__main__": 104 | main() 105 | -------------------------------------------------------------------------------- /rainbow/3-DuelDQN/README-KR.md: -------------------------------------------------------------------------------- 1 | # Duel DQN 2 | 3 | Last Edited: Nov 19, 2018 6:06 PM 4 | Tags: RL 5 | 6 | ## Duel 7 | 8 | $$Q(s,a) = V(s) + A(s,a)$$ 9 | 10 | `Q` 함수를 `V`와 `A`로 분리한다. `A`는 즉 현재의 상태에 대한 기댓값과 행동에 대한 기댓값을 분리한다. 이로써 어떠한 행동에 대한 기댓값을 좀더 잘 추측 할 수 있다. 
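
This decomposition by itself is not identifiable: adding any constant to `V` and subtracting it from `A` yields the same `Q`, which is why the correction terms discussed below are needed.

$$Q(s,a) = V(s) + A(s,a) = (V(s) + c) + (A(s,a) - c)$$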
11 | 12 | ![](Screenshot2018-11-1519-f88e4bf3-f581-4b24-a07e-af467a4bba64.14.23.png) 13 | 14 | 구하는 방법은 단순하게 layer 을 하나 더 만들어서 `A`을 output으로 하게 하면된다. 15 | 16 | 하지만 단순하게 위의 식대로 할 경우 논문에서는 하나의 `Q` 에 대해서 `V`와 `S`가 unique 하지 않게 되고 그것 때문에 성능이 안 좋아진다고 한다. 그래서 17 | 18 | $$Q(s,a) = V(s) + (A(s,a) - max_{a'}A(s,a'))$$ 19 | 20 | 로 식을 바꾼다. 이로써 Q 가 수렴 되었을 때는 항상 가장 좋은 액션만 선택한다고 되었을 때이고 뒤의 `A` 항이 전부다 0이 되기 때문에 하나의 Q 값에 대해서 V와 A에 대한 항 부분을 unique 하게 찾을 수 있다. 21 | 22 | $$Q(s,a) = V(s) + (A(s,a) - Avg(A))$$ 23 | 24 | 그리고 다시 식을 위처럼 바꾸는데, 위의 식처럼 할 경우 `A`항의 부분이 0이 되는게 더 빨리 수렴이 되기 때문에 더 빠르게 학습이 된다고 한다. 위의 경우는 max 로 수렴, 밑의 부분은 평균으로 수렴. 25 | 26 | ```python 27 | class DuelDQNet(nn.Module): 28 | def __init__(self, num_inputs, num_outputs): 29 | super(DuelDQNet, self).__init__() 30 | self.num_inputs = num_inputs 31 | self.num_outputs = num_outputs 32 | 33 | self.fc = nn.Linear(num_inputs, 128) 34 | self.fc_adv = nn.Linear(128, num_outputs) 35 | self.fc_val = nn.Linear(128, 1) 36 | 37 | for m in self.modules(): 38 | if isinstance(m, nn.Linear): 39 | nn.init.xavier_uniform(m.weight) 40 | 41 | def forward(self, x): 42 | x = F.relu(self.fc(x)) 43 | adv = self.fc_adv(x) 44 | val = self.fc_val(x) 45 | 46 | qvalue = val + (adv - adv.mean()) 47 | return qvalue 48 | ``` 49 | 50 | 실제 구현에서는 이렇게 간단하게 `adv`와 `val` 을 나눠서 마지막에 `qvalue`을 만들어 주는 것만으로 구현이 가능하다. -------------------------------------------------------------------------------- /rainbow/3-DuelDQN/Screenshot2018-11-1519-f88e4bf3-f581-4b24-a07e-af467a4bba64.14.23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choru-k/Reinforcement-Learning-Pytorch-Cartpole/ecb7b622cfefe825ac95388cceb6752413d90a2a/rainbow/3-DuelDQN/Screenshot2018-11-1519-f88e4bf3-f581-4b24-a07e-af467a4bba64.14.23.png -------------------------------------------------------------------------------- /rainbow/3-DuelDQN/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | batch_size = 32 6 | lr = 0.001 7 | initial_exploration = 1000 8 | goal_score = 200 9 | log_interval = 10 10 | update_target = 100 11 | replay_memory_capacity = 1000 12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | -------------------------------------------------------------------------------- /rainbow/3-DuelDQN/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple 3 | 4 | 5 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 6 | 7 | 8 | class Memory(object): 9 | def __init__(self, capacity): 10 | self.memory = [] 11 | self.capacity = capacity 12 | self.position = 0 13 | 14 | def push(self, state, next_state, action, reward, mask): 15 | """Saves a transition.""" 16 | if len(self.memory) < self.capacity: 17 | self.memory.append(Transition(state, next_state, action, reward, mask)) 18 | self.memory[self.position] = Transition(state, next_state, action, reward, mask) 19 | self.position = (self.position + 1) % self.capacity 20 | 21 | def sample(self, batch_size): 22 | transitions = random.sample(self.memory, batch_size) 23 | batch = Transition(*zip(*transitions)) 24 | return batch 25 | 26 | def __len__(self): 27 | return len(self.memory) 28 | -------------------------------------------------------------------------------- /rainbow/3-DuelDQN/model.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from config import gamma 5 | 6 | class DuelDQNet(nn.Module): 7 | def __init__(self, num_inputs, num_outputs): 8 | super(DuelDQNet, self).__init__() 9 | self.num_inputs = num_inputs 10 | self.num_outputs = num_outputs 11 | 12 | self.fc = nn.Linear(num_inputs, 128) 13 | self.fc_adv = nn.Linear(128, num_outputs) 14 | self.fc_val = nn.Linear(128, 1) 15 | 16 | for m in self.modules(): 17 | if isinstance(m, nn.Linear): 18 | nn.init.xavier_uniform(m.weight) 19 | 20 | def forward(self, x): 21 | x = F.relu(self.fc(x)) 22 | adv = self.fc_adv(x) 23 | adv = adv.view(-1, self.num_outputs) 24 | val = self.fc_val(x) 25 | val = val.view(-1, 1) 26 | 27 | qvalue = val + (adv - adv.mean(dim=1, keepdim=True)) 28 | return qvalue 29 | 30 | @classmethod 31 | def train_model(cls, online_net, target_net, optimizer, batch): 32 | states = torch.stack(batch.state) 33 | next_states = torch.stack(batch.next_state) 34 | actions = torch.Tensor(batch.action).float() 35 | rewards = torch.Tensor(batch.reward) 36 | masks = torch.Tensor(batch.mask) 37 | 38 | pred = online_net(states).squeeze(1) 39 | next_pred = target_net(next_states).squeeze(1) 40 | 41 | pred = torch.sum(pred.mul(actions), dim=1) 42 | 43 | target = rewards + masks * gamma * next_pred.max(1)[0] 44 | 45 | 46 | loss = F.mse_loss(pred, target.detach()) 47 | optimizer.zero_grad() 48 | loss.backward() 49 | optimizer.step() 50 | 51 | return loss 52 | 53 | def get_action(self, input): 54 | qvalue = self.forward(input) 55 | _, action = torch.max(qvalue, 1) 56 | return action.numpy()[0] 57 | -------------------------------------------------------------------------------- /rainbow/3-DuelDQN/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import argparse 6 | import numpy as np 7 | 8 | import torch 9 | import torch.optim as optim 10 | import torch.nn.functional as F 11 | from model import DuelDQNet 12 | from memory import Memory 13 | from tensorboardX import SummaryWriter 14 | from config import env_name, gamma, initial_exploration, batch_size, update_target, log_interval, goal_score, device, replay_memory_capacity, lr 15 | 16 | 17 | def get_action(state, target_net, epsilon, env): 18 | if np.random.rand() <= epsilon: 19 | return env.action_space.sample() 20 | else: 21 | return target_net.get_action(state) 22 | 23 | 24 | 25 | def update_target_model(online_net, target_net): 26 | # Target -> Net 27 | target_net.load_state_dict(online_net.state_dict()) 28 | 29 | 30 | def main(): 31 | env = gym.make(env_name) 32 | env.seed(500) 33 | torch.manual_seed(500) 34 | 35 | num_inputs = env.observation_space.shape[0] 36 | num_actions = env.action_space.n 37 | print('state size:', num_inputs) 38 | print('action size:', num_actions) 39 | 40 | online_net = DuelDQNet(num_inputs, num_actions) 41 | target_net = DuelDQNet(num_inputs, num_actions) 42 | update_target_model(online_net, target_net) 43 | 44 | optimizer = optim.Adam(online_net.parameters(), lr=lr) 45 | writer = SummaryWriter('logs') 46 | 47 | online_net.to(device) 48 | target_net.to(device) 49 | online_net.train() 50 | target_net.train() 51 | memory = Memory(replay_memory_capacity) 52 | running_score = 0 53 | epsilon = 1.0 54 | steps = 0 55 | loss = 0 56 | 57 | for e in range(3000): 58 | done = False 59 | 60 | score = 0 61 | state = env.reset() 62 | state = 
torch.Tensor(state).to(device) 63 | state = state.unsqueeze(0) 64 | 65 | while not done: 66 | steps += 1 67 | action = get_action(state, target_net, epsilon, env) 68 | next_state, reward, done, _ = env.step(action) 69 | 70 | next_state = torch.Tensor(next_state) 71 | next_state = next_state.unsqueeze(0) 72 | 73 | mask = 0 if done else 1 74 | reward = reward if not done or score == 499 else -1 75 | action_one_hot = np.zeros(2) 76 | action_one_hot[action] = 1 77 | memory.push(state, next_state, action_one_hot, reward, mask) 78 | 79 | score += reward 80 | state = next_state 81 | 82 | if steps > initial_exploration: 83 | epsilon -= 0.00005 84 | epsilon = max(epsilon, 0.1) 85 | 86 | batch = memory.sample(batch_size) 87 | loss = DuelDQNet.train_model(online_net, target_net, optimizer, batch) 88 | 89 | if steps % update_target == 0: 90 | update_target_model(online_net, target_net) 91 | 92 | score = score if score == 500.0 else score + 1 93 | running_score = 0.99 * running_score + 0.01 * score 94 | if e % log_interval == 0: 95 | print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format( 96 | e, running_score, epsilon)) 97 | writer.add_scalar('log/score', float(running_score), e) 98 | writer.add_scalar('log/loss', float(loss), e) 99 | 100 | if running_score > goal_score: 101 | break 102 | 103 | 104 | if __name__=="__main__": 105 | main() 106 | -------------------------------------------------------------------------------- /rainbow/4-multistep/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | batch_size = 32 6 | lr = 0.001 7 | initial_exploration = 1000 8 | goal_score = 200 9 | log_interval = 10 10 | update_target = 100 11 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 12 | replay_memory_capacity = 1000 13 | 14 | 15 | n_step = 3 16 | -------------------------------------------------------------------------------- /rainbow/4-multistep/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple, deque 3 | from config import n_step, gamma 4 | 5 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 6 | 7 | 8 | class Memory(object): 9 | def __init__(self, capacity): 10 | self.memory = deque(maxlen=capacity) 11 | self.capacity = capacity 12 | self.reset_local() 13 | 14 | def reset_local(self): 15 | self.local_step = 0 16 | self.local_state = None 17 | self.local_action = None 18 | self.local_rewards = [] 19 | 20 | def push(self, state, next_state, action, reward, mask): 21 | self.local_step += 1 22 | self.local_rewards.append(reward) 23 | if self.local_step == 1: 24 | self.local_state = state 25 | self.local_action = action 26 | if self.local_step == n_step: 27 | reward = 0 28 | for idx, local_reward in enumerate(self.local_rewards): 29 | reward += (gamma ** idx) * local_reward 30 | self.memory.append(Transition(self.local_state, next_state, self.local_action, reward, mask)) 31 | self.reset_local() 32 | if mask == 0: 33 | self.reset_local() 34 | 35 | def sample(self, batch_size): 36 | transitions = random.sample(self.memory, batch_size) 37 | batch = Transition(*zip(*transitions)) 38 | return batch 39 | 40 | def __len__(self): 41 | return len(self.memory) 42 | -------------------------------------------------------------------------------- /rainbow/4-multistep/model.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from config import gamma, n_step 6 | class QNet(nn.Module): 7 | def __init__(self, num_inputs, num_outputs): 8 | super(QNet, self).__init__() 9 | self.num_inputs = num_inputs 10 | self.num_outputs = num_outputs 11 | 12 | self.fc1 = nn.Linear(num_inputs, 128) 13 | self.fc2 = nn.Linear(128, num_outputs) 14 | 15 | for m in self.modules(): 16 | if isinstance(m, nn.Linear): 17 | nn.init.xavier_uniform(m.weight) 18 | 19 | def forward(self, x): 20 | x = F.relu(self.fc1(x)) 21 | qvalue = self.fc2(x) 22 | return qvalue 23 | 24 | @classmethod 25 | def train_model(cls, online_net, target_net, optimizer, batch): 26 | states = torch.stack(batch.state) 27 | next_states = torch.stack(batch.next_state) 28 | actions = torch.Tensor(batch.action).float() 29 | rewards = torch.Tensor(batch.reward) 30 | masks = torch.Tensor(batch.mask) 31 | 32 | pred = online_net(states).squeeze(1) 33 | next_pred = target_net(next_states).squeeze(1) 34 | 35 | pred = torch.sum(pred.mul(actions), dim=1) 36 | 37 | target = rewards + masks * (gamma ** n_step) * next_pred.max(1)[0] 38 | 39 | loss = F.mse_loss(pred, target.detach()) 40 | optimizer.zero_grad() 41 | loss.backward() 42 | optimizer.step() 43 | 44 | return loss 45 | 46 | def get_action(self, input): 47 | qvalue = self.forward(input) 48 | _, action = torch.max(qvalue, 1) 49 | return action.numpy()[0] 50 | -------------------------------------------------------------------------------- /rainbow/4-multistep/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import QNet 11 | from memory import Memory 12 | from tensorboardX import SummaryWriter 13 | 14 | from config import env_name, initial_exploration, batch_size, update_target, goal_score, log_interval, device, replay_memory_capacity, lr 15 | 16 | 17 | def get_action(state, target_net, epsilon, env): 18 | if np.random.rand() <= epsilon: 19 | return env.action_space.sample() 20 | else: 21 | return target_net.get_action(state) 22 | 23 | 24 | 25 | def update_target_model(online_net, target_net): 26 | # Target <- Net 27 | target_net.load_state_dict(online_net.state_dict()) 28 | 29 | 30 | def main(): 31 | env = gym.make(env_name) 32 | env.seed(500) 33 | torch.manual_seed(500) 34 | 35 | num_inputs = env.observation_space.shape[0] 36 | num_actions = env.action_space.n 37 | print('state size:', num_inputs) 38 | print('action size:', num_actions) 39 | 40 | online_net = QNet(num_inputs, num_actions) 41 | target_net = QNet(num_inputs, num_actions) 42 | update_target_model(online_net, target_net) 43 | 44 | optimizer = optim.Adam(online_net.parameters(), lr=lr) 45 | writer = SummaryWriter('logs') 46 | 47 | online_net.to(device) 48 | target_net.to(device) 49 | online_net.train() 50 | target_net.train() 51 | memory = Memory(replay_memory_capacity) 52 | running_score = 0 53 | epsilon = 1.0 54 | steps = 0 55 | loss = 0 56 | 57 | for e in range(3000): 58 | done = False 59 | 60 | score = 0 61 | state = env.reset() 62 | state = torch.Tensor(state).to(device) 63 | state = state.unsqueeze(0) 64 | 65 | while not done: 66 | 67 | steps += 1 68 | 69 | action = get_action(state, target_net, epsilon, env) 70 | next_state, reward, done, _ = env.step(action) 71 | 
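            # memory.push below does the multi-step bookkeeping: it buffers
            # transitions locally and, every n_step steps, stores a single
            # transition whose reward is sum_i gamma**i * r_{t+i} and whose
            # next_state lies n_step steps ahead. train_model then bootstraps
            # with gamma ** n_step. A partial window at episode end is discarded.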
72 | next_state = torch.Tensor(next_state) 73 | next_state = next_state.unsqueeze(0) 74 | 75 | mask = 0 if done else 1 76 | reward = reward if not done or score == 499 else -1 77 | action_one_hot = np.zeros(2) 78 | action_one_hot[action] = 1 79 | memory.push(state, next_state, action_one_hot, reward, mask) 80 | 81 | score += reward 82 | state = next_state 83 | 84 | if steps > initial_exploration: 85 | epsilon -= 0.00005 86 | epsilon = max(epsilon, 0.1) 87 | 88 | batch = memory.sample(batch_size) 89 | loss = QNet.train_model(online_net, target_net, optimizer, batch) 90 | 91 | if steps % update_target == 0: 92 | update_target_model(online_net, target_net) 93 | 94 | score = score if score == 500.0 else score + 1 95 | running_score = 0.99 * running_score + 0.01 * score 96 | if e % log_interval == 0: 97 | print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format( 98 | e, running_score, epsilon)) 99 | writer.add_scalar('log/score', float(running_score), e) 100 | writer.add_scalar('log/loss', float(loss), e) 101 | 102 | if running_score > goal_score: 103 | break 104 | 105 | 106 | if __name__=="__main__": 107 | main() 108 | -------------------------------------------------------------------------------- /rainbow/5-per/Screenshot2018-11-1514-a431e580-fd9d-4a07-afd1-5f80e0042c23.45.16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choru-k/Reinforcement-Learning-Pytorch-Cartpole/ecb7b622cfefe825ac95388cceb6752413d90a2a/rainbow/5-per/Screenshot2018-11-1514-a431e580-fd9d-4a07-afd1-5f80e0042c23.45.16.png -------------------------------------------------------------------------------- /rainbow/5-per/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | batch_size = 32 6 | lr = 0.001 7 | initial_exploration = 1000 8 | goal_score = 200 9 | log_interval = 10 10 | update_target = 100 11 | replay_memory_capacity = 1000 12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | 14 | small_epsilon = 0.0001 15 | alpha = 0.5 16 | beta_start = 0.1 17 | -------------------------------------------------------------------------------- /rainbow/5-per/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | from collections import namedtuple, deque 4 | import torch 5 | from model import QNet 6 | from config import small_epsilon, gamma, alpha, device 7 | 8 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 9 | 10 | 11 | class Memory_With_TDError(object): 12 | def __init__(self, capacity): 13 | self.memory = deque(maxlen=capacity) 14 | self.memory_probabiliy = deque(maxlen=capacity) 15 | self.capacity = capacity 16 | 17 | def push(self, state, next_state, action, reward, mask): 18 | """Saves a transition.""" 19 | if len(self.memory) > 0: 20 | max_probability = max(self.memory_probabiliy) 21 | else: 22 | max_probability = small_epsilon 23 | self.memory.append(Transition(state, next_state, action, reward, mask)) 24 | self.memory_probabiliy.append(max_probability) 25 | 26 | def sample(self, batch_size, net, target_net, beta): 27 | probability_sum = sum(self.memory_probabiliy) 28 | p = [probability / probability_sum for probability in self.memory_probabiliy] 29 | # print(len(self.memory_probabiliy)) 30 | indexes = np.random.choice(np.arange(len(self.memory)), batch_size, p=p) 31 | transitions = 
[self.memory[idx] for idx in indexes] 32 | transitions_p = [p[idx] for idx in indexes] 33 | batch = Transition(*zip(*transitions)) 34 | 35 | weights = [pow(self.capacity * p_j, -beta) for p_j in transitions_p] 36 | weights = torch.Tensor(weights).to(device) 37 | # print(weights) 38 | weights = weights / weights.max() 39 | # print(weights) 40 | 41 | td_error = QNet.get_td_error(net, target_net, batch.state, batch.next_state, batch.action, batch.reward, batch.mask) 42 | 43 | td_error_idx = 0 44 | for idx in indexes: 45 | self.memory_probabiliy[idx] = pow(abs(td_error[td_error_idx]) + small_epsilon, alpha).item() 46 | # print(pow(abs(td_error[td_error_idx]) + small_epsilon, alpha).item()) 47 | td_error_idx += 1 48 | 49 | 50 | return batch, weights 51 | 52 | def __len__(self): 53 | return len(self.memory) 54 | -------------------------------------------------------------------------------- /rainbow/5-per/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from config import gamma 5 | 6 | class QNet(nn.Module): 7 | def __init__(self, num_inputs, num_outputs): 8 | super(QNet, self).__init__() 9 | self.num_inputs = num_inputs 10 | self.num_outputs = num_outputs 11 | 12 | self.fc1 = nn.Linear(num_inputs, 128) 13 | self.fc2 = nn.Linear(128, num_outputs) 14 | 15 | for m in self.modules(): 16 | if isinstance(m, nn.Linear): 17 | nn.init.xavier_uniform(m.weight) 18 | 19 | def forward(self, x): 20 | x = F.relu(self.fc1(x)) 21 | qvalue = self.fc2(x) 22 | return qvalue 23 | 24 | @classmethod 25 | def get_td_error(cls, online_net, target_net, state, next_state, action, reward, mask): 26 | state = torch.stack(state) 27 | next_state = torch.stack(next_state) 28 | action = torch.Tensor(action) 29 | reward = torch.Tensor(reward) 30 | mask = torch.Tensor(mask) 31 | 32 | pred = online_net(state).squeeze(1) 33 | next_pred = target_net(next_state).squeeze(1) 34 | 35 | pred = torch.sum(pred.mul(action), dim=1) 36 | 37 | target = reward + mask * gamma * next_pred.max(1)[0] 38 | 39 | td_error = pred - target.detach() 40 | 41 | return td_error 42 | 43 | @classmethod 44 | def train_model(cls, online_net, target_net, optimizer, batch, weights): 45 | td_error = cls.get_td_error(online_net, target_net, batch.state, batch.next_state, batch.action, batch.reward, batch.mask) 46 | loss = pow(td_error, 2) * weights 47 | loss = loss.mean() 48 | 49 | optimizer.zero_grad() 50 | loss.backward() 51 | optimizer.step() 52 | 53 | return loss 54 | 55 | def get_action(self, input): 56 | qvalue = self.forward(input) 57 | _, action = torch.max(qvalue, 1) 58 | return action.numpy()[0] 59 | -------------------------------------------------------------------------------- /rainbow/5-per/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import argparse 6 | import numpy as np 7 | 8 | import torch 9 | import torch.optim as optim 10 | import torch.nn.functional as F 11 | from model import QNet 12 | from memory import Memory_With_TDError 13 | from tensorboardX import SummaryWriter 14 | from config import env_name, gamma, initial_exploration, batch_size, update_target, log_interval, goal_score, device, replay_memory_capacity, lr, beta_start 15 | 16 | 17 | def get_action(state, target_net, epsilon, env): 18 | if np.random.rand() <= epsilon: 19 | return env.action_space.sample() 20 | else: 21 | return 
target_net.get_action(state) 22 | 23 | 24 | def update_target_model(online_net, target_net): 25 | # Target -> Net 26 | target_net.load_state_dict(online_net.state_dict()) 27 | 28 | 29 | def main(): 30 | env = gym.make(env_name) 31 | env.seed(500) 32 | torch.manual_seed(500) 33 | 34 | num_inputs = env.observation_space.shape[0] 35 | num_actions = env.action_space.n 36 | print('state size:', num_inputs) 37 | print('action size:', num_actions) 38 | 39 | online_net = QNet(num_inputs, num_actions) 40 | target_net = QNet(num_inputs, num_actions) 41 | update_target_model(online_net, target_net) 42 | 43 | optimizer = optim.Adam(online_net.parameters(), lr=lr) 44 | writer = SummaryWriter('logs') 45 | 46 | online_net.to(device) 47 | target_net.to(device) 48 | online_net.train() 49 | target_net.train() 50 | memory = Memory_With_TDError(replay_memory_capacity) 51 | running_score = 0 52 | epsilon = 1.0 53 | steps = 0 54 | beta = beta_start 55 | loss = 0 56 | 57 | for e in range(3000): 58 | done = False 59 | 60 | score = 0 61 | state = env.reset() 62 | state = torch.Tensor(state).to(device) 63 | state = state.unsqueeze(0) 64 | 65 | while not done: 66 | steps += 1 67 | action = get_action(state, target_net, epsilon, env) 68 | next_state, reward, done, _ = env.step(action) 69 | 70 | next_state = torch.Tensor(next_state) 71 | next_state = next_state.unsqueeze(0) 72 | 73 | mask = 0 if done else 1 74 | reward = reward if not done or score == 499 else -1 75 | action_one_hot = np.zeros(2) 76 | action_one_hot[action] = 1 77 | memory.push(state, next_state, action_one_hot, reward, mask) 78 | 79 | score += reward 80 | state = next_state 81 | 82 | if steps > initial_exploration: 83 | epsilon -= 0.00005 84 | epsilon = max(epsilon, 0.1) 85 | beta += 0.00005 86 | beta = min(1, beta) 87 | 88 | batch, weights = memory.sample(batch_size, online_net, target_net, beta) 89 | loss = QNet.train_model(online_net, target_net, optimizer, batch, weights) 90 | 91 | if steps % update_target == 0: 92 | update_target_model(online_net, target_net) 93 | 94 | score = score if score == 500.0 else score + 1 95 | running_score = 0.99 * running_score + 0.01 * score 96 | if e % log_interval == 0: 97 | print('{} episode | score: {:.2f} | epsilon: {:.2f} | beta: {:.2f}'.format( 98 | e, running_score, epsilon, beta)) 99 | writer.add_scalar('log/score', float(running_score), e) 100 | writer.add_scalar('log/loss', float(loss), e) 101 | 102 | if running_score > goal_score: 103 | break 104 | 105 | 106 | if __name__=="__main__": 107 | main() 108 | -------------------------------------------------------------------------------- /rainbow/6-Nosiy_net/Screenshot2018-11-1616-fd936286-4e40-4962-99ff-1ddd3b7deeb8.36.21.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choru-k/Reinforcement-Learning-Pytorch-Cartpole/ecb7b622cfefe825ac95388cceb6752413d90a2a/rainbow/6-Nosiy_net/Screenshot2018-11-1616-fd936286-4e40-4962-99ff-1ddd3b7deeb8.36.21.png -------------------------------------------------------------------------------- /rainbow/6-Nosiy_net/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | batch_size = 32 6 | lr = 0.001 7 | initial_exploration = 1000 8 | goal_score = 200 9 | log_interval = 10 10 | update_target = 100 11 | replay_memory_capacity = 1000 12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | 14 | 15 | sigma_zero = 0.5 16 | 
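# sigma_zero is the sigma_0 constant from the NoisyNet paper: NoisyLinear in
# model.py initialises every sigma parameter to sigma_zero / sqrt(fan) and
# resamples factorized Gaussian noise f(x) = sign(x) * sqrt(|x|) after each
# training step.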
-------------------------------------------------------------------------------- /rainbow/6-Nosiy_net/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple, deque 3 | 4 | 5 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 6 | 7 | 8 | class Memory(object): 9 | def __init__(self, capacity): 10 | self.memory = deque(maxlen=capacity) 11 | self.capacity = capacity 12 | 13 | def push(self, state, next_state, action, reward, mask): 14 | self.memory.append(Transition(state, next_state, action, reward, mask)) 15 | 16 | def sample(self, batch_size): 17 | transitions = random.sample(self.memory, batch_size) 18 | batch = Transition(*zip(*transitions)) 19 | return batch 20 | 21 | def __len__(self): 22 | return len(self.memory) 23 | -------------------------------------------------------------------------------- /rainbow/6-Nosiy_net/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | import math 6 | from config import gamma, sigma_zero 7 | 8 | class NoisyLinear(nn.Module): 9 | def __init__(self, in_features, out_features): 10 | super(NoisyLinear, self).__init__() 11 | self.in_features = in_features 12 | self.out_features = out_features 13 | self.sigma_zero = sigma_zero 14 | self.weight_mu = nn.Parameter(torch.empty(out_features, in_features)) 15 | self.weight_sigma = nn.Parameter(torch.empty(out_features, in_features)) 16 | self.register_buffer('weight_epsilon', torch.empty(out_features, in_features)) 17 | self.bias_mu = nn.Parameter(torch.empty(out_features)) 18 | self.bias_sigma = nn.Parameter(torch.empty(out_features)) 19 | self.register_buffer('bias_epsilon', torch.empty(out_features)) 20 | self.reset_parameters() 21 | self.reset_noise() 22 | 23 | def reset_parameters(self): 24 | mu_range = 1 / math.sqrt(self.in_features) 25 | self.weight_mu.data.uniform_(-mu_range, mu_range) 26 | self.weight_sigma.data.fill_(self.sigma_zero / math.sqrt(self.in_features)) 27 | self.bias_mu.data.uniform_(-mu_range, mu_range) 28 | self.bias_sigma.data.fill_(self.sigma_zero / math.sqrt(self.out_features)) 29 | 30 | def _scale_noise(self, size): 31 | x = torch.randn(size) 32 | return x.sign().mul_(x.abs().sqrt_()) 33 | 34 | def reset_noise(self): 35 | epsilon_in = self._scale_noise(self.in_features) 36 | epsilon_out = self._scale_noise(self.out_features) 37 | self.weight_epsilon.copy_(epsilon_out.ger(epsilon_in)) 38 | self.bias_epsilon.copy_(epsilon_out) 39 | 40 | def forward(self, input): 41 | return F.linear(input, self.weight_mu + self.weight_sigma * self.weight_epsilon, self.bias_mu + self.bias_sigma * self.bias_epsilon) 42 | 43 | 44 | class QNet(nn.Module): 45 | def __init__(self, num_inputs, num_outputs): 46 | super(QNet, self).__init__() 47 | self.num_inputs = num_inputs 48 | self.num_outputs = num_outputs 49 | 50 | self.fc1 = nn.Linear(num_inputs, 128) 51 | self.fc2 = NoisyLinear(128, num_outputs) 52 | 53 | for m in self.modules(): 54 | if isinstance(m, nn.Linear): 55 | nn.init.xavier_uniform(m.weight) 56 | 57 | def forward(self, x): 58 | x = F.relu(self.fc1(x)) 59 | qvalue = self.fc2(x) 60 | return qvalue 61 | 62 | @classmethod 63 | def train_model(cls, online_net, target_net, optimizer, batch): 64 | states = torch.stack(batch.state) 65 | next_states = torch.stack(batch.next_state) 66 | actions = torch.Tensor(batch.action).float() 67 | rewards = 
torch.Tensor(batch.reward) 68 | masks = torch.Tensor(batch.mask) 69 | 70 | pred = online_net(states).squeeze(1) 71 | next_pred = target_net(next_states).squeeze(1) 72 | 73 | pred = torch.sum(pred.mul(actions), dim=1) 74 | 75 | target = rewards + masks * gamma * next_pred.max(1)[0] 76 | 77 | loss = F.mse_loss(pred, target.detach()) 78 | optimizer.zero_grad() 79 | loss.backward() 80 | optimizer.step() 81 | online_net.reset_noise() 82 | 83 | return loss 84 | 85 | def get_action(self, input): 86 | qvalue = self.forward(input) 87 | _, action = torch.max(qvalue, 1) 88 | return action.numpy()[0] 89 | 90 | def reset_noise(self): 91 | self.fc2.reset_noise() 92 | -------------------------------------------------------------------------------- /rainbow/6-Nosiy_net/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import QNet 11 | from memory import Memory 12 | from tensorboardX import SummaryWriter 13 | 14 | from config import env_name, initial_exploration, batch_size, update_target, goal_score, log_interval, device, replay_memory_capacity, lr 15 | 16 | 17 | def get_action(state, target_net, epsilon, env): 18 | if np.random.rand() <= epsilon: 19 | return env.action_space.sample() 20 | else: 21 | return target_net.get_action(state) 22 | 23 | def update_target_model(online_net, target_net): 24 | # Target <- Net 25 | target_net.load_state_dict(online_net.state_dict()) 26 | 27 | 28 | def main(): 29 | env = gym.make(env_name) 30 | env.seed(500) 31 | torch.manual_seed(500) 32 | 33 | num_inputs = env.observation_space.shape[0] 34 | num_actions = env.action_space.n 35 | print('state size:', num_inputs) 36 | print('action size:', num_actions) 37 | 38 | online_net = QNet(num_inputs, num_actions) 39 | target_net = QNet(num_inputs, num_actions) 40 | update_target_model(online_net, target_net) 41 | 42 | optimizer = optim.Adam(online_net.parameters(), lr=lr) 43 | writer = SummaryWriter('logs') 44 | 45 | online_net.to(device) 46 | target_net.to(device) 47 | online_net.train() 48 | target_net.train() 49 | memory = Memory(replay_memory_capacity) 50 | running_score = 0 51 | epsilon = 1.0 52 | steps = 0 53 | loss = 0 54 | 55 | for e in range(3000): 56 | done = False 57 | 58 | score = 0 59 | state = env.reset() 60 | state = torch.Tensor(state).to(device) 61 | state = state.unsqueeze(0) 62 | 63 | while not done: 64 | steps += 1 65 | 66 | action = get_action(state, target_net, epsilon, env) 67 | next_state, reward, done, _ = env.step(action) 68 | 69 | next_state = torch.Tensor(next_state) 70 | next_state = next_state.unsqueeze(0) 71 | 72 | mask = 0 if done else 1 73 | reward = reward if not done or score == 499 else -1 74 | action_one_hot = np.zeros(2) 75 | action_one_hot[action] = 1 76 | memory.push(state, next_state, action_one_hot, reward, mask) 77 | 78 | score += reward 79 | state = next_state 80 | 81 | if steps > initial_exploration: 82 | epsilon -= 0.00005 83 | epsilon = max(epsilon, 0.1) 84 | 85 | batch = memory.sample(batch_size) 86 | loss = QNet.train_model(online_net, target_net, optimizer, batch) 87 | 88 | if steps % update_target == 0: 89 | update_target_model(online_net, target_net) 90 | 91 | score = score if score == 500.0 else score + 1 92 | running_score = 0.99 * running_score + 0.01 * score 93 | if e % log_interval == 0: 94 | print('{} episode | score: {:.2f} | epsilon: 
{:.2f}'.format( 95 | e, running_score, epsilon)) 96 | writer.add_scalar('log/score', float(running_score), e) 97 | writer.add_scalar('log/loss', float(loss), e) 98 | 99 | if running_score > goal_score: 100 | break 101 | 102 | 103 | if __name__=="__main__": 104 | main() 105 | -------------------------------------------------------------------------------- /rainbow/7-distributional_c51/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | batch_size = 32 6 | lr = 0.0001 7 | initial_exploration = 1000 8 | goal_score = 200 9 | log_interval = 10 10 | update_target = 100 11 | replay_memory_capacity = 1000 12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | 14 | 15 | num_support = 8 16 | V_max = 5 17 | V_min = -5 18 | -------------------------------------------------------------------------------- /rainbow/7-distributional_c51/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple, deque 3 | 4 | 5 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 6 | 7 | 8 | class Memory(object): 9 | def __init__(self, capacity): 10 | self.memory = deque(maxlen=capacity) 11 | self.capacity = capacity 12 | 13 | def push(self, state, next_state, action, reward, mask): 14 | self.memory.append(Transition(state, next_state, action, reward, mask)) 15 | 16 | def sample(self, batch_size): 17 | transitions = random.sample(self.memory, batch_size) 18 | batch = Transition(*zip(*transitions)) 19 | return batch 20 | 21 | def __len__(self): 22 | return len(self.memory) 23 | -------------------------------------------------------------------------------- /rainbow/7-distributional_c51/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | from config import batch_size, num_support, gamma, V_max, V_min 7 | 8 | 9 | class Distributional_C51(nn.Module): 10 | def __init__(self, num_inputs, num_outputs): 11 | super(Distributional_C51, self).__init__() 12 | self.num_inputs = num_inputs 13 | self.num_outputs = num_outputs 14 | 15 | self.dz = float(V_max - V_min) / (num_support - 1) 16 | self.z = torch.Tensor([V_min + i * self.dz for i in range(num_support)]) 17 | 18 | self.fc1 = nn.Linear(num_inputs, 128) 19 | self.fc2 = nn.Linear(128, num_outputs * num_support) 20 | 21 | for m in self.modules(): 22 | if isinstance(m, nn.Linear): 23 | nn.init.xavier_uniform(m.weight) 24 | 25 | 26 | def forward(self, input): 27 | x = F.relu(self.fc1(input)) 28 | x = self.fc2(x) 29 | z = x.view(-1, self.num_outputs, num_support) 30 | p = nn.Softmax(dim=2)(z) 31 | return p 32 | 33 | 34 | def get_action(self, input): 35 | p = self.forward(input) 36 | p = p.squeeze(0) 37 | z_space = self.z.repeat(self.num_outputs, 1) 38 | Q = torch.sum(p * z_space, dim=1) 39 | action = torch.argmax(Q) 40 | return action.item() 41 | 42 | @classmethod 43 | def get_m(cls, _rewards, _masks, _prob_next_states_action): 44 | rewards = _rewards.numpy() 45 | masks = _masks.numpy() 46 | prob_next_states_action = _prob_next_states_action.detach().numpy() 47 | m_prob = np.zeros([batch_size, num_support], dtype=np.float32) 48 | 49 | dz = float(V_max - V_min) / (num_support - 1) 50 | batch_id = range(batch_size) 51 | for j in range(num_support): 52 | Tz = np.clip(rewards + 
masks * gamma * (V_min + j * dz), V_min, V_max) 53 | bj = (Tz - V_min) / dz 54 | 55 | lj = np.floor(bj).astype(np.int64) 56 | uj = np.ceil(bj).astype(np.int64) 57 | 58 | blj = (bj - lj) 59 | buj = (uj - bj) 60 | 61 | m_prob[batch_id, lj[batch_id]] += ((1 - masks) + masks * (prob_next_states_action[batch_id, j])) * buj[batch_id] 62 | m_prob[batch_id, uj[batch_id]] += ((1 - masks) + masks * (prob_next_states_action[batch_id, j])) * blj[batch_id] 63 | 64 | return m_prob 65 | 66 | 67 | @classmethod 68 | def train_model(cls, online_net, target_net, optimizer, batch): 69 | states = torch.stack(batch.state) 70 | next_states = torch.stack(batch.next_state) 71 | actions = torch.Tensor(batch.action).int() 72 | rewards = torch.Tensor(batch.reward) 73 | masks = torch.Tensor(batch.mask) 74 | 75 | z_space = online_net.z.repeat(batch_size, online_net.num_outputs, 1) 76 | prob_next_states = target_net(next_states) 77 | Q_next_state = torch.sum(prob_next_states * z_space, 2) 78 | next_actions = torch.argmax(Q_next_state, 1) 79 | prob_next_states_action = torch.stack([prob_next_states[i, action, :] for i, action in enumerate(next_actions)]) 80 | 81 | m_prob = cls.get_m(rewards, masks, prob_next_states_action) 82 | m_prob = torch.tensor(m_prob) 83 | 84 | m_prob = (m_prob / torch.sum(m_prob, dim=1, keepdim=True)).detach() 85 | expand_dim_action = torch.unsqueeze(actions, -1) 86 | p = torch.sum(online_net(states) * expand_dim_action.float(), dim=1) 87 | loss = -torch.sum(m_prob * torch.log(p + 1e-20), 1) 88 | loss = loss.mean() 89 | 90 | optimizer.zero_grad() 91 | loss.backward() 92 | optimizer.step() 93 | 94 | return loss 95 | -------------------------------------------------------------------------------- /rainbow/7-distributional_c51/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import argparse 6 | import numpy as np 7 | 8 | import torch 9 | import torch.optim as optim 10 | import torch.nn.functional as F 11 | from tensorboardX import SummaryWriter 12 | 13 | from model import Distributional_C51 14 | from memory import Memory 15 | 16 | from config import env_name, initial_exploration, batch_size, update_target, goal_score, log_interval, device, replay_memory_capacity, lr 17 | 18 | 19 | def get_action(state, target_net, epsilon, env): 20 | if np.random.rand() <= epsilon: 21 | return env.action_space.sample() 22 | else: 23 | return target_net.get_action(state) 24 | 25 | def update_target_model(online_net, target_net): 26 | # Target <- Net 27 | target_net.load_state_dict(online_net.state_dict()) 28 | 29 | 30 | 31 | def main(): 32 | env = gym.make(env_name) 33 | env.seed(500) 34 | torch.manual_seed(500) 35 | 36 | num_inputs = env.observation_space.shape[0] 37 | num_actions = env.action_space.n 38 | print('state size:', num_inputs) 39 | print('action size:', num_actions) 40 | 41 | online_net = Distributional_C51(num_inputs, num_actions) 42 | target_net = Distributional_C51(num_inputs, num_actions) 43 | update_target_model(online_net, target_net) 44 | 45 | optimizer = optim.Adam(online_net.parameters(), lr=lr) 46 | writer = SummaryWriter('logs') 47 | 48 | online_net.to(device) 49 | target_net.to(device) 50 | online_net.train() 51 | target_net.train() 52 | memory = Memory(replay_memory_capacity) 53 | running_score = 0 54 | epsilon = 1.0 55 | steps = 0 56 | loss = 0 57 | 58 | for e in range(3000): 59 | done = False 60 | 61 | score = 0 62 | state = env.reset() 63 | state = torch.Tensor(state) 64 | state = 
state.unsqueeze(0) 65 | 66 | while not done: 67 | steps += 1 68 | action = get_action(state, target_net, epsilon, env) 69 | next_state, reward, done, _ = env.step(action) 70 | 71 | next_state = torch.Tensor(next_state) 72 | next_state = next_state.unsqueeze(0) 73 | 74 | mask = 0 if done else 1 75 | reward = reward if not done or score == 499 else -1 76 | action_one_hot = np.zeros(2) 77 | action_one_hot[action] = 1 78 | memory.push(state, next_state, action_one_hot, reward, mask) 79 | 80 | score += reward 81 | state = next_state 82 | 83 | if steps > initial_exploration: 84 | epsilon -= 0.00005 85 | epsilon = max(epsilon, 0.1) 86 | 87 | batch = memory.sample(batch_size) 88 | loss = Distributional_C51.train_model(online_net, target_net, optimizer, batch) 89 | 90 | if steps % update_target == 0: 91 | update_target_model(online_net, target_net) 92 | 93 | score = score if score == 500.0 else score + 1 94 | running_score = 0.99 * running_score + 0.01 * score 95 | if e % log_interval == 0: 96 | print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format( 97 | e, running_score, epsilon)) 98 | writer.add_scalar('log/score', float(running_score), e) 99 | writer.add_scalar('log/loss', float(loss), e) 100 | 101 | if running_score > goal_score: 102 | break 103 | 104 | if __name__=="__main__": 105 | main() 106 | -------------------------------------------------------------------------------- /rainbow/8-Not_Distributional/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | batch_size = 32 6 | lr = 0.001 7 | initial_exploration = 1000 8 | goal_score = 200 9 | log_interval = 10 10 | update_target = 100 11 | replay_memory_capacity = 1000 12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | 14 | # Multi_Step 15 | n_step = 3 16 | 17 | # PER 18 | small_epsilon = 0.0001 19 | alpha = 0.5 20 | beta_start = 0.1 21 | 22 | # Noisy Net 23 | sigma_zero = 0.5 24 | -------------------------------------------------------------------------------- /rainbow/8-Not_Distributional/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | from collections import namedtuple, deque 4 | import torch 5 | from model import QNet 6 | from config import small_epsilon, gamma, alpha, device, n_step 7 | 8 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 9 | 10 | 11 | class Memory_With_TDError(object): 12 | def __init__(self, capacity): 13 | self.memory = [] 14 | self.memory_probabiliy = [] 15 | self.capacity = capacity 16 | self.position = 0 17 | self.reset_local() 18 | 19 | def reset_local(self): 20 | self.local_step = 0 21 | self.local_state = None 22 | self.local_action = None 23 | self.local_rewards = [] 24 | 25 | def push(self, state, next_state, action, reward, mask): 26 | self.local_step += 1 27 | self.local_rewards.append(reward) 28 | if self.local_step == 1: 29 | self.local_state = state 30 | self.local_action = action 31 | if self.local_step == n_step: 32 | reward = 0 33 | for idx, local_reward in enumerate(self.local_rewards): 34 | reward += (gamma ** idx) * local_reward 35 | self.push_to_memory(self.local_state, next_state, self.local_action, reward, mask) 36 | self.reset_local() 37 | if mask == 0: 38 | self.reset_local() 39 | 40 | 41 | def push_to_memory(self, state, next_state, action, reward, mask): 42 | if len(self.memory) > 0: 43 | max_probability = 
max(self.memory_probabiliy) 44 | else: 45 | max_probability = small_epsilon 46 | 47 | if len(self.memory) < self.capacity: 48 | self.memory.append(Transition(state, next_state, action, reward, mask)) 49 | self.memory_probabiliy.append(max_probability) 50 | else: 51 | self.memory[self.position] = Transition(state, next_state, action, reward, mask) 52 | self.memory_probabiliy[self.position] = max_probability 53 | 54 | self.position = (self.position + 1) % self.capacity 55 | 56 | def sample(self, batch_size, net, target_net, beta): 57 | probability_sum = sum(self.memory_probabiliy) 58 | p = [probability / probability_sum for probability in self.memory_probabiliy] 59 | 60 | indexes = np.random.choice(np.arange(len(self.memory)), batch_size, p=p) 61 | transitions = [self.memory[idx] for idx in indexes] 62 | transitions_p = [p[idx] for idx in indexes] 63 | batch = Transition(*zip(*transitions)) 64 | 65 | weights = [pow(self.capacity * p_j, -beta) for p_j in transitions_p] 66 | weights = torch.Tensor(weights).to(device) 67 | weights = weights / weights.max() 68 | 69 | 70 | td_error = QNet.get_td_error(net, target_net, batch.state, batch.next_state, batch.action, batch.reward, batch.mask) 71 | td_error = td_error.detach() 72 | 73 | td_error_idx = 0 74 | for idx in indexes: 75 | self.memory_probabiliy[idx] = pow(abs(td_error[td_error_idx]) + small_epsilon, alpha).item() 76 | # print(pow(abs(td_error[td_error_idx]) + small_epsilon, alpha).item()) 77 | td_error_idx += 1 78 | 79 | 80 | return batch, weights 81 | 82 | def __len__(self): 83 | return len(self.memory) 84 | -------------------------------------------------------------------------------- /rainbow/8-Not_Distributional/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import math 5 | 6 | from config import gamma, sigma_zero, n_step 7 | 8 | class NoisyLinear(nn.Module): 9 | def __init__(self, in_features, out_features): 10 | super(NoisyLinear, self).__init__() 11 | self.in_features = in_features 12 | self.out_features = out_features 13 | self.sigma_zero = sigma_zero 14 | self.weight_mu = nn.Parameter(torch.empty(out_features, in_features)) 15 | self.weight_sigma = nn.Parameter(torch.empty(out_features, in_features)) 16 | self.register_buffer('weight_epsilon', torch.empty(out_features, in_features)) 17 | self.bias_mu = nn.Parameter(torch.empty(out_features)) 18 | self.bias_sigma = nn.Parameter(torch.empty(out_features)) 19 | self.register_buffer('bias_epsilon', torch.empty(out_features)) 20 | self.reset_parameters() 21 | self.reset_noise() 22 | 23 | def reset_parameters(self): 24 | mu_range = 1 / math.sqrt(self.in_features) 25 | self.weight_mu.data.uniform_(-mu_range, mu_range) 26 | self.weight_sigma.data.fill_(self.sigma_zero / math.sqrt(self.in_features)) 27 | self.bias_mu.data.uniform_(-mu_range, mu_range) 28 | self.bias_sigma.data.fill_(self.sigma_zero / math.sqrt(self.out_features)) 29 | 30 | def _scale_noise(self, size): 31 | x = torch.randn(size) 32 | return x.sign().mul_(x.abs().sqrt_()) 33 | 34 | def reset_noise(self): 35 | epsilon_in = self._scale_noise(self.in_features) 36 | epsilon_out = self._scale_noise(self.out_features) 37 | self.weight_epsilon.copy_(epsilon_out.ger(epsilon_in)) 38 | self.bias_epsilon.copy_(epsilon_out) 39 | 40 | def forward(self, input): 41 | return F.linear(input, self.weight_mu + self.weight_sigma * self.weight_epsilon, self.bias_mu + self.bias_sigma * self.bias_epsilon) 42 | 43 | 44 | 
class QNet(nn.Module): 45 | def __init__(self, num_inputs, num_outputs): 46 | super(QNet, self).__init__() 47 | self.num_inputs = num_inputs 48 | self.num_outputs = num_outputs 49 | 50 | self.fc = nn.Linear(num_inputs, 128) 51 | self.fc_adv = NoisyLinear(128, num_outputs) 52 | self.fc_val = nn.Linear(128, 1) 53 | 54 | 55 | for m in self.modules(): 56 | if isinstance(m, nn.Linear): 57 | nn.init.xavier_uniform(m.weight) 58 | 59 | def forward(self, x): 60 | x = F.relu(self.fc(x)) 61 | adv = self.fc_adv(x) 62 | adv = adv.view(-1, self.num_outputs) 63 | val = self.fc_val(x) 64 | val = val.view(-1, 1) 65 | 66 | 67 | qvalue = val + (adv - adv.mean(dim=1, keepdim=True)) 68 | # (batch, action) = (batch) + ((batch, action) - (batch)) 69 | return qvalue 70 | 71 | @classmethod 72 | def get_td_error(cls, online_net, target_net, states, next_states, actions, rewards, masks): 73 | states = torch.stack(states) 74 | next_states = torch.stack(next_states) 75 | actions = torch.Tensor(actions) 76 | rewards = torch.Tensor(rewards) 77 | masks = torch.Tensor(masks) 78 | 79 | pred = online_net(states) 80 | qvalue = online_net(next_states) 81 | 82 | _, action_from_online_net = online_net(next_states).max(1) 83 | 84 | target_net.reset_noise() 85 | next_pred = target_net(next_states).squeeze(1) 86 | 87 | pred = torch.sum(pred.mul(actions), dim=1) 88 | 89 | target = rewards + masks * (gamma ** n_step) * next_pred.gather(1, action_from_online_net.unsqueeze(1)).squeeze(1) 90 | 91 | td_error = pred - target.detach() 92 | 93 | return td_error 94 | 95 | @classmethod 96 | def train_model(cls, online_net, target_net, optimizer, batch, weights): 97 | td_error = cls.get_td_error(online_net, target_net, batch.state, batch.next_state, batch.action, batch.reward, batch.mask) 98 | 99 | loss = pow(td_error, 2) * weights 100 | loss = loss.mean() 101 | 102 | optimizer.zero_grad() 103 | loss.backward() 104 | optimizer.step() 105 | 106 | return loss 107 | 108 | def get_action(self, input): 109 | self.reset_noise() 110 | qvalue = self.forward(input) 111 | _, action = torch.max(qvalue, 1) 112 | return action.numpy()[0] 113 | 114 | def reset_noise(self): 115 | self.fc_adv.reset_noise() 116 | -------------------------------------------------------------------------------- /rainbow/8-Not_Distributional/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import QNet 11 | from memory import Memory_With_TDError 12 | from tensorboardX import SummaryWriter 13 | 14 | from config import env_name, initial_exploration, batch_size, update_target, goal_score, log_interval, device, replay_memory_capacity, lr, beta_start 15 | 16 | def update_target_model(online_net, target_net): 17 | # Target <- Net 18 | target_net.load_state_dict(online_net.state_dict()) 19 | 20 | 21 | def main(): 22 | env = gym.make(env_name) 23 | env.seed(500) 24 | torch.manual_seed(500) 25 | 26 | num_inputs = env.observation_space.shape[0] 27 | num_actions = env.action_space.n 28 | print('state size:', num_inputs) 29 | print('action size:', num_actions) 30 | 31 | online_net = QNet(num_inputs, num_actions) 32 | target_net = QNet(num_inputs, num_actions) 33 | update_target_model(online_net, target_net) 34 | 35 | optimizer = optim.Adam(online_net.parameters(), lr=lr) 36 | writer = SummaryWriter('logs') 37 | 38 | online_net.to(device) 39 | target_net.to(device) 
40 | online_net.train() 41 | target_net.train() 42 | memory = Memory_With_TDError(replay_memory_capacity) 43 | running_score = 0 44 | steps = 0 45 | beta = beta_start 46 | loss = 0 47 | 48 | for e in range(3000): 49 | done = False 50 | 51 | score = 0 52 | state = env.reset() 53 | state = torch.Tensor(state).to(device) 54 | state = state.unsqueeze(0) 55 | 56 | while not done: 57 | steps += 1 58 | 59 | action = target_net.get_action(state) 60 | next_state, reward, done, _ = env.step(action) 61 | 62 | next_state = torch.Tensor(next_state) 63 | next_state = next_state.unsqueeze(0) 64 | 65 | mask = 0 if done else 1 66 | reward = reward if not done or score == 499 else -1 67 | action_one_hot = np.zeros(2) 68 | action_one_hot[action] = 1 69 | memory.push(state, next_state, action_one_hot, reward, mask) 70 | 71 | score += reward 72 | state = next_state 73 | 74 | if steps > initial_exploration: 75 | beta += 0.00005 76 | beta = min(1, beta) 77 | 78 | batch, weights = memory.sample(batch_size, online_net, target_net, beta) 79 | loss = QNet.train_model(online_net, target_net, optimizer, batch, weights) 80 | 81 | if steps % update_target == 0: 82 | update_target_model(online_net, target_net) 83 | 84 | score = score if score == 500.0 else score + 1 85 | running_score = 0.99 * running_score + 0.01 * score 86 | if e % log_interval == 0: 87 | print('{} episode | score: {:.2f} | beta: {:.2f}'.format( 88 | e, running_score, beta)) 89 | writer.add_scalar('log/score', float(running_score), e) 90 | writer.add_scalar('log/loss', float(loss), e) 91 | 92 | if running_score > goal_score: 93 | break 94 | 95 | 96 | if __name__=="__main__": 97 | main() 98 | -------------------------------------------------------------------------------- /rainbow/9-Rainbow/README-KR.md: -------------------------------------------------------------------------------- 1 | # Rainbow 2 | 3 | Last Edited: Nov 19, 2018 8:32 PM 4 | 5 | ## Duel + Distributional 6 | 7 | So far, Q has been defined as 8 | 9 | $$Q(s,a) = R(s,a) + \gamma \max_{a'}Q(s',a')$$ 10 | 11 | In Dueling, we used the equation below. 12 | 13 | $$Q(s,a) = V(s) + A(s,a)$$ 14 | 15 | And in the Distributional version, 16 | 17 | $$Z(s,a) = R(s,a) + \gamma Z(s',a')$$ 18 | 19 | where Z is a probability distribution. If we apply the two together, 20 | 21 | $$Z(s,a) = V(s) + A(s,a)$$ 22 | 23 | then V and A can themselves be treated as probability distributions. V has shape `(num_support)` and A has shape `(action_space, num_support)`. 24 | 25 | ```python 26 | def forward(self, x): 27 | x = F.relu(self.fc(x)) 28 | adv = self.fc_adv(x) 29 | val = self.fc_val(x) 30 | 31 | val = val.view(-1, 1, num_support) 32 | adv = adv.view(-1, self.num_outputs, num_support) 33 | z = val + (adv - adv.mean(1, keepdim=True)) 34 | z = z.view(-1, self.num_outputs, num_support) 35 | p = nn.Softmax(dim=2)(z) 36 | return p 37 | ``` 38 | 39 | ## Double + Distributional 40 | 41 | The next action is selected with online_net (and its distribution is taken from target_net). 42 | 43 | ```python 44 | z_space = online_net.z.repeat(batch_size, online_net.num_outputs, 1) 45 | prob_next_states_online = online_net(next_states) 46 | prob_next_states_target = target_net(next_states) 47 | Q_next_state = torch.sum(prob_next_states_online * z_space, 2) 48 | next_actions = torch.argmax(Q_next_state, 1) 49 | prob_next_states_action = torch.stack([prob_next_states_target[i, action, :] for i, action in enumerate(next_actions)]) 50 | 51 | ``` 52 | 53 | ## PER + Distributional 54 | 55 | In the earlier PER, the priority of an experience was proportional to the absolute value of its td_error. 56 | 57 | Here, the distributional loss is used as the priority of the experience instead. 
58 | 59 | ```python 60 | td_error = QNet.get_loss(net, target_net, batch.state, batch.next_state, batch.action, batch.reward, batch.mask) 61 | ``` 62 | 63 | 64 | 65 | ```python 66 | 67 | @classmethod 68 | def get_loss(cls, oneline_net, target_net, states, next_states, actions, rewards, masks): 69 | states = torch.stack(states) 70 | next_states = torch.stack(next_states) 71 | actions = torch.Tensor(actions).int() 72 | rewards = torch.Tensor(rewards) 73 | masks = torch.Tensor(masks) 74 | 75 | z_space = oneline_net.z.repeat(batch_size, oneline_net.num_outputs, 1) 76 | prob_next_states = oneline_net(next_states) 77 | Q_next_state = torch.sum(prob_next_states * z_space, 2) 78 | next_actions = torch.argmax(Q_next_state, 1) 79 | prob_next_states_action = torch.stack([prob_next_states[i, action, :] for i, action in enumerate(next_actions)]) 80 | 81 | m_prob = cls.get_m(rewards, masks, prob_next_states_action) 82 | m_prob = torch.tensor(m_prob) 83 | 84 | m_prob = m_prob / torch.sum(m_prob, dim=1, keepdim=True) 85 | expand_dim_action = torch.unsqueeze(actions, -1) 86 | p = torch.sum(oneline_net(states) * expand_dim_action.float(), dim=1) 87 | loss = -torch.sum(m_prob * torch.log(p + 1e-20), 1) 88 | 89 | return loss 90 | ``` 91 | 92 | Everything else is the same as the non-distributional version. 93 | 94 | -------------------------------------------------------------------------------- /rainbow/9-Rainbow/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | env_name = 'CartPole-v1' 4 | gamma = 0.99 5 | batch_size = 32 6 | lr = 0.0001 7 | initial_exploration = 1000 8 | goal_score = 200 9 | log_interval = 10 10 | update_target = 100 11 | replay_memory_capacity = 1000 12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | 14 | # Multi_Step 15 | n_step = 1 16 | 17 | # PER 18 | small_epsilon = 0.0001 19 | alpha = 1 20 | beta_start = 0.1 21 | 22 | # Noisy Net 23 | sigma_zero = 0.5 24 | 25 | # Distributional 26 | num_support = 8 27 | V_max = 5 28 | V_min = -5 29 | -------------------------------------------------------------------------------- /rainbow/9-Rainbow/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import torch 4 | from collections import namedtuple, deque 5 | from model import QNet 6 | from config import small_epsilon, gamma, alpha, device, n_step 7 | 8 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 9 | 10 | 11 | class Memory(object): 12 | def __init__(self, capacity): 13 | self.memory = [] 14 | self.memory_probabiliy = [] 15 | self.capacity = capacity 16 | self.position = 0 17 | self.reset_local() 18 | 19 | def reset_local(self): 20 | self.local_step = 0 21 | self.local_state = None 22 | self.local_action = None 23 | self.local_rewards = [] 24 | 25 | def push(self, state, next_state, action, reward, mask): 26 | self.local_step += 1 27 | self.local_rewards.append(reward) 28 | if self.local_step == 1: 29 | self.local_state = state 30 | self.local_action = action 31 | if self.local_step == n_step: 32 | reward = 0 33 | for idx, local_reward in enumerate(self.local_rewards): 34 | reward += (gamma ** idx) * local_reward 35 | self.push_to_memory(self.local_state, next_state, self.local_action, reward, mask) 36 | self.reset_local() 37 | if mask == 0: 38 | self.reset_local() 39 | 40 | 41 | def push_to_memory(self, state, next_state, action, reward, mask): 42 | if len(self.memory) > 0: 43 | max_probability = 
max(self.memory_probabiliy) 44 | else: 45 | max_probability = small_epsilon 46 | 47 | if len(self.memory) < self.capacity: 48 | self.memory.append(Transition(state, next_state, action, reward, mask)) 49 | self.memory_probabiliy.append(max_probability) 50 | else: 51 | self.memory[self.position] = Transition(state, next_state, action, reward, mask) 52 | self.memory_probabiliy[self.position] = max_probability 53 | 54 | self.position = (self.position + 1) % self.capacity 55 | 56 | def sample(self, batch_size, net, target_net, beta): 57 | probability_sum = sum(self.memory_probabiliy) 58 | p = [probability / probability_sum for probability in self.memory_probabiliy] 59 | 60 | indexes = np.random.choice(np.arange(len(self.memory)), batch_size, p=p) 61 | transitions = [self.memory[idx] for idx in indexes] 62 | transitions_p = [p[idx] for idx in indexes] 63 | batch = Transition(*zip(*transitions)) 64 | 65 | weights = [pow(self.capacity * p_j, -beta) for p_j in transitions_p] 66 | weights = torch.Tensor(weights).to(device) 67 | weights = weights / weights.max() 68 | 69 | td_error = QNet.get_loss(net, target_net, batch.state, batch.next_state, batch.action, batch.reward, batch.mask) 70 | td_error = td_error.detach() 71 | 72 | td_error_idx = 0 73 | for idx in indexes: 74 | self.memory_probabiliy[idx] = pow(abs(td_error[td_error_idx]) + small_epsilon, alpha).item() 75 | td_error_idx += 1 76 | 77 | 78 | return batch, weights 79 | 80 | def __len__(self): 81 | return len(self.memory) 82 | -------------------------------------------------------------------------------- /rainbow/9-Rainbow/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import argparse 6 | import numpy as np 7 | 8 | import torch 9 | import torch.optim as optim 10 | import torch.nn.functional as F 11 | from tensorboardX import SummaryWriter 12 | 13 | from model import QNet 14 | from memory import Memory 15 | 16 | from config import env_name, initial_exploration, batch_size, update_target, goal_score, log_interval, device, replay_memory_capacity, lr, beta_start 17 | 18 | 19 | def get_action(state, target_net, epsilon, env): 20 | if np.random.rand() <= epsilon: 21 | return env.action_space.sample() 22 | else: 23 | return target_net.get_action(state) 24 | 25 | def update_target_model(online_net, target_net): 26 | # Target <- Net 27 | target_net.load_state_dict(online_net.state_dict()) 28 | 29 | 30 | 31 | def main(): 32 | env = gym.make(env_name) 33 | env.seed(500) 34 | torch.manual_seed(500) 35 | 36 | num_inputs = env.observation_space.shape[0] 37 | num_actions = env.action_space.n 38 | print('state size:', num_inputs) 39 | print('action size:', num_actions) 40 | 41 | online_net = QNet(num_inputs, num_actions) 42 | target_net = QNet(num_inputs, num_actions) 43 | update_target_model(online_net, target_net) 44 | 45 | optimizer = optim.Adam(online_net.parameters(), lr=lr) 46 | writer = SummaryWriter('logs') 47 | 48 | online_net.to(device) 49 | target_net.to(device) 50 | online_net.train() 51 | target_net.train() 52 | memory = Memory(replay_memory_capacity) 53 | running_score = 0 54 | epsilon = 1.0 55 | steps = 0 56 | beta = beta_start 57 | loss = 0 58 | 59 | for e in range(3000): 60 | done = False 61 | 62 | score = 0 63 | state = env.reset() 64 | state = torch.Tensor(state) 65 | state = state.unsqueeze(0) 66 | 67 | while not done: 68 | steps += 1 69 | action = get_action(state, target_net, epsilon, env) 70 | next_state, reward, done, _ = 
env.step(action) 71 | 72 | next_state = torch.Tensor(next_state) 73 | next_state = next_state.unsqueeze(0) 74 | 75 | mask = 0 if done else 1 76 | reward = reward if not done or score == 499 else -1 77 | action_one_hot = np.zeros(2) 78 | action_one_hot[action] = 1 79 | memory.push(state, next_state, action_one_hot, reward, mask) 80 | 81 | score += reward 82 | state = next_state 83 | 84 | if steps > initial_exploration: 85 | epsilon -= 0.00005 86 | epsilon = max(epsilon, 0.1) 87 | beta += 0.00005 88 | beta = min(1, beta) 89 | 90 | batch, weights = memory.sample(batch_size, online_net, target_net, beta) 91 | loss = QNet.train_model(online_net, target_net, optimizer, batch, weights) 92 | 93 | if steps % update_target == 0: 94 | update_target_model(online_net, target_net) 95 | 96 | score = score if score == 500.0 else score + 1 97 | running_score = 0.99 * running_score + 0.01 * score 98 | if e % log_interval == 0: 99 | print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format( 100 | e, running_score, epsilon)) 101 | writer.add_scalar('log/score', float(running_score), e) 102 | writer.add_scalar('log/loss', float(loss), e) 103 | 104 | if running_score > goal_score: 105 | break 106 | 107 | if __name__=="__main__": 108 | main() 109 | --------------------------------------------------------------------------------