├── .gitignore
├── README.md
├── requirements.txt
├── rltorch
│   ├── __init__.py
│   ├── agents
│   │   ├── __init__.py
│   │   ├── core.py
│   │   ├── ppo.py
│   │   └── random.py
│   ├── env.py
│   ├── layers.py
│   ├── memories.py
│   ├── processors
│   │   ├── __init__.py
│   │   ├── atari.py
│   │   └── core.py
│   └── runner.py
└── setup.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Compiled source #
###################
*.com
*.class
*.dll
*.exe
*.o
*.so
*.pyc
*.swp

# Packages #
############
# it's better to unpack these files and commit the raw source
# git has its own built in compression methods
*.7z
*.dmg
*.gz
*.iso
*.jar
*.rar
*.tar
*.zip
*.meta
*.index
*.ckpt*

# Logs and databases #
######################
.ipynb_checkpoints
*.log
*.sql
*.sqlite
/*.egg-info

# OS generated files #
######################
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db
dist
build
build/*
.idea
params
*/data/*
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# PPO PyTorch
Implementation of PPO with PyTorch

# Installation
To use this repository, install the package through `setup.py`:
```bash
python setup.py install
```

After installation, the package is available as `import rltorch`.

# Examples
```python
import gym

from rltorch import Runner
from rltorch.agents import PPOAgent
from rltorch.processors import AtariProcessor

env = gym.make('Breakout-v0').unwrapped

FRAME_WIDTH = 84
FRAME_HEIGHT = 84
WINDOW_LENGTH = 4
# state_shape = env.observation_space.shape
state_shape = (WINDOW_LENGTH, FRAME_WIDTH, FRAME_HEIGHT)
action_config = {'n_action': env.action_space.n, 'type': 'integer'}
processor = AtariProcessor(FRAME_WIDTH, FRAME_HEIGHT)

# Define agent
agent = PPOAgent(state_shape, action_config, processor=processor,
                 window_length=WINDOW_LENGTH, n_epochs=5,
                 lr=2.5e-4, entropy_coef=0.01, value_loss_coef=1,
                 num_frames_per_proc=128)

# Define execution
runner = Runner(env, agent, num_workers=4, multi=True)

# Start running
optimized_agent = runner.simulate(training=True, notebook=True, render_freq=4)
```
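The repository does not ship a save/load helper for trained agents. A minimal sketch using plain PyTorch serialization (the checkpoint filename is arbitrary; `ac_model` is the actor-critic network that `PPOAgent` builds):

```python
import torch

# Persist the trained actor-critic network from the example above
torch.save(optimized_agent.ac_model.state_dict(), 'ppo_breakout.pt')

# Later: rebuild an agent with the same configuration and restore the weights
agent.ac_model.load_state_dict(torch.load('ppo_breakout.pt'))
agent.ac_model.eval()
```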
# References
### Implementation
* [pytorch-a3c](https://github.com/ikostrikov/pytorch-a3c)
* [baselines](https://github.com/openai/baselines)

### Theory
* [Proximal Policy Optimization Algorithms](https://arxiv.org/pdf/1707.06347.pdf)
* [Asynchronous Methods for Deep Reinforcement Learning](https://arxiv.org/pdf/1602.01783.pdf)
* [High-Dimensional Continuous Control Using Generalized Advantage Estimation](https://arxiv.org/pdf/1506.02438.pdf)
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
gym
numpy
scikit-image
six
tensorboardX
torch
torchvision
tqdm
--------------------------------------------------------------------------------
/rltorch/__init__.py:
--------------------------------------------------------------------------------
from .runner import Runner
--------------------------------------------------------------------------------
/rltorch/agents/__init__.py:
--------------------------------------------------------------------------------
from .random import RandomAgent
from .ppo import PPOAgent
--------------------------------------------------------------------------------
/rltorch/agents/core.py:
--------------------------------------------------------------------------------
from abc import ABC, abstractmethod
import os
import shutil
from collections import defaultdict, deque
from functools import partial

import numpy as np
from tensorboardX import SummaryWriter
import torch
import torch.optim as optim

from ..memories import ACMemory


class BaseAgent(ABC):
    """Abstract class for Agent

    Inherit from this class to implement a concrete agent.

    Parameters
    ----------
    state_shape: array-like
        The shape of input
    action_config: dict
        Configuration of action space, which may include type, shape, etc.
    processor: class
        Processor instance that transforms raw observations into the models' input format
    reward_reshape: callable, optional
        Function applied to raw rewards before they are stored
    smooth_length: int
        The length to smooth data before recording
    log_dir: str
        Directory to store the record for tensorboard
    """
    def __init__(self, state_shape, action_config, processor=None, reward_reshape=None,
                 smooth_length=100, log_dir='./logs'):
        super(BaseAgent, self).__init__()
        self.state_shape = state_shape
        self.action_config = action_config
        self.processor = processor
        self.reward_reshape = reward_reshape
        self.smooth_length = smooth_length
        # Delete old logs if any
        if os.path.isdir(log_dir):
            print('Delete old tensorboard log')
            shutil.rmtree(log_dir)
        self.writer = SummaryWriter(log_dir)
        self.record_step = 0
        self.episode_step = 0

    @abstractmethod
    def predict(self, *args, **kwargs):
        raise NotImplementedError

    def record(self, *args, **kwargs):
        pass

    def observe(self, *args, **kwargs):
        pass

    def fit(self, *args, **kwargs):
        pass


class ACAgent(BaseAgent):
    """Actor Critic Base Agent

    Parameters
    ----------
    state_shape: array-like
        The shape of input
    action_config: dict
        Configuration of action space, which may include type, shape, etc.
    processor: class
        Processor instance that transforms raw observations into the models' input format
    reward_reshape: callable, optional
        Function applied to raw rewards before they are stored
    smooth_length: int
        The length to smooth data before recording
    log_dir: str
        Directory to store the record for tensorboard
    window_length: int
        The number of recent frames stacked into one state
    lr: float
        Learning rate of the optimizer
    model_config: dict
        Configuration passed to ``build_model``
    action_dist: torch.distributions object
        The distribution for action
    discount: float
        Discount Factor
    gae_lambda: float
        GAE parameter for trace eligibility
    num_frames_per_proc: int
        The number of frames collected per worker before each update
    batch_size: int
        Mini-batch size used when fitting the model
    entropy_coef: float
        Weight of the entropy bonus in the total loss
    value_loss_coef: float
        Weight of the critic loss in the total loss
    max_grad_norm: float
        Maximum gradient norm used for gradient clipping
    """

    def __init__(self, state_shape, action_config, processor,
                 reward_reshape, smooth_length, log_dir,
                 window_length, lr, model_config,
                 action_dist, discount, gae_lambda, num_frames_per_proc,
                 batch_size,
                 entropy_coef, value_loss_coef, max_grad_norm):
        super(ACAgent, self).__init__(state_shape, action_config, processor,
                                      reward_reshape,
                                      smooth_length, log_dir)
        self.window_length = window_length
        self.action_dist = action_dist
        self.discount = discount
        self.gae_lambda = gae_lambda
        self.num_frames_per_proc = num_frames_per_proc
        self.batch_size = batch_size
        self.entropy_coef = entropy_coef
        self.value_loss_coef = value_loss_coef
        self.max_grad_norm = max_grad_norm
        # Multi Agent Memory
        self.memory = ACMemory(num_frames_per_proc, window_length)
        # Set device
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        # Build Network on the selected device
        self.ac_model = self.build_model(model_config).to(self.device)
        # Build optimizer
        self.parameters = list(self.ac_model.parameters())
        self.optimizer = optim.Adam(self.parameters, lr=lr)
        # Record parameters
        self.episode_steps = defaultdict(lambda: 0)
        mydeque = partial(deque, maxlen=self.smooth_length)
        self.reward_record = defaultdict(mydeque)
        self.loss_record = deque(maxlen=self.smooth_length)
        self.actor_loss_record = deque(maxlen=self.smooth_length)
        self.critic_loss_record = deque(maxlen=self.smooth_length)
        self.entropy_record = deque(maxlen=self.smooth_length)
        self.ep_rewards = defaultdict(list)
        self.ep_actions = defaultdict(list)

    @abstractmethod
    def build_model(self, config):
        raise NotImplementedError

    def _calc_dim(self, model):
        x = torch.randn([1] + list(self.state_shape))
        x = model(x)
        return x.size(-1)

    def predict(self, obs, training=True):
        if self.processor is not None:
            obs = [self.processor.process(obs_i) for obs_i in obs]
        state = self.memory.get_recent_state(obs)
        state_tensor = torch.tensor(state, dtype=torch.float,
                                    device=self.device)
        dist, value = self.ac_model(state_tensor)
        action = dist.sample()
        if training:
            log_prob = dist.log_prob(action)
            entropy = dist.entropy()
            self.memory.store_value_log_prob(value, log_prob, entropy)
        return action.cpu().numpy()

    def observe(self, obs, action, reward, terminal, info, training=True):
        if self.processor is not None:
            obs = [self.processor.process(obs_i) for obs_i in obs]
        self.memory.append(obs, action, reward, terminal, training)
        self.record(action, reward, terminal)

    def record(self, action, reward, terminal):
        n_workers = len(action)
        for i in range(n_workers):
            self.reward_record[i].append(reward[i])
            self.ep_rewards[i].append(reward[i])
            self.ep_actions[i].append(action[i])
            if terminal[i]:
                ep_sum_reward = np.sum(self.ep_rewards[i])
                self.writer.add_scalar(f'data/episode_reward_sum_{i}',
                                       ep_sum_reward,
                                       self.episode_steps[i])

                self.writer.add_histogram(f'data/episode_action_{i}',
                                          np.array(self.ep_actions[i]),
                                          self.episode_steps[i], bins='auto')

                self.writer.add_histogram(f'data/episode_reward_dist_{i}',
                                          np.array(self.ep_rewards[i]),
                                          self.episode_steps[i], bins='auto')

                # Reset record
                self.episode_steps[i] += 1
                self.ep_rewards[i] = []
                self.ep_actions[i] = []
        self.record_step += 1

    def set_new_obs(self, new_obs):
        if self.processor is not None:
            new_obs = [self.processor.process(obs_i) for obs_i in new_obs]
        self.new_obs = new_obs

    def get_newest_state(self):
        return self.memory.get_recent_state(self.new_obs)

    def aggregate_experiences(self):
        experiences = self.memory.sample()
        rewards = torch.tensor(np.array(experiences.reward),
                               dtype=torch.float,
                               device=self.device)
        masks = torch.tensor(1. - np.array(experiences.terminal, dtype=float),
                             device=self.device,
                             dtype=torch.float)
        # Keep the computation graph of the stored values, log-probs and entropies
        values = torch.stack(experiences.value).to(self.device)
        values = torch.sum(values, -1)
        log_probs = torch.stack(experiences.log_prob).to(self.device)

        entropies = torch.stack(experiences.entropy).to(self.device)

        T = len(rewards)
        # Get delta
        deltas = []
        for t in range(T - 1):
            target = (rewards[t]
                      + self.discount * values[t + 1] * masks[t]).detach()
            delta = target - values[t]
            deltas.append(delta)
        # Estimate with the newest value
        new_state = torch.tensor(self.get_newest_state(),
                                 dtype=torch.float,
                                 device=self.device)
        new_value = self.ac_model(new_state)[1].sum(-1)
        new_target = (rewards[-1]
                      + self.discount * new_value * masks[-1]).detach()
        new_delta = new_target - values[-1]
        deltas.append(new_delta)

        # Calculate advantage from deltas
        decay_rate = self.discount * self.gae_lambda
        advs = []
        for t_st in range(T):
            adv = 0.
            power = 0.
            for t in range(t_st, T):
                adv += deltas[t] * (decay_rate ** power)
                power += 1.
            advs.append(adv)
        advs = torch.stack(advs)
        self.memory.reset()
        return advs, log_probs, entropies
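The nested loop above computes every GAE advantage independently, which is O(T^2) in the rollout length. The same quantities are conventionally obtained with a single backward recursion; a minimal sketch of that equivalent form (a standalone helper, not part of this module, taking `deltas` and `masks` as built in `aggregate_experiences`):

```python
import torch


def gae_advantages(deltas, masks, discount, gae_lambda):
    """Backward GAE recursion: A_t = delta_t + discount * gae_lambda * mask_t * A_{t+1}."""
    advs = []
    adv = torch.zeros_like(deltas[-1])
    for delta, mask in zip(reversed(deltas), reversed(masks)):
        adv = delta + discount * gae_lambda * mask * adv
        advs.append(adv)
    advs.reverse()
    return torch.stack(advs)
```

Besides the lower cost, the `mask` factor stops the accumulation at episode boundaries, which the quadratic loop above does not do.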
--------------------------------------------------------------------------------
/rltorch/agents/ppo.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.distributions as dist
import numpy as np

from .core import ACAgent
from ..layers import Flatten, ACModel


class PPOAgent(ACAgent):
    """Proximal Policy Optimization

    Parameters
    ----------
    n_epochs: int
        The number of passes over the collected data after each rollout
    clip_eps: float
        Clip parameter for the surrogate loss
    """
    def __init__(self, state_shape, action_config, processor=None,
                 reward_reshape=None, smooth_length=100, log_dir='./logs',
                 window_length=4, lr=7e-4, model_config=None,
                 action_dist=dist.Categorical, discount=0.99, gae_lambda=0.95,
                 num_frames_per_proc=32, batch_size=32, entropy_coef=0.01,
                 value_loss_coef=0.2,
                 max_grad_norm=None, clip_eps=0.2, n_epochs=4):

        super(PPOAgent, self).__init__(state_shape, action_config, processor,
                                       reward_reshape, smooth_length, log_dir,
                                       window_length, lr, model_config,
                                       action_dist, discount,
                                       gae_lambda, num_frames_per_proc,
                                       batch_size,
                                       entropy_coef, value_loss_coef,
                                       max_grad_norm)
        self.clip_eps = clip_eps
        self.n_epochs = n_epochs

    def fit(self, *args, **kwargs):
        if self.memory.nb_states < self.num_frames_per_proc:
            return
        advs, log_probs, entropies = self.aggregate_experiences()
        T = advs.size(0)
        for epoch in range(self.n_epochs):
            for idx in range(T // self.batch_size):
                t_st = idx * self.batch_size
                t_end = (idx + 1) * self.batch_size
                batch_advs = advs[t_st:t_end].view(-1)
                batch_log_probs = log_probs[t_st:t_end].view(-1)
                batch_entropies = entropies[t_st:t_end].view(-1)
                # Actor Training
                old_log_probs = batch_log_probs.detach()
                ratio = torch.exp(batch_log_probs - old_log_probs)
                # Use advantage as constants when optimizing the policy
                advs_const = batch_advs.detach()
                surr1 = ratio * advs_const
                surr2 = torch.clamp(ratio, 1.0 - self.clip_eps,
                                    1.0 + self.clip_eps) * advs_const
                actor_loss = -torch.min(surr1, surr2).mean()
                # Critic Training
                batch_critic_loss = (batch_advs ** 2).mean()
                # Entropy regularization
                batch_entropy = batch_entropies.mean()
                # Total Loss
                loss = actor_loss \
                    + self.value_loss_coef * batch_critic_loss \
                    - self.entropy_coef * batch_entropy
                # Optimize Model
                self.optimizer.zero_grad()
                # Need to keep intermediate results for multiple loops
                loss.backward(retain_graph=True)
                # Clip Gradient
                if self.max_grad_norm is not None:
                    nn.utils.clip_grad_norm_(self.parameters, self.max_grad_norm)
                self.optimizer.step()
                self.loss_record.append(loss.item())
                self.actor_loss_record.append(actor_loss.item())
                self.critic_loss_record.append(batch_critic_loss.item())
                self.entropy_record.append(batch_entropy.item())
        self.writer.add_scalar('data/loss', np.mean(self.loss_record),
                               self.record_step)
        self.writer.add_scalar('data/actor_loss', np.mean(self.actor_loss_record),
                               self.record_step)
        self.writer.add_scalar('data/critic_loss', np.mean(self.critic_loss_record),
                               self.record_step)
        self.writer.add_scalar('data/entropy', np.mean(self.entropy_record),
                               self.record_step)
        for key in self.reward_record.keys():
            self.writer.add_scalar(f'data/reward_{key}',
                                   np.mean(self.reward_record[key]),
                                   self.record_step)

    def build_model(self, config=None):
        # Share layer
        model = nn.Sequential()
        in_features = self.state_shape[0]
        model.add_module('conv1', nn.Conv2d(in_features, 32, 8, stride=4))
        model.add_module('relu1', nn.ReLU())
        model.add_module('conv2', nn.Conv2d(32, 64, 4, stride=2))
        model.add_module('relu2', nn.ReLU())
        model.add_module('conv3', nn.Conv2d(64, 64, 3, stride=1))
        model.add_module('relu3', nn.ReLU())
        model.add_module('flatten', Flatten())
        # Calculate dimension after passing test data
        dim = self._calc_dim(model)
        # Fully connected layers
        model.add_module('fc1', nn.Linear(dim, 512))
        model.add_module('relu4', nn.ReLU())

        # Actor layer
        actor_model = nn.Sequential()
        actor_model.add_module('actor_fc', nn.Linear(512, self.action_config['n_action']))
        actor_model.add_module('softmax', nn.Softmax(dim=-1))

        # Value layer
        value_model = nn.Sequential()
        value_model.add_module('value_fc', nn.Linear(512, 1))
        # Combine all models
        ac_model = ACModel(model, actor_model, value_model, self.action_dist)
        return ac_model
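Note that in `fit` above, `old_log_probs` is detached from the very tensor that supplies `batch_log_probs`, so the probability ratio evaluates to exactly 1 in every epoch and the clipping term never becomes active. The textbook PPO update keeps the rollout-time log-probabilities fixed and re-evaluates the current policy on the stored states each epoch; a minimal sketch of that ratio term (a hypothetical helper, not part of this repository):

```python
import torch


def clipped_surrogate(new_log_probs, old_log_probs, advantages, clip_eps=0.2):
    """Standard PPO-clip actor loss.

    ``old_log_probs`` are the log-probabilities recorded during the rollout and
    treated as constants; ``new_log_probs`` come from re-running the current
    policy on the same states after every optimizer step.
    """
    ratio = torch.exp(new_log_probs - old_log_probs.detach())
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantages
    return -torch.min(surr1, surr2).mean()
```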
--------------------------------------------------------------------------------
/rltorch/agents/random.py:
--------------------------------------------------------------------------------
import numpy as np

from .core import BaseAgent


class RandomAgent(BaseAgent):
    def __init__(self, state_shape, action_config):
        super(RandomAgent, self).__init__(state_shape, action_config)

    def predict(self, observation, training=True):
        if self.action_config['type'] == 'integer':
            shape = self.action_config.get('shape', None)
            n_action = self.action_config['n_action']
            actions = np.random.randint(0, n_action, shape)
        else:
            raise NotImplementedError
        return actions
--------------------------------------------------------------------------------
/rltorch/env.py:
--------------------------------------------------------------------------------
from multiprocessing import Process, Pipe

import gym


def worker(conn, env):
    """Execute cmd sent from remote process

    Parameters
    ----------
    conn: multiprocess.Connection instance
        Supposed to receive command and data from the remote process
    env: gym.Env instance
    """
    try:
        while True:
            cmd, data = conn.recv()
            if cmd == 'step':
                obs, reward, done, info = env.step(data)
                if done:
                    obs = env.reset()
                conn.send((obs, reward, done, info))
            elif cmd == 'reset':
                obs = env.reset()
                conn.send(obs)
            elif cmd == 'render':
                env.render()
            elif cmd == 'close':
                env.close()
                conn.close()
                break
            else:
                raise NotImplementedError
    except KeyboardInterrupt:
        print("KeyboardInterrupt")
    finally:
        env.close()


class ParallelEnv(gym.Env):
    """A concurrent execution of environments in multiple processes

    Parameters
    ----------
    envs: list(gym.Env)
        The list of the same gym environment
    """

    def __init__(self, envs):
        assert len(envs) >= 1, 'No environment given'

        self.envs = envs
        self.observation_space = self.envs[0].observation_space
        self.action_space = self.envs[0].action_space

        # Only the index 0 environment runs in the main (non-daemon) process
        self.locals = []
        for env in self.envs[1:]:
            local, remote = Pipe()
            self.locals.append(local)
            p = Process(target=worker, args=(remote, env))
            # Run each remote worker as a daemon process
            p.daemon = True
            p.start()
            remote.close()

    def reset(self):
        for local in self.locals:
            local.send(('reset', None))
        results = [self.envs[0].reset()] + [local.recv() for local in self.locals]
        return results

    def step(self, actions):
        for local, action in zip(self.locals, actions[1:]):
            local.send(('step', action))
        # 0 index process
        obs, reward, done, info = self.envs[0].step(actions[0])
        if done:
            obs = self.envs[0].reset()
        # results = [(obs_1, obs_2, .., obs_n), ..., (info_1, info_2, ..., info_n)]
        results = zip(*[(obs, reward, done, info)] + [local.recv() for local in self.locals])
        return results

    def close(self):
        for local in self.locals:
            local.send(('close', None))
            local.close()
        self.envs[0].close()

    def render(self, render_all=False):
        if render_all:
            for local in self.locals:
                local.send(('render', None))
            self.envs[0].render()
        else:
            self.envs[0].render()
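A quick way to exercise `ParallelEnv` on its own (a sketch, not part of the repository; it assumes the classic 4-tuple `gym` step API used throughout this codebase, and on platforms that spawn subprocesses it must run under an `if __name__ == '__main__':` guard):

```python
import gym

from rltorch.env import ParallelEnv

if __name__ == '__main__':
    envs = [gym.make('CartPole-v1') for _ in range(4)]
    penv = ParallelEnv(envs)

    obs = penv.reset()                                    # list of 4 observations
    actions = [penv.action_space.sample() for _ in envs]  # one action per worker
    obs, rewards, dones, infos = penv.step(actions)       # step() returns zipped tuples
    penv.close()
```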
--------------------------------------------------------------------------------
/rltorch/layers.py:
--------------------------------------------------------------------------------
import torch.nn as nn


class Flatten(nn.Module):
    def forward(self, x):
        x = x.view(x.size()[0], -1)
        return x


class ACModel(nn.Module):
    def __init__(self, share_model, actor_model, value_model, action_dist=None):
        super(ACModel, self).__init__()
        self.share_model = share_model
        self.actor_model = actor_model
        self.value_model = value_model
        self.action_dist = action_dist

    def forward(self, x):
        x = self.share_model(x)
        action = self.actor_model(x)
        value = self.value_model(x)
        if self.action_dist is not None:
            action = self.action_dist(action)
        return action, value
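A toy check of the `(distribution, value)` contract that `ACModel` exposes, using dummy linear heads over a flat 4-dimensional observation (illustrative shapes only, not part of the repository):

```python
import torch
import torch.nn as nn
import torch.distributions as dist

from rltorch.layers import ACModel

share = nn.Sequential(nn.Linear(4, 16), nn.ReLU())
actor = nn.Sequential(nn.Linear(16, 2), nn.Softmax(dim=-1))
critic = nn.Linear(16, 1)
model = ACModel(share, actor, critic, action_dist=dist.Categorical)

pi, value = model(torch.randn(3, 4))  # batch of 3 observations
actions = pi.sample()                 # tensor of shape (3,) with values in {0, 1}
print(actions.shape, value.shape)     # torch.Size([3]) torch.Size([3, 1])
```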
--------------------------------------------------------------------------------
/rltorch/memories.py:
--------------------------------------------------------------------------------
from collections import deque, namedtuple
import warnings
import random
from six.moves import xrange

import numpy as np

# This is to be understood as a transition: Given `state0`, performing `action`
# yields `reward` and results in `state1`, which might be `terminal`.
Experience = namedtuple('Experience',
                        'state0, action, reward, state1, terminal1')

ACExperience = namedtuple('ACExperience',
                          'action, reward, terminal, value, log_prob, entropy')


def sample_batch_indexes(low, high, size):
    """Return a sample of (size) unique elements between low and high
    # Argument
        low (int): The minimum value for our samples
        high (int): The maximum value for our samples
        size (int): The number of samples to pick
    # Returns
        A list of samples of length size, with values between low and high
    """
    if high - low >= size:
        # We have enough data. Draw without replacement, that is each index is unique in the
        # batch. We cannot use `np.random.choice` here because it is horribly inefficient as
        # the memory grows. See https://github.com/numpy/numpy/issues/2764 for a discussion.
        # `random.sample` does the same thing (drawing without replacement) and is way faster.
        r = xrange(low, high)
        batch_idxs = random.sample(r, size)
    else:
        # Not enough data. Help ourselves with sampling from the range, but the same index
        # can occur multiple times. This is not good and should be avoided by picking a
        # large enough warm-up phase.
        warnings.warn(
            'Not enough entries to sample without replacement. Consider increasing your warm-up phase to avoid oversampling!')
        batch_idxs = np.random.randint(low, high, size=size)
    assert len(batch_idxs) == size
    return batch_idxs


class RingBuffer(object):
    """Erase the oldest memory after reaching maxlen

    Parameters
    ----------
    maxlen: int
        The maximum number of memory entries
    """
    def __init__(self, maxlen):
        self.maxlen = maxlen
        self.data = deque(maxlen=maxlen)

    def __len__(self):
        return self.length()

    def __getitem__(self, idx):
        """Return element of buffer at specific index
        # Argument
            idx (int): Index wanted
        # Returns
            The element of buffer at given index
        """
        if idx < 0 or idx >= self.length():
            raise KeyError()
        return self.data[idx]

    def append(self, v):
        """Append an element to the buffer
        # Argument
            v (object): Element to append
        """
        self.data.append(v)

    def length(self):
        """Return the length of the deque
        # Argument
            None
        # Returns
            The length of the deque
        """
        return len(self.data)


def zeroed_observation(observation):
    """Return an array of zeros with same shape as given observation
    # Argument
        observation (list): List of observation

    # Return
        A np.ndarray of zeros with observation.shape
    """
    if hasattr(observation, 'shape'):
        return np.zeros(observation.shape)
    elif hasattr(observation, '__iter__'):
        out = []
        for x in observation:
            out.append(zeroed_observation(x))
        return out
    else:
        return 0.
class Memory(object):
    """Base class for memory

    Parameters
    ----------
    window_length: int
        The number of observations stacked into one state
    ignore_episode_boundaries: bool
        If False, episode boundaries (terminal flags) are respected when
        sampling or building the most recent state
    """
    def __init__(self, window_length, ignore_episode_boundaries=False):
        self.window_length = window_length
        self.ignore_episode_boundaries = ignore_episode_boundaries

        self.recent_observations = deque(maxlen=window_length)
        self.recent_terminals = deque(maxlen=window_length)

    def sample(self, batch_size, batch_idxs=None):
        raise NotImplementedError()

    def append(self, observation, action, reward, terminal, training=True):
        # We do not store the final state
        self.recent_observations.append(observation)
        self.recent_terminals.append(terminal)

    def get_recent_state(self, current_observation):
        """Return list of last observations

        Parameters
        ----------
        current_observation: array-like
            Last observation

        Returns
        -------
        A list of the last observations
        """
        # This code is slightly complicated by the fact that subsequent observations might be
        # from different episodes. We ensure that an experience never spans multiple episodes.
        # This is probably not that important in practice but it seems cleaner.
        state = [current_observation]
        idx = len(self.recent_observations) - 1
        for offset in range(0, self.window_length - 1):
            current_idx = idx - offset
            # Order: observation => action => (reward, terminal, info)
            if current_idx >= 0:
                current_terminal = self.recent_terminals[current_idx]
            else:
                break
            if not self.ignore_episode_boundaries and current_terminal:
                # The previously handled observation was terminal, don't add the current one.
                # Otherwise we would leak into a different episode.
                break
            state.insert(0, self.recent_observations[current_idx])
        while len(state) < self.window_length:
            state.insert(0, zeroed_observation(state[0]))
        state = np.concatenate(state, 0)
        return state

    def get_config(self):
        """Return configuration (window_length, ignore_episode_boundaries) for Memory

        # Return
            A dict with keys window_length and ignore_episode_boundaries
        """
        config = {
            'window_length': self.window_length,
            'ignore_episode_boundaries': self.ignore_episode_boundaries,
        }
        return config


class SequentialMemory(Memory):
    def __init__(self, limit, **kwargs):
        super(SequentialMemory, self).__init__(**kwargs)

        self.limit = limit

        # Do not use deque to implement the memory. This data structure may seem convenient but
        # it is way too slow on random access. Instead, we use our own ring buffer implementation.
        self.actions = RingBuffer(limit)
        self.rewards = RingBuffer(limit)
        self.terminals = RingBuffer(limit)
        self.observations = RingBuffer(limit)

    def sample(self, batch_size, batch_idxs=None):
        """Return a randomized batch of experiences
        # Argument
            batch_size (int): Size of the batch
            batch_idxs (int): Indexes to extract
        # Returns
            A list of randomly selected experiences
        """
        # It is not possible to tell whether the first state in the memory is terminal, because it
        # would require access to the "terminal" flag associated to the previous state. As a result
        # we will never return this first state (only using `self.terminals[0]` to know whether the
        # second state is terminal).
        # In addition we need enough entries to fill the desired window length.
        assert self.nb_entries >= self.window_length + 2, 'not enough entries in the memory'

        if batch_idxs is None:
            # Draw random indexes such that we have enough entries before each index to fill the
            # desired window length.
            batch_idxs = sample_batch_indexes(
                self.window_length, self.nb_entries - 1, size=batch_size)
        batch_idxs = np.array(batch_idxs) + 1
        assert np.min(batch_idxs) >= self.window_length + 1
        assert np.max(batch_idxs) < self.nb_entries
        assert len(batch_idxs) == batch_size

        # Create experiences
        experiences = []
        for idx in batch_idxs:
            terminal0 = self.terminals[idx - 2]
            while terminal0:
                # Skip this transition because the environment was reset here. Select a new, random
                # transition and use this instead. This may cause the batch to contain the same
                # transition twice.
                idx = sample_batch_indexes(self.window_length + 1, self.nb_entries,
                                           size=1)[0]
                terminal0 = self.terminals[idx - 2]
            assert self.window_length + 1 <= idx < self.nb_entries

            # This code is slightly complicated by the fact that subsequent observations might be
            # from different episodes. We ensure that an experience never spans multiple episodes.
            # This is probably not that important in practice but it seems cleaner.
            state0 = [self.observations[idx - 1]]
            for offset in range(0, self.window_length - 1):
                current_idx = idx - 2 - offset
                assert current_idx >= 1
                current_terminal = self.terminals[current_idx - 1]
                if current_terminal and not self.ignore_episode_boundaries:
                    # The previously handled observation was terminal, don't add the current one.
                    # Otherwise we would leak into a different episode.
                    break
                state0.insert(0, self.observations[current_idx])
            while len(state0) < self.window_length:
                state0.insert(0, zeroed_observation(state0[0]))
            action = self.actions[idx - 1]
            reward = self.rewards[idx - 1]
            terminal1 = self.terminals[idx - 1]

            # Okay, now we need to create the follow-up state. This is state0 shifted one timestep
            # to the right. Again, we need to be careful to not include an observation from the next
            # episode if the last state is terminal.
            state1 = [np.copy(x) for x in state0[1:]]
            state1.append(self.observations[idx])
            state0 = np.concatenate(state0, 0)
            state1 = np.concatenate(state1, 0)

            assert len(state0) == self.window_length
            assert len(state1) == len(state0)
            experiences.append(
                Experience(state0=state0, action=action, reward=reward,
                           state1=state1, terminal1=terminal1))
        assert len(experiences) == batch_size
        return experiences

    def append(self, observation, action, reward, terminal, training=True):
        """Append an observation to the memory
        # Argument
            observation (dict): Observation returned by environment
            action (int): Action taken to obtain this observation
            reward (float): Reward obtained by taking this action
            terminal (boolean): Is the state terminal
        """
        super(SequentialMemory, self).append(observation, action, reward,
                                             terminal, training=training)

        # This needs to be understood as follows: in `observation`, take `action`, obtain `reward`
        # and whether the next state is `terminal` or not.
        if training:
            self.observations.append(observation)
            self.actions.append(action)
            self.rewards.append(reward)
            self.terminals.append(terminal)
    @property
    def nb_entries(self):
        """Return number of observations

        Returns
        -------
        The number of observations
        """
        return len(self.rewards)

    @property
    def nb_states(self):
        """Return number of usable states

        Returns
        -------
        The number of usable states
        """
        return len(self.observations) - self.window_length + 1

    def get_config(self):
        """Return configurations of SequentialMemory

        Returns
        -------
        Dict of Config
        """
        config = super(SequentialMemory, self).get_config()
        config['limit'] = self.limit
        return config


class ACMemory(SequentialMemory):
    def __init__(self, num_frames_per_proc, window_length, **kwargs):
        limit = num_frames_per_proc + window_length - 1
        super(ACMemory, self).__init__(limit, window_length=window_length, **kwargs)
        self.limit = limit
        self.num_frames_per_proc = num_frames_per_proc
        self.reset()

    def get_recent_state(self, current_observation):
        """Return the stacked recent states of all workers

        Parameters
        ----------
        current_observation: tuple(array-like)
            Each element corresponds to the observation of one worker

        Returns
        -------
        Array of the last observations of all workers
        """
        state_list = []
        n_workers = len(current_observation)
        for worker_idx in range(n_workers):
            state = self._get_recent_state(current_observation, worker_idx)
            state_list.append(state)
        return np.stack(state_list)

    def _get_recent_state(self, current_observation, worker_idx):
        state = [current_observation[worker_idx]]
        idx = len(self.recent_observations) - 1
        for offset in range(0, self.window_length - 1):
            current_idx = idx - offset
            if current_idx >= 0:
                current_terminal = self.recent_terminals[current_idx][worker_idx]
            else:
                break
            if not self.ignore_episode_boundaries and current_terminal:
                break
            state.insert(0, self.recent_observations[current_idx][worker_idx])
        while len(state) < self.window_length:
            state.insert(0, zeroed_observation(state[0]))
        state = np.concatenate(state, 0)
        return state

    def store_value_log_prob(self, value, log_prob, entropy):
        self.values.append(value)
        self.log_probs.append(log_prob)
        self.entropies.append(entropy)

    def sample(self):
        return ACExperience(action=self.actions[-self.num_frames_per_proc:],
                            reward=self.rewards[-self.num_frames_per_proc:],
                            terminal=self.terminals[-self.num_frames_per_proc:],
                            value=self.values[-self.num_frames_per_proc:],
                            log_prob=self.log_probs[-self.num_frames_per_proc:],
                            entropy=self.entropies[-self.num_frames_per_proc:])

    def reset(self):
        self.actions = list()
        self.rewards = list()
        self.terminals = list()
        self.observations = list()
        self.values = list()
        self.log_probs = list()
        self.entropies = list()

    def get_config(self):
        """Return configurations of ACMemory

        Returns
        -------
        Dict of Config
        """
        config = super(ACMemory, self).get_config()
        config['num_frames_per_proc'] = self.num_frames_per_proc
        return config
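A toy illustration of the per-worker frame stacking that `ACMemory.get_recent_state` performs (illustrative shapes only: two workers, single-channel 84x84 frames, `window_length=4`):

```python
import numpy as np

from rltorch.memories import ACMemory

memory = ACMemory(num_frames_per_proc=8, window_length=4)

# One processed frame per worker, as produced by AtariProcessor
prev_obs = [np.zeros((1, 84, 84), dtype=np.uint8) for _ in range(2)]
memory.append(prev_obs, action=[0, 1], reward=[0.0, 0.0], terminal=[False, False])

new_obs = [np.ones((1, 84, 84), dtype=np.uint8) for _ in range(2)]
state = memory.get_recent_state(new_obs)
print(state.shape)  # (2, 4, 84, 84): workers x window x height x width
```

Frames older than what the memory holds are padded with zeros, so the stacked state always has `window_length` frames per worker.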
--------------------------------------------------------------------------------
/rltorch/processors/__init__.py:
--------------------------------------------------------------------------------
from .atari import AtariProcessor
--------------------------------------------------------------------------------
/rltorch/processors/atari.py:
--------------------------------------------------------------------------------
import numpy as np
from skimage.color import rgb2gray
from skimage.transform import resize

from .core import BaseProcessor


class AtariProcessor(BaseProcessor):
    def __init__(self, frame_width=84, frame_height=84):
        self.frame_width = frame_width
        self.frame_height = frame_height
        self.last_observation = None

    def process(self, observation):
        if self.last_observation is not None:
            observation = np.maximum(observation, self.last_observation)
        # Grayscale, resize, and rescale back to the 0-255 uint8 range
        observation = np.uint8(resize(rgb2gray(observation),
                                      (self.frame_width, self.frame_height)) * 255)
        return observation.reshape((1, self.frame_width, self.frame_height))
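A quick check of `AtariProcessor` in isolation (a sketch, not part of the repository; it assumes the classic `gym` Atari API where `reset()` returns a raw 210x160x3 RGB frame):

```python
import gym

from rltorch.processors import AtariProcessor

env = gym.make('Breakout-v0')
frame = env.reset()                      # raw RGB frame, shape (210, 160, 3)
processor = AtariProcessor(84, 84)
processed = processor.process(frame)
print(processed.shape, processed.dtype)  # (1, 84, 84) uint8
```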
--------------------------------------------------------------------------------
/rltorch/processors/core.py:
--------------------------------------------------------------------------------
from abc import ABC, abstractmethod


class BaseProcessor(ABC):
    @abstractmethod
    def process(self, observation):
        raise NotImplementedError
--------------------------------------------------------------------------------
/rltorch/runner.py:
--------------------------------------------------------------------------------
from tqdm import tqdm_notebook, tqdm
from copy import deepcopy

from .env import ParallelEnv


class Runner(object):
    """Class to run an agent in a given environment, optionally with parallel workers

    Parameters
    ----------
    env: gym.Env
        OpenAI gym environment
    agent: agent instance
    num_workers: int
        The number of environments to run in parallel
    multi: bool
        If True, use the parallel environment even if num_workers == 1
    """
    def __init__(self, env, agent, num_workers=1, multi=False):
        self.agent = agent
        self.env0 = env
        # Parallelize environment
        if multi or num_workers > 1:
            env = [deepcopy(env) for _ in range(num_workers)]
            self.env = ParallelEnv(env)
        else:
            self.env = env

    def simulate(self, n_frames=1e6, training=True, render_freq=0,
                 notebook=False, render_all=False):
        """Run agent

        Parameters
        ----------
        n_frames: int
            The number of frames to run
        training: bool
            If True, execute in training mode
        render_freq: int, (default 0)
            If 0, the environment is not rendered
        notebook: bool
            If True, use the tqdm iterator for notebooks
        render_all: bool
            If True, render every worker environment instead of only the first

        Returns
        -------
        Optimized agent
        """
        n_frames = int(n_frames)
        if notebook:
            iteration = tqdm_notebook(range(n_frames))
        else:
            iteration = tqdm(range(n_frames))
        obs = self.env.reset()
        for step in iteration:
            action = self.agent.predict(obs, training=training)
            new_obs, reward, terminal, info = self.env.step(action)
            self.agent.observe(obs, action, reward, terminal, info,
                               training=training)
            if hasattr(self.agent, 'set_new_obs'):
                self.agent.set_new_obs(new_obs)
            self.agent.fit()
            obs = new_obs
            if render_freq > 0 and step % render_freq == 0:
                if isinstance(self.env, ParallelEnv):
                    self.env.render(render_all)
                else:
                    self.env.render()
        return self.agent
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages


setup(
    name="rltorch",
    version='0.1',
    description='Reinforcement Learning by PyTorch',
    author='jjakimoto',
    author_email='f.j.akimoto@gmail.com',
    packages=find_packages(),
)
--------------------------------------------------------------------------------