├── .gitignore
├── README.md
├── requirements.txt
├── rltorch
│   ├── __init__.py
│   ├── agents
│   │   ├── __init__.py
│   │   ├── core.py
│   │   ├── ppo.py
│   │   └── random.py
│   ├── env.py
│   ├── layers.py
│   ├── memories.py
│   ├── processors
│   │   ├── __init__.py
│   │   ├── atari.py
│   │   └── core.py
│   └── runner.py
└── setup.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Compiled source #
###################
*.com
*.class
*.dll
*.exe
*.o
*.so
*.pyc
*.swp

# Packages #
############
# it's better to unpack these files and commit the raw source
# git has its own built in compression methods
*.7z
*.dmg
*.gz
*.iso
*.jar
*.rar
*.tar
*.zip
*.meta
*.index
*.ckpt*

# Logs and databases #
######################
.ipynb_checkpoints
*.log
*.sql
*.sqlite
/*.egg-info

# OS generated files #
######################
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db
dist
build
build/*
.idea
params
*/data/*
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# PPO PyTorch
Implementation of PPO with PyTorch

# Installation
To use this repository, install the package through `setup.py`:
```bash
python setup.py install
```

After installation, the package is available as `import rltorch`.

# Examples
```python
import gym

from rltorch import Runner
from rltorch.agents import PPOAgent
from rltorch.processors import AtariProcessor

env = gym.make('Breakout-v0').unwrapped

FRAME_WIDTH = 84
FRAME_HEIGHT = 84
WINDOW_LENGTH = 4
# state_shape = env.observation_space.shape
state_shape = (WINDOW_LENGTH, FRAME_WIDTH, FRAME_HEIGHT)
action_config = {'n_action': env.action_space.n, 'type': 'integer'}
processor = AtariProcessor(FRAME_WIDTH, FRAME_HEIGHT)

# Define agent
agent = PPOAgent(state_shape, action_config, processor=processor,
                 window_length=WINDOW_LENGTH, n_epochs=5,
                 lr=2.5e-4, entropy_coef=0.01, value_loss_coef=1,
                 num_frames_per_proc=128)

# Define execution
runner = Runner(env, agent, num_workers=4, multi=True)

# Start running
optimized_agent = runner.simulate(training=True, notebook=True, render_freq=4)
```
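The repository does not ship a save/load helper for trained agents. A minimal sketch using plain PyTorch serialization (the checkpoint filename is arbitrary; `ac_model` is the actor-critic network that `PPOAgent` builds):

```python
import torch

# Persist the trained actor-critic network from the example above
torch.save(optimized_agent.ac_model.state_dict(), 'ppo_breakout.pt')

# Later: rebuild an agent with the same configuration and restore the weights
agent.ac_model.load_state_dict(torch.load('ppo_breakout.pt'))
agent.ac_model.eval()
```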
# References
### Implementation
* [pytorch-a3c](https://github.com/ikostrikov/pytorch-a3c)
* [baselines](https://github.com/openai/baselines)

### Theory
* [Proximal Policy Optimization Algorithms](https://arxiv.org/pdf/1707.06347.pdf)
* [Asynchronous Methods for Deep Reinforcement Learning](https://arxiv.org/pdf/1602.01783.pdf)
* [High-Dimensional Continuous Control Using Generalized Advantage Estimation](https://arxiv.org/pdf/1506.02438.pdf)
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
gym
numpy
scikit-image
six
tensorboardX
torch
torchvision
tqdm
--------------------------------------------------------------------------------
/rltorch/__init__.py:
--------------------------------------------------------------------------------
from .runner import Runner
--------------------------------------------------------------------------------
/rltorch/agents/__init__.py:
--------------------------------------------------------------------------------
from .random import RandomAgent
from .ppo import PPOAgent
--------------------------------------------------------------------------------
/rltorch/agents/core.py:
--------------------------------------------------------------------------------
from abc import ABC, abstractmethod
import os
import shutil
from collections import defaultdict, deque
from functools import partial

import numpy as np
from tensorboardX import SummaryWriter
import torch
import torch.optim as optim

from ..memories import ACMemory


class BaseAgent(ABC):
    """Abstract class for Agent

    Inherit from this class to implement a concrete agent.

    Parameters
    ----------
    state_shape: array-like
        The shape of input
    action_config: dict
        Configuration of action space, which may include type, shape, etc.
    processor: class
        Processor instance that transforms raw observations into the models' input format
    reward_reshape: callable, optional
        Function applied to raw rewards before they are stored
    smooth_length: int
        The length to smooth data before recording
    log_dir: str
        Directory to store the record for tensorboard
    """
    def __init__(self, state_shape, action_config, processor=None, reward_reshape=None,
                 smooth_length=100, log_dir='./logs'):
        super(BaseAgent, self).__init__()
        self.state_shape = state_shape
        self.action_config = action_config
        self.processor = processor
        self.reward_reshape = reward_reshape
        self.smooth_length = smooth_length
        # Delete old logs if any
        if os.path.isdir(log_dir):
            print('Delete old tensorboard log')
            shutil.rmtree(log_dir)
        self.writer = SummaryWriter(log_dir)
        self.record_step = 0
        self.episode_step = 0

    @abstractmethod
    def predict(self, *args, **kwargs):
        raise NotImplementedError

    def record(self, *args, **kwargs):
        pass

    def observe(self, *args, **kwargs):
        pass

    def fit(self, *args, **kwargs):
        pass


class ACAgent(BaseAgent):
    """Actor Critic Base Agent

    Parameters
    ----------
    state_shape: array-like
        The shape of input
    action_config: dict
        Configuration of action space, which may include type, shape, etc.
    processor: class
        Processor instance that transforms raw observations into the models' input format
    reward_reshape: callable, optional
        Function applied to raw rewards before they are stored
    smooth_length: int
        The length to smooth data before recording
    log_dir: str
        Directory to store the record for tensorboard
    window_length: int
        The number of recent frames stacked into one state
    lr: float
        Learning rate of the optimizer
    model_config: dict
        Configuration passed to ``build_model``
    action_dist: torch.distributions object
        The distribution for action
    discount: float
        Discount Factor
    gae_lambda: float
        GAE parameter for trace eligibility
    num_frames_per_proc: int
        The number of frames collected per worker before each update
    batch_size: int
        Mini-batch size used when fitting the model
    entropy_coef: float
        Weight of the entropy bonus in the total loss
    value_loss_coef: float
        Weight of the critic loss in the total loss
    max_grad_norm: float
        Maximum gradient norm used for gradient clipping
    """

    def __init__(self, state_shape, action_config, processor,
                 reward_reshape, smooth_length, log_dir,
                 window_length, lr, model_config,
                 action_dist, discount, gae_lambda, num_frames_per_proc,
                 batch_size,
                 entropy_coef, value_loss_coef, max_grad_norm):
        super(ACAgent, self).__init__(state_shape, action_config, processor,
                                      reward_reshape,
                                      smooth_length, log_dir)
        self.window_length = window_length
        self.action_dist = action_dist
        self.discount = discount
        self.gae_lambda = gae_lambda
        self.num_frames_per_proc = num_frames_per_proc
        self.batch_size = batch_size
        self.entropy_coef = entropy_coef
        self.value_loss_coef = value_loss_coef
        self.max_grad_norm = max_grad_norm
        # Multi Agent Memory
        self.memory = ACMemory(num_frames_per_proc, window_length)
        # Set device
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        # Build Network on the selected device
        self.ac_model = self.build_model(model_config).to(self.device)
        # Build optimizer
        self.parameters = list(self.ac_model.parameters())
        self.optimizer = optim.Adam(self.parameters, lr=lr)
        # Record parameters
        self.episode_steps = defaultdict(lambda: 0)
        mydeque = partial(deque, maxlen=self.smooth_length)
        self.reward_record = defaultdict(mydeque)
        self.loss_record = deque(maxlen=self.smooth_length)
        self.actor_loss_record = deque(maxlen=self.smooth_length)
        self.critic_loss_record = deque(maxlen=self.smooth_length)
        self.entropy_record = deque(maxlen=self.smooth_length)
        self.ep_rewards = defaultdict(list)
        self.ep_actions = defaultdict(list)

    @abstractmethod
    def build_model(self, config):
        raise NotImplementedError

    def _calc_dim(self, model):
        x = torch.randn([1] + list(self.state_shape))
        x = model(x)
        return x.size(-1)

    def predict(self, obs, training=True):
        if self.processor is not None:
            obs = [self.processor.process(obs_i) for obs_i in obs]
        state = self.memory.get_recent_state(obs)
        state_tensor = torch.tensor(state, dtype=torch.float,
                                    device=self.device)
        dist, value = self.ac_model(state_tensor)
        action = dist.sample()
        if training:
            log_prob = dist.log_prob(action)
            entropy = dist.entropy()
            self.memory.store_value_log_prob(value, log_prob, entropy)
        return action.cpu().numpy()

    def observe(self, obs, action, reward, terminal, info, training=True):
        if self.processor is not None:
            obs = [self.processor.process(obs_i) for obs_i in obs]
        self.memory.append(obs, action, reward, terminal, training)
        self.record(action, reward, terminal)

    def record(self, action, reward, terminal):
        n_workers = len(action)
        for i in range(n_workers):
            self.reward_record[i].append(reward[i])
            self.ep_rewards[i].append(reward[i])
            self.ep_actions[i].append(action[i])
            if terminal[i]:
                ep_sum_reward = np.sum(self.ep_rewards[i])
                self.writer.add_scalar(f'data/episode_reward_sum_{i}',
                                       ep_sum_reward,
                                       self.episode_steps[i])

                self.writer.add_histogram(f'data/episode_action_{i}',
                                          np.array(self.ep_actions[i]),
                                          self.episode_steps[i], bins='auto')

                self.writer.add_histogram(f'data/episode_reward_dist_{i}',
                                          np.array(self.ep_rewards[i]),
                                          self.episode_steps[i], bins='auto')

                # Reset record
                self.episode_steps[i] += 1
                self.ep_rewards[i] = []
                self.ep_actions[i] = []
        self.record_step += 1

    def set_new_obs(self, new_obs):
        if self.processor is not None:
            new_obs = [self.processor.process(obs_i) for obs_i in new_obs]
        self.new_obs = new_obs

    def get_newest_state(self):
        return self.memory.get_recent_state(self.new_obs)

    def aggregate_experiences(self):
        experiences = self.memory.sample()
        rewards = torch.tensor(np.array(experiences.reward),
                               dtype=torch.float,
                               device=self.device)
        masks = torch.tensor(1. - np.array(experiences.terminal, dtype=float),
                             device=self.device,
                             dtype=torch.float)
        # Keep the computation graph of the stored values, log-probs and entropies
        values = torch.stack(experiences.value).to(self.device)
        values = torch.sum(values, -1)
        log_probs = torch.stack(experiences.log_prob).to(self.device)

        entropies = torch.stack(experiences.entropy).to(self.device)

        T = len(rewards)
        # Get delta
        deltas = []
        for t in range(T - 1):
            target = (rewards[t]
                      + self.discount * values[t + 1] * masks[t]).detach()
            delta = target - values[t]
            deltas.append(delta)
        # Estimate with the newest value
        new_state = torch.tensor(self.get_newest_state(),
                                 dtype=torch.float,
                                 device=self.device)
        new_value = self.ac_model(new_state)[1].sum(-1)
        new_target = (rewards[-1]
                      + self.discount * new_value * masks[-1]).detach()
        new_delta = new_target - values[-1]
        deltas.append(new_delta)

        # Calculate advantage from deltas
        decay_rate = self.discount * self.gae_lambda
        advs = []
        for t_st in range(T):
            adv = 0.
            power = 0.
            for t in range(t_st, T):
                adv += deltas[t] * (decay_rate ** power)
                power += 1.
            advs.append(adv)
        advs = torch.stack(advs)
        self.memory.reset()
        return advs, log_probs, entropies
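The nested loop above computes every GAE advantage independently, which is O(T^2) in the rollout length. The same quantities are conventionally obtained with a single backward recursion; a minimal sketch of that equivalent form (a standalone helper, not part of this module, taking `deltas` and `masks` as built in `aggregate_experiences`):

```python
import torch


def gae_advantages(deltas, masks, discount, gae_lambda):
    """Backward GAE recursion: A_t = delta_t + discount * gae_lambda * mask_t * A_{t+1}."""
    advs = []
    adv = torch.zeros_like(deltas[-1])
    for delta, mask in zip(reversed(deltas), reversed(masks)):
        adv = delta + discount * gae_lambda * mask * adv
        advs.append(adv)
    advs.reverse()
    return torch.stack(advs)
```

Besides the lower cost, the `mask` factor stops the accumulation at episode boundaries, which the quadratic loop above does not do.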
--------------------------------------------------------------------------------
/rltorch/agents/ppo.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.distributions as dist
import numpy as np

from .core import ACAgent
from ..layers import Flatten, ACModel


class PPOAgent(ACAgent):
    """Proximal Policy Optimization

    Parameters
    ----------
    n_epochs: int
        The number of passes over the collected data after each rollout
    clip_eps: float
        Clip parameter for the surrogate loss
    """
    def __init__(self, state_shape, action_config, processor=None,
                 reward_reshape=None, smooth_length=100, log_dir='./logs',
                 window_length=4, lr=7e-4, model_config=None,
                 action_dist=dist.Categorical, discount=0.99, gae_lambda=0.95,
                 num_frames_per_proc=32, batch_size=32, entropy_coef=0.01,
                 value_loss_coef=0.2,
                 max_grad_norm=None, clip_eps=0.2, n_epochs=4):

        super(PPOAgent, self).__init__(state_shape, action_config, processor,
                                       reward_reshape, smooth_length, log_dir,
                                       window_length, lr, model_config,
                                       action_dist, discount,
                                       gae_lambda, num_frames_per_proc,
                                       batch_size,
                                       entropy_coef, value_loss_coef,
                                       max_grad_norm)
        self.clip_eps = clip_eps
        self.n_epochs = n_epochs

    def fit(self, *args, **kwargs):
        if self.memory.nb_states < self.num_frames_per_proc:
            return
        advs, log_probs, entropies = self.aggregate_experiences()
        T = advs.size(0)
        for epoch in range(self.n_epochs):
            for idx in range(T // self.batch_size):
                t_st = idx * self.batch_size
                t_end = (idx + 1) * self.batch_size
                batch_advs = advs[t_st:t_end].view(-1)
                batch_log_probs = log_probs[t_st:t_end].view(-1)
                batch_entropies = entropies[t_st:t_end].view(-1)
                # Actor Training
                old_log_probs = batch_log_probs.detach()
                ratio = torch.exp(batch_log_probs - old_log_probs)
                # Use advantage as constants when optimizing the policy
                advs_const = batch_advs.detach()
                surr1 = ratio * advs_const
                surr2 = torch.clamp(ratio, 1.0 - self.clip_eps,
                                    1.0 + self.clip_eps) * advs_const
                actor_loss = -torch.min(surr1, surr2).mean()
                # Critic Training
                batch_critic_loss = (batch_advs ** 2).mean()
                # Entropy regularization
                batch_entropy = batch_entropies.mean()
                # Total Loss
                loss = actor_loss \
                    + self.value_loss_coef * batch_critic_loss \
                    - self.entropy_coef * batch_entropy
                # Optimize Model
                self.optimizer.zero_grad()
                # Need to keep intermediate results for multiple loops
                loss.backward(retain_graph=True)
                # Clip Gradient
                if self.max_grad_norm is not None:
                    nn.utils.clip_grad_norm_(self.parameters, self.max_grad_norm)
                self.optimizer.step()
                self.loss_record.append(loss.item())
                self.actor_loss_record.append(actor_loss.item())
                self.critic_loss_record.append(batch_critic_loss.item())
                self.entropy_record.append(batch_entropy.item())
        self.writer.add_scalar('data/loss', np.mean(self.loss_record),
                               self.record_step)
        self.writer.add_scalar('data/actor_loss', np.mean(self.actor_loss_record),
                               self.record_step)
        self.writer.add_scalar('data/critic_loss', np.mean(self.critic_loss_record),
                               self.record_step)
        self.writer.add_scalar('data/entropy', np.mean(self.entropy_record),
                               self.record_step)
        for key in self.reward_record.keys():
            self.writer.add_scalar(f'data/reward_{key}',
                                   np.mean(self.reward_record[key]),
                                   self.record_step)

    def build_model(self, config=None):
        # Share layer
        model = nn.Sequential()
        in_features = self.state_shape[0]
        model.add_module('conv1', nn.Conv2d(in_features, 32, 8, stride=4))
        model.add_module('relu1', nn.ReLU())
        model.add_module('conv2', nn.Conv2d(32, 64, 4, stride=2))
        model.add_module('relu2', nn.ReLU())
        model.add_module('conv3', nn.Conv2d(64, 64, 3, stride=1))
        model.add_module('relu3', nn.ReLU())
        model.add_module('flatten', Flatten())
        # Calculate dimension after passing test data
        dim = self._calc_dim(model)
        # Fully connected layers
        model.add_module('fc1', nn.Linear(dim, 512))
        model.add_module('relu4', nn.ReLU())

        # Actor layer
        actor_model = nn.Sequential()
        actor_model.add_module('actor_fc', nn.Linear(512, self.action_config['n_action']))
        actor_model.add_module('softmax', nn.Softmax(dim=-1))

        # Value layer
        value_model = nn.Sequential()
        value_model.add_module('value_fc', nn.Linear(512, 1))
        # Combine all models
        ac_model = ACModel(model, actor_model, value_model, self.action_dist)
        return ac_model
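Note that in `fit` above, `old_log_probs` is detached from the very tensor that supplies `batch_log_probs`, so the probability ratio evaluates to exactly 1 in every epoch and the clipping term never becomes active. The textbook PPO update keeps the rollout-time log-probabilities fixed and re-evaluates the current policy on the stored states each epoch; a minimal sketch of that ratio term (a hypothetical helper, not part of this repository):

```python
import torch


def clipped_surrogate(new_log_probs, old_log_probs, advantages, clip_eps=0.2):
    """Standard PPO-clip actor loss.

    ``old_log_probs`` are the log-probabilities recorded during the rollout and
    treated as constants; ``new_log_probs`` come from re-running the current
    policy on the same states after every optimizer step.
    """
    ratio = torch.exp(new_log_probs - old_log_probs.detach())
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantages
    return -torch.min(surr1, surr2).mean()
```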
--------------------------------------------------------------------------------
/rltorch/agents/random.py:
--------------------------------------------------------------------------------
import numpy as np

from .core import BaseAgent


class RandomAgent(BaseAgent):
    def __init__(self, state_shape, action_config):
        super(RandomAgent, self).__init__(state_shape, action_config)

    def predict(self, observation, training=True):
        if self.action_config['type'] == 'integer':
            shape = self.action_config.get('shape', None)
            n_action = self.action_config['n_action']
            actions = np.random.randint(0, n_action, shape)
        else:
            raise NotImplementedError
        return actions
--------------------------------------------------------------------------------
/rltorch/env.py:
--------------------------------------------------------------------------------
from multiprocessing import Process, Pipe

import gym


def worker(conn, env):
    """Execute cmd sent from remote process

    Parameters
    ----------
    conn: multiprocess.Connection instance
        Supposed to receive command and data from the remote process
    env: gym.Env instance
    """
    try:
        while True:
            cmd, data = conn.recv()
            if cmd == 'step':
                obs, reward, done, info = env.step(data)
                if done:
                    obs = env.reset()
                conn.send((obs, reward, done, info))
            elif cmd == 'reset':
                obs = env.reset()
                conn.send(obs)
            elif cmd == 'render':
                env.render()
            elif cmd == 'close':
                env.close()
                conn.close()
                break
            else:
                raise NotImplementedError
    except KeyboardInterrupt:
        print("KeyboardInterrupt")
    finally:
        env.close()


class ParallelEnv(gym.Env):
    """A concurrent execution of environments in multiple processes

    Parameters
    ----------
    envs: list(gym.Env)
        The list of the same gym environment
    """

    def __init__(self, envs):
        assert len(envs) >= 1, 'No environment given'

        self.envs = envs
        self.observation_space = self.envs[0].observation_space
        self.action_space = self.envs[0].action_space

        # Only the index 0 environment runs in the main (non-daemon) process
        self.locals = []
        for env in self.envs[1:]:
            local, remote = Pipe()
            self.locals.append(local)
            p = Process(target=worker, args=(remote, env))
            # Run each remote worker as a daemon process
            p.daemon = True
            p.start()
            remote.close()

    def reset(self):
        for local in self.locals:
            local.send(('reset', None))
        results = [self.envs[0].reset()] + [local.recv() for local in self.locals]
        return results

    def step(self, actions):
        for local, action in zip(self.locals, actions[1:]):
            local.send(('step', action))
        # 0 index process
        obs, reward, done, info = self.envs[0].step(actions[0])
        if done:
            obs = self.envs[0].reset()
        # results = [(obs_1, obs_2, .., obs_n), ..., (info_1, info_2, ..., info_n)]
        results = zip(*[(obs, reward, done, info)] + [local.recv() for local in self.locals])
        return results

    def close(self):
        for local in self.locals:
            local.send(('close', None))
            local.close()
        self.envs[0].close()

    def render(self, render_all=False):
        if render_all:
            for local in self.locals:
                local.send(('render', None))
            self.envs[0].render()
        else:
            self.envs[0].render()
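A quick way to exercise `ParallelEnv` on its own (a sketch, not part of the repository; it assumes the classic 4-tuple `gym` step API used throughout this codebase, and on platforms that spawn subprocesses it must run under an `if __name__ == '__main__':` guard):

```python
import gym

from rltorch.env import ParallelEnv

if __name__ == '__main__':
    envs = [gym.make('CartPole-v1') for _ in range(4)]
    penv = ParallelEnv(envs)

    obs = penv.reset()                                    # list of 4 observations
    actions = [penv.action_space.sample() for _ in envs]  # one action per worker
    obs, rewards, dones, infos = penv.step(actions)       # step() returns zipped tuples
    penv.close()
```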
--------------------------------------------------------------------------------
/rltorch/layers.py:
--------------------------------------------------------------------------------
import torch.nn as nn


class Flatten(nn.Module):
    def forward(self, x):
        x = x.view(x.size()[0], -1)
        return x


class ACModel(nn.Module):
    def __init__(self, share_model, actor_model, value_model, action_dist=None):
        super(ACModel, self).__init__()
        self.share_model = share_model
        self.actor_model = actor_model
        self.value_model = value_model
        self.action_dist = action_dist

    def forward(self, x):
        x = self.share_model(x)
        action = self.actor_model(x)
        value = self.value_model(x)
        if self.action_dist is not None:
            action = self.action_dist(action)
        return action, value
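A toy check of the `(distribution, value)` contract that `ACModel` exposes, using dummy linear heads over a flat 4-dimensional observation (illustrative shapes only, not part of the repository):

```python
import torch
import torch.nn as nn
import torch.distributions as dist

from rltorch.layers import ACModel

share = nn.Sequential(nn.Linear(4, 16), nn.ReLU())
actor = nn.Sequential(nn.Linear(16, 2), nn.Softmax(dim=-1))
critic = nn.Linear(16, 1)
model = ACModel(share, actor, critic, action_dist=dist.Categorical)

pi, value = model(torch.randn(3, 4))  # batch of 3 observations
actions = pi.sample()                 # tensor of shape (3,) with values in {0, 1}
print(actions.shape, value.shape)     # torch.Size([3]) torch.Size([3, 1])
```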
--------------------------------------------------------------------------------
/rltorch/memories.py:
--------------------------------------------------------------------------------
from collections import deque, namedtuple
import warnings
import random
from six.moves import xrange

import numpy as np

# This is to be understood as a transition: Given `state0`, performing `action`
# yields `reward` and results in `state1`, which might be `terminal`.
Experience = namedtuple('Experience',
                        'state0, action, reward, state1, terminal1')

ACExperience = namedtuple('ACExperience',
                          'action, reward, terminal, value, log_prob, entropy')


def sample_batch_indexes(low, high, size):
    """Return a sample of (size) unique elements between low and high
    # Argument
        low (int): The minimum value for our samples
        high (int): The maximum value for our samples
        size (int): The number of samples to pick
    # Returns
        A list of samples of length size, with values between low and high
    """
    if high - low >= size:
        # We have enough data. Draw without replacement, that is each index is unique in the
        # batch. We cannot use `np.random.choice` here because it is horribly inefficient as
        # the memory grows. See https://github.com/numpy/numpy/issues/2764 for a discussion.
        # `random.sample` does the same thing (drawing without replacement) and is way faster.
        r = xrange(low, high)
        batch_idxs = random.sample(r, size)
    else:
        # Not enough data. Help ourselves with sampling from the range, but the same index
        # can occur multiple times. This is not good and should be avoided by picking a
        # large enough warm-up phase.
        warnings.warn(
            'Not enough entries to sample without replacement. Consider increasing your warm-up phase to avoid oversampling!')
        batch_idxs = np.random.randint(low, high, size=size)
    assert len(batch_idxs) == size
    return batch_idxs


class RingBuffer(object):
    """Erase the oldest memory after reaching maxlen

    Parameters
    ----------
    maxlen: int
        The maximum number of memory entries
    """
    def __init__(self, maxlen):
        self.maxlen = maxlen
        self.data = deque(maxlen=maxlen)

    def __len__(self):
        return self.length()

    def __getitem__(self, idx):
        """Return element of buffer at specific index
        # Argument
            idx (int): Index wanted
        # Returns
            The element of buffer at given index
        """
        if idx < 0 or idx >= self.length():
            raise KeyError()
        return self.data[idx]

    def append(self, v):
        """Append an element to the buffer
        # Argument
            v (object): Element to append
        """
        self.data.append(v)

    def length(self):
        """Return the length of the deque
        # Argument
            None
        # Returns
            The length of the deque
        """
        return len(self.data)


def zeroed_observation(observation):
    """Return an array of zeros with same shape as given observation
    # Argument
        observation (list): List of observation

    # Return
        A np.ndarray of zeros with observation.shape
    """
    if hasattr(observation, 'shape'):
        return np.zeros(observation.shape)
    elif hasattr(observation, '__iter__'):
        out = []
        for x in observation:
            out.append(zeroed_observation(x))
        return out
    else:
        return 0.
class Memory(object):
    """Base class for memory

    Parameters
    ----------
    window_length: int
        The number of observations stacked into one state
    ignore_episode_boundaries: bool
        If False, episode boundaries (terminal flags) are respected when
        sampling or building the most recent state
    """
    def __init__(self, window_length, ignore_episode_boundaries=False):
        self.window_length = window_length
        self.ignore_episode_boundaries = ignore_episode_boundaries

        self.recent_observations = deque(maxlen=window_length)
        self.recent_terminals = deque(maxlen=window_length)

    def sample(self, batch_size, batch_idxs=None):
        raise NotImplementedError()

    def append(self, observation, action, reward, terminal, training=True):
        # We do not store the final state
        self.recent_observations.append(observation)
        self.recent_terminals.append(terminal)

    def get_recent_state(self, current_observation):
        """Return list of last observations

        Parameters
        ----------
        current_observation: array-like
            Last observation

        Returns
        -------
        A list of the last observations
        """
        # This code is slightly complicated by the fact that subsequent observations might be
        # from different episodes. We ensure that an experience never spans multiple episodes.
        # This is probably not that important in practice but it seems cleaner.
        state = [current_observation]
        idx = len(self.recent_observations) - 1
        for offset in range(0, self.window_length - 1):
            current_idx = idx - offset
            # Order: observation => action => (reward, terminal, info)
            if current_idx >= 0:
                current_terminal = self.recent_terminals[current_idx]
            else:
                break
            if not self.ignore_episode_boundaries and current_terminal:
                # The previously handled observation was terminal, don't add the current one.
                # Otherwise we would leak into a different episode.
                break
            state.insert(0, self.recent_observations[current_idx])
        while len(state) < self.window_length:
            state.insert(0, zeroed_observation(state[0]))
        state = np.concatenate(state, 0)
        return state

    def get_config(self):
        """Return configuration (window_length, ignore_episode_boundaries) for Memory

        # Return
            A dict with keys window_length and ignore_episode_boundaries
        """
        config = {
            'window_length': self.window_length,
            'ignore_episode_boundaries': self.ignore_episode_boundaries,
        }
        return config


class SequentialMemory(Memory):
    def __init__(self, limit, **kwargs):
        super(SequentialMemory, self).__init__(**kwargs)

        self.limit = limit

        # Do not use deque to implement the memory. This data structure may seem convenient but
        # it is way too slow on random access. Instead, we use our own ring buffer implementation.
        self.actions = RingBuffer(limit)
        self.rewards = RingBuffer(limit)
        self.terminals = RingBuffer(limit)
        self.observations = RingBuffer(limit)

    def sample(self, batch_size, batch_idxs=None):
        """Return a randomized batch of experiences
        # Argument
            batch_size (int): Size of the batch
            batch_idxs (int): Indexes to extract
        # Returns
            A list of randomly selected experiences
        """
        # It is not possible to tell whether the first state in the memory is terminal, because it
        # would require access to the "terminal" flag associated to the previous state. As a result
        # we will never return this first state (only using `self.terminals[0]` to know whether the
        # second state is terminal).
        # In addition we need enough entries to fill the desired window length.
        assert self.nb_entries >= self.window_length + 2, 'not enough entries in the memory'

        if batch_idxs is None:
            # Draw random indexes such that we have enough entries before each index to fill the
            # desired window length.
            batch_idxs = sample_batch_indexes(
                self.window_length, self.nb_entries - 1, size=batch_size)
        batch_idxs = np.array(batch_idxs) + 1
        assert np.min(batch_idxs) >= self.window_length + 1
        assert np.max(batch_idxs) < self.nb_entries
        assert len(batch_idxs) == batch_size

        # Create experiences
        experiences = []
        for idx in batch_idxs:
            terminal0 = self.terminals[idx - 2]
            while terminal0:
                # Skip this transition because the environment was reset here. Select a new, random
                # transition and use this instead. This may cause the batch to contain the same
                # transition twice.
                idx = sample_batch_indexes(self.window_length + 1, self.nb_entries,
                                           size=1)[0]
                terminal0 = self.terminals[idx - 2]
            assert self.window_length + 1 <= idx < self.nb_entries

            # This code is slightly complicated by the fact that subsequent observations might be
            # from different episodes. We ensure that an experience never spans multiple episodes.
            # This is probably not that important in practice but it seems cleaner.
            state0 = [self.observations[idx - 1]]
            for offset in range(0, self.window_length - 1):
                current_idx = idx - 2 - offset
                assert current_idx >= 1
                current_terminal = self.terminals[current_idx - 1]
                if current_terminal and not self.ignore_episode_boundaries:
                    # The previously handled observation was terminal, don't add the current one.
                    # Otherwise we would leak into a different episode.
                    break
                state0.insert(0, self.observations[current_idx])
            while len(state0) < self.window_length:
                state0.insert(0, zeroed_observation(state0[0]))
            action = self.actions[idx - 1]
            reward = self.rewards[idx - 1]
            terminal1 = self.terminals[idx - 1]

            # Okay, now we need to create the follow-up state. This is state0 shifted one timestep
            # to the right. Again, we need to be careful to not include an observation from the next
            # episode if the last state is terminal.
            state1 = [np.copy(x) for x in state0[1:]]
            state1.append(self.observations[idx])
            state0 = np.concatenate(state0, 0)
            state1 = np.concatenate(state1, 0)

            assert len(state0) == self.window_length
            assert len(state1) == len(state0)
            experiences.append(
                Experience(state0=state0, action=action, reward=reward,
                           state1=state1, terminal1=terminal1))
        assert len(experiences) == batch_size
        return experiences

    def append(self, observation, action, reward, terminal, training=True):
        """Append an observation to the memory
        # Argument
            observation (dict): Observation returned by environment
            action (int): Action taken to obtain this observation
            reward (float): Reward obtained by taking this action
            terminal (boolean): Is the state terminal
        """
        super(SequentialMemory, self).append(observation, action, reward,
                                             terminal, training=training)

        # This needs to be understood as follows: in `observation`, take `action`, obtain `reward`
        # and whether the next state is `terminal` or not.
        if training:
            self.observations.append(observation)
            self.actions.append(action)
            self.rewards.append(reward)
            self.terminals.append(terminal)
    @property
    def nb_entries(self):
        """Return number of observations

        Returns
        -------
        The number of observations
        """
        return len(self.rewards)

    @property
    def nb_states(self):
        """Return number of usable states

        Returns
        -------
        The number of usable states
        """
        return len(self.observations) - self.window_length + 1

    def get_config(self):
        """Return configurations of SequentialMemory

        Returns
        -------
        Dict of Config
        """
        config = super(SequentialMemory, self).get_config()
        config['limit'] = self.limit
        return config


class ACMemory(SequentialMemory):
    def __init__(self, num_frames_per_proc, window_length, **kwargs):
        limit = num_frames_per_proc + window_length - 1
        super(ACMemory, self).__init__(limit, window_length=window_length, **kwargs)
        self.limit = limit
        self.num_frames_per_proc = num_frames_per_proc
        self.reset()

    def get_recent_state(self, current_observation):
        """Return the stacked recent states of all workers

        Parameters
        ----------
        current_observation: tuple(array-like)
            Each element corresponds to the observation of one worker

        Returns
        -------
        Array of the last observations of all workers
        """
        state_list = []
        n_workers = len(current_observation)
        for worker_idx in range(n_workers):
            state = self._get_recent_state(current_observation, worker_idx)
            state_list.append(state)
        return np.stack(state_list)

    def _get_recent_state(self, current_observation, worker_idx):
        state = [current_observation[worker_idx]]
        idx = len(self.recent_observations) - 1
        for offset in range(0, self.window_length - 1):
            current_idx = idx - offset
            if current_idx >= 0:
                current_terminal = self.recent_terminals[current_idx][worker_idx]
            else:
                break
            if not self.ignore_episode_boundaries and current_terminal:
                break
            state.insert(0, self.recent_observations[current_idx][worker_idx])
        while len(state) < self.window_length:
            state.insert(0, zeroed_observation(state[0]))
        state = np.concatenate(state, 0)
        return state

    def store_value_log_prob(self, value, log_prob, entropy):
        self.values.append(value)
        self.log_probs.append(log_prob)
        self.entropies.append(entropy)

    def sample(self):
        return ACExperience(action=self.actions[-self.num_frames_per_proc:],
                            reward=self.rewards[-self.num_frames_per_proc:],
                            terminal=self.terminals[-self.num_frames_per_proc:],
                            value=self.values[-self.num_frames_per_proc:],
                            log_prob=self.log_probs[-self.num_frames_per_proc:],
                            entropy=self.entropies[-self.num_frames_per_proc:])

    def reset(self):
        self.actions = list()
        self.rewards = list()
        self.terminals = list()
        self.observations = list()
        self.values = list()
        self.log_probs = list()
        self.entropies = list()

    def get_config(self):
        """Return configurations of ACMemory

        Returns
        -------
        Dict of Config
        """
        config = super(ACMemory, self).get_config()
        config['num_frames_per_proc'] = self.num_frames_per_proc
        return config
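A toy illustration of the per-worker frame stacking that `ACMemory.get_recent_state` performs (illustrative shapes only: two workers, single-channel 84x84 frames, `window_length=4`):

```python
import numpy as np

from rltorch.memories import ACMemory

memory = ACMemory(num_frames_per_proc=8, window_length=4)

# One processed frame per worker, as produced by AtariProcessor
prev_obs = [np.zeros((1, 84, 84), dtype=np.uint8) for _ in range(2)]
memory.append(prev_obs, action=[0, 1], reward=[0.0, 0.0], terminal=[False, False])

new_obs = [np.ones((1, 84, 84), dtype=np.uint8) for _ in range(2)]
state = memory.get_recent_state(new_obs)
print(state.shape)  # (2, 4, 84, 84): workers x window x height x width
```

Frames older than what the memory holds are padded with zeros, so the stacked state always has `window_length` frames per worker.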
--------------------------------------------------------------------------------
/rltorch/processors/__init__.py:
--------------------------------------------------------------------------------
from .atari import AtariProcessor
--------------------------------------------------------------------------------
/rltorch/processors/atari.py:
--------------------------------------------------------------------------------
import numpy as np
from skimage.color import rgb2gray
from skimage.transform import resize

from .core import BaseProcessor


class AtariProcessor(BaseProcessor):
    def __init__(self, frame_width=84, frame_height=84):
        self.frame_width = frame_width
        self.frame_height = frame_height
        self.last_observation = None

    def process(self, observation):
        if self.last_observation is not None:
            observation = np.maximum(observation, self.last_observation)
        # Grayscale, resize, and rescale back to the 0-255 uint8 range
        observation = np.uint8(resize(rgb2gray(observation),
                                      (self.frame_width, self.frame_height)) * 255)
        return observation.reshape((1, self.frame_width, self.frame_height))
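A quick check of `AtariProcessor` in isolation (a sketch, not part of the repository; it assumes the classic `gym` Atari API where `reset()` returns a raw 210x160x3 RGB frame):

```python
import gym

from rltorch.processors import AtariProcessor

env = gym.make('Breakout-v0')
frame = env.reset()                      # raw RGB frame, shape (210, 160, 3)
processor = AtariProcessor(84, 84)
processed = processor.process(frame)
print(processed.shape, processed.dtype)  # (1, 84, 84) uint8
```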
--------------------------------------------------------------------------------
/rltorch/processors/core.py:
--------------------------------------------------------------------------------
from abc import ABC, abstractmethod


class BaseProcessor(ABC):
    @abstractmethod
    def process(self, observation):
        raise NotImplementedError
--------------------------------------------------------------------------------
/rltorch/runner.py:
--------------------------------------------------------------------------------
from tqdm import tqdm_notebook, tqdm
from copy import deepcopy

from .env import ParallelEnv


class Runner(object):
    """Class to run an agent in a given environment, optionally with parallel workers

    Parameters
    ----------
    env: gym.Env
        OpenAI gym environment
    agent: agent instance
    num_workers: int
        The number of environments to run in parallel
    multi: bool
        If True, use the parallel environment even if num_workers == 1
    """
    def __init__(self, env, agent, num_workers=1, multi=False):
        self.agent = agent
        self.env0 = env
        # Parallelize environment
        if multi or num_workers > 1:
            env = [deepcopy(env) for _ in range(num_workers)]
            self.env = ParallelEnv(env)
        else:
            self.env = env

    def simulate(self, n_frames=1e6, training=True, render_freq=0,
                 notebook=False, render_all=False):
        """Run agent

        Parameters
        ----------
        n_frames: int
            The number of frames to run
        training: bool
            If True, execute in training mode
        render_freq: int, (default 0)
            If 0, the environment is not rendered
        notebook: bool
            If True, use the tqdm iterator for notebooks
        render_all: bool
            If True, render every worker environment instead of only the first

        Returns
        -------
        Optimized agent
        """
        n_frames = int(n_frames)
        if notebook:
            iteration = tqdm_notebook(range(n_frames))
        else:
            iteration = tqdm(range(n_frames))
        obs = self.env.reset()
        for step in iteration:
            action = self.agent.predict(obs, training=training)
            new_obs, reward, terminal, info = self.env.step(action)
            self.agent.observe(obs, action, reward, terminal, info,
                               training=training)
            if hasattr(self.agent, 'set_new_obs'):
                self.agent.set_new_obs(new_obs)
            self.agent.fit()
            obs = new_obs
            if render_freq > 0 and step % render_freq == 0:
                if isinstance(self.env, ParallelEnv):
                    self.env.render(render_all)
                else:
                    self.env.render()
        return self.agent
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages


setup(
    name="rltorch",
    version='0.1',
    description='Reinforcement Learning by PyTorch',
    author='jjakimoto',
    author_email='f.j.akimoto@gmail.com',
    packages=find_packages(),
)
--------------------------------------------------------------------------------