├── .gitignore
├── LICENSE
├── README.md
├── irl
│   ├── __init__.py
│   ├── airl
│   │   ├── __init__.py
│   │   ├── airl.py
│   │   └── discriminator.py
│   ├── common
│   │   ├── __init__.py
│   │   ├── model.py
│   │   └── utils
│   │       ├── __init__.py
│   │       ├── get_states_actions_next_states.py
│   │       └── mean_or_nan.py
│   └── gail
│       ├── __init__.py
│       ├── discriminator.py
│       └── gail.py
├── requirements.txt
└── train_gym.py

/.gitignore:
--------------------------------------------------------------------------------
1 | results/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2019 Yusuke Nakata
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Deep Inverse Reinforcement Learning
2 | 
3 | ## Contents
4 | 
5 | Chainer implementation of Adversarial Inverse Reinforcement Learning (AIRL) and Generative Adversarial Imitation Learning (GAIL).
6 | The code depends heavily on the reinforcement learning package [ChainerRL](https://github.com/chainer/chainerrl).
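Both GAIL and AIRL learn from a demonstration file (`trajectories.npz`) that `train_gym.py` writes after expert PPO training: an `.npz` archive whose `states` and `actions` entries each hold one list of per-step values per episode. A minimal sketch of that layout with synthetic CartPole-like data (episode lengths, shapes, and the output path are illustrative only):

```python
import numpy as np

# Two synthetic episodes of different lengths; CartPole-v0 observations are 4-dimensional.
rng = np.random.RandomState(0)
episode_states = [[rng.randn(4).astype(np.float32) for _ in range(n)] for n in (17, 23)]
episode_actions = [[int(rng.randint(2)) for _ in range(n)] for n in (17, 23)]

# Same layout as save_agent_demo() in train_gym.py: object arrays of per-episode lists,
# so episodes may have different lengths.
np.savez('trajectories.npz',
         states=np.array(episode_states, dtype=object),
         actions=np.array(episode_actions, dtype=object))
```

Pass the path of the resulting file via `--load_demo` in the commands below.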
7 | 
8 | ## Commands
9 | 
10 | Train a PPO expert and sample expert trajectories
11 | ```bash
12 | python train_gym.py ppo --gpu $gpu_id --env CartPole-v0 --arch FFSoftmax --steps 50000
13 | ```
14 | 
15 | Run GAIL
16 | ```bash
17 | python train_gym.py gail --gpu $gpu_id --env CartPole-v0 --arch FFSoftmax --steps 100000 \
18 |     --load_demo ${PathOfDemonstrationNpzFile} --update-interval 128 --entropy-coef 0.01
19 | ```
20 | 
21 | Run AIRL
22 | ```bash
23 | python train_gym.py airl --gpu $gpu_id --env CartPole-v0 --arch FFSoftmax --steps 100000 \
24 |     --load_demo ${PathOfDemonstrationNpzFile} --update-interval 128 --entropy-coef 0.01
25 | ```
26 | 
27 | ## LICENSE
28 | MIT
29 | 
--------------------------------------------------------------------------------
/irl/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uidilr/deepirl_chainer/45f6134fe457bdae1484e4847ab0701f39940faa/irl/__init__.py
--------------------------------------------------------------------------------
/irl/airl/__init__.py:
--------------------------------------------------------------------------------
1 | from irl.airl.airl import AIRL
2 | from irl.airl.discriminator import Discriminator
--------------------------------------------------------------------------------
/irl/airl/airl.py:
--------------------------------------------------------------------------------
1 | import chainer
2 | import numpy as np
3 | from chainerrl.agents import PPO
4 | from itertools import chain
5 | import collections
6 | from chainerrl.policies import SoftmaxPolicy
7 | 
8 | from irl.airl.discriminator import Discriminator
9 | from irl.common.utils.mean_or_nan import mean_or_nan
10 | from irl.common.utils import get_states_actions_next_states
11 | 
12 | 
13 | 
14 | class AIRL(PPO):
15 |     def __init__(self, discriminator: Discriminator, demonstrations, discriminator_loss_stats_window=1000, **kwargs):
16 | 
17 |         super().__init__(**kwargs)
18 |         self.discriminator = discriminator
19 | 
20 |         self.demo_states, self.demo_actions, self.demo_next_states = \
21 |             get_states_actions_next_states(demonstrations['states'], demonstrations['actions'], xp=self.xp)
22 |         if isinstance(self.model.pi, SoftmaxPolicy):
23 |             # action space is discrete, so cast demonstration actions to integer ids
24 |             self.demo_actions = self.demo_actions.astype(dtype=self.xp.int32)
25 | 
26 |         self.discriminator_loss_record = collections.deque(maxlen=discriminator_loss_stats_window)
27 |         self.reward_mean_record = collections.deque(maxlen=discriminator_loss_stats_window)
28 | 
29 |     def _update(self, dataset):
30 |         # overridden from PPO
31 |         xp = self.xp
32 | 
33 |         if self.obs_normalizer:
34 |             self._update_obs_normalizer(dataset)
35 | 
36 |         dataset_iter = chainer.iterators.SerialIterator(dataset, self.minibatch_size, shuffle=True)
37 |         loss_mean = 0
38 |         while dataset_iter.epoch < self.epochs:
39 |             # create the policy batch for this iteration
40 |             batch = dataset_iter.__next__()
41 |             states = self.batch_states([b['state'] for b in batch], xp, self.phi)
42 |             next_states = self.batch_states([b['next_state'] for b in batch], xp, self.phi)
43 |             actions = xp.array([b['action'] for b in batch])
44 | 
45 |             # create the expert batch for this iteration
46 |             demonstrations_indexes = np.random.permutation(len(self.demo_states))[:self.minibatch_size]
47 |             demo_states, demo_actions, demo_next_states = [d[demonstrations_indexes]
48 |                                                            for d in (self.demo_states, self.demo_actions,
49 |                                                                      self.demo_next_states)]
50 | 
51 |             states, demo_states, next_states, demo_next_states = [(self.obs_normalizer(d, update=False)
52 |                                                                     if self.obs_normalizer else d)
53 |                                                                    for d in [states, demo_states,
54 |                                                                              next_states, demo_next_states]]
55 | 
56 |             with chainer.configuration.using_config('train', False), chainer.no_backprop_mode():
57 |                 action_log_probs = self.get_probs(states, actions)
58 |                 demo_action_log_probs = self.get_probs(demo_states, demo_actions)
59 | 
60 |             loss = self.discriminator.train(expert_states=demo_states, expert_next_states=demo_next_states,
61 |                                             expert_action_probs=demo_action_log_probs, fake_states=states,
62 |                                             fake_next_states=next_states, fake_action_probs=action_log_probs,
63 |                                             gamma=self.gamma)
64 |             loss_mean += loss / (self.epochs * self.minibatch_size)
65 |         self.discriminator_loss_record.append(float(loss_mean.array))
66 |         super()._update(dataset)
67 | 
68 |     def _update_if_dataset_is_ready(self):
69 |         # overridden from PPO
70 |         dataset_size = (
71 |             sum(len(episode) for episode in self.memory)
72 |             + len(self.last_episode)
73 |             + (0 if self.batch_last_episode is None else sum(
74 |                 len(episode) for episode in self.batch_last_episode)))
75 |         if dataset_size >= self.update_interval:
76 |             self._flush_last_episode()
77 | 
78 |             # replace the environment rewards stored in self.memory with discriminator rewards
79 |             transitions = list(chain(*self.memory))
80 |             with chainer.configuration.using_config('train', False), chainer.no_backprop_mode():
81 |                 rewards = self.discriminator.get_rewards(self.xp.asarray(np.concatenate([transition['state'][None]
82 |                                                                                          for transition in transitions]))).array
83 |             self.reward_mean_record.append(float(np.mean(rewards)))
84 |             i = 0
85 |             for episode in self.memory:
86 |                 for transition in episode:
87 |                     transition['reward'] = float(rewards[i])
88 |                     i += 1
89 |             assert self.memory[0][0]['reward'] == float(rewards[0]), 'rewards were not replaced.'
90 | 
91 |             dataset = self._make_dataset()
92 |             assert len(dataset) == dataset_size
93 |             self._update(dataset)
94 |             self.memory = []
95 | 
96 |     def get_probs(self, states, actions):
97 |         target_distribs, _ = self.model(states)
98 |         return target_distribs.log_prob(actions)
99 | 
100 |     def get_statistics(self):
101 |         return [('average_discriminator_loss', mean_or_nan(self.discriminator_loss_record)),
102 |                 ('average_rewards', mean_or_nan(self.reward_mean_record))] + super().get_statistics()
103 | 
104 | 
105 | 
106 | 
107 | 
--------------------------------------------------------------------------------
/irl/airl/discriminator.py:
--------------------------------------------------------------------------------
1 | import chainer
2 | import chainer.functions as F
3 | from irl.common.model import MLP
4 | from chainer.link_hooks.spectral_normalization import SpectralNormalization
5 | 
6 | 
7 | class Discriminator:
8 |     def __init__(self, n_layer=4, n_units=32, gpu=-1):
9 |         self.reward_net = MLP(n_layer, n_units, 1, hook=SpectralNormalization, hook_params=dict(factor=1))
10 |         self.value_net = MLP(n_layer, n_units, 1)  # , hook=SpectralNormalization, hook_params=dict(factor=10))
11 |         if gpu >= 0:
12 |             self.reward_net.to_gpu(gpu)
13 |             self.value_net.to_gpu(gpu)
14 |         # Adding spectral normalization with a small factor to the value net makes training unstable,
15 |         # but why does the adversarial loss decrease when we add it to the value net?
16 |         # Perhaps because the Lipschitz factor can then be bounded by the Lipschitz constant of the reward net?
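        # reward_net approximates the state-only reward r(s) and value_net the shaping term V(s);
        # train() below combines them into the discriminator logit
        # f(s, a, s') = r(s) + gamma * V(s') - V(s) - log pi(a|s), with p(expert|s, a, s') = sigmoid(f).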
17 |         self.reward_optimizer = chainer.optimizers.Adam()
18 |         self.value_optimizer = chainer.optimizers.Adam()
19 |         self.reward_optimizer.setup(self.reward_net)
20 |         self.value_optimizer.setup(self.value_net)
21 | 
22 |     def __call__(self, x):
23 |         return self.reward_net(x), self.value_net(x)
24 | 
25 |     def train(self, expert_states, expert_next_states, expert_action_probs, fake_states, fake_next_states,
26 |               fake_action_probs, gamma):
27 | 
28 |         def logits(states, next_states, log_action_probs):
29 |             # p(expert|state, action) = sigmoid(logits)
30 |             rewards = self.reward_net(states)
31 |             # print(F.mean(rewards))
32 |             state_values = self.value_net(states)
33 |             next_state_values = self.value_net(next_states)
34 |             return rewards + gamma * next_state_values - state_values - log_action_probs[:, None].array
35 | 
36 |         # Writing the logistic loss with softplus stabilises training:
37 |         # softplus(-logits) == -log(sigmoid(logits)) and softplus(logits) == -log(1 - sigmoid(logits))
38 |         # print('expert: ', end='')
39 |         loss = F.mean(F.softplus(-logits(expert_states, expert_next_states, expert_action_probs)))
40 |         # print('fake: ', end='')
41 |         loss += F.mean(F.softplus(logits(fake_states, fake_next_states, fake_action_probs)))
42 | 
43 |         # add gradient penalty for the reward net (kept here commented out)
44 |         # xp = chainer.cuda.get_array_module(expert_states)
45 |         # e = xp.random.uniform(0., 1., len(expert_states))[:, None].astype(xp.float32)
46 |         # x_hat = chainer.Variable((e * expert_states + (1 - e) * fake_states), requires_grad=True)
47 |         # grad, = chainer.grad([self.reward_net(x_hat)], [x_hat], enable_double_backprop=True)
48 |         # loss_grad = 0.1 * F.mean(F.sqrt(F.batch_l2_norm_squared(grad)))
49 |         # loss += loss_grad
50 | 
51 |         self.reward_net.cleargrads()
52 |         self.value_net.cleargrads()
53 |         loss.backward()
54 |         self.reward_optimizer.update()
55 |         self.value_optimizer.update()
56 |         return loss
57 | 
58 |     def get_rewards(self, x):
59 |         return self.reward_net(x)
60 | 
--------------------------------------------------------------------------------
/irl/common/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uidilr/deepirl_chainer/45f6134fe457bdae1484e4847ab0701f39940faa/irl/common/__init__.py
--------------------------------------------------------------------------------
/irl/common/model.py:
--------------------------------------------------------------------------------
1 | import chainer
2 | import chainer.functions as F
3 | import chainer.links as L
4 | 
5 | 
6 | def pass_fn(x):
7 |     return x
8 | 
9 | 
10 | class MLP(chainer.ChainList):
11 |     def __init__(self, n_layer, n_units, n_out, activation=F.leaky_relu, out_activation=pass_fn, hook=None, hook_params=None):
12 |         super().__init__()
13 | 
14 |         for _ in range(n_layer-1):
15 |             self.add_link(L.Linear(None, n_units))
16 |         self.add_link(L.Linear(None, n_out))
17 |         self.activations = [activation] * (n_layer - 1) + [out_activation]
18 | 
19 |         if hook:
20 |             hook_params = dict() if hook_params is None else hook_params
21 |             for link in self.children():
22 |                 link.add_hook(hook(**hook_params))
23 | 
24 |     def forward(self, x):
25 |         for link, act in zip(self.children(), self.activations):
26 |             x = act(link(x))
27 |         return x
28 | 
29 |     def __call__(self, x):
30 |         return self.forward(x)
--------------------------------------------------------------------------------
/irl/common/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from irl.common.utils.get_states_actions_next_states import get_states_actions_next_states
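To make the AIRL discriminator objective above concrete, here is a hypothetical smoke test that wires the `MLP` from `irl/common/model.py` into the logit computed inside `Discriminator.train()`; the batch size, observation dimension, and `gamma` value are illustrative, and the same dummy batch stands in for both expert and policy data:

```python
import numpy as np
import chainer.functions as F

from irl.common.model import MLP

batch, obs_dim, gamma = 8, 4, 0.995
reward_net = MLP(n_layer=4, n_units=32, n_out=1)   # approximates r(s)
value_net = MLP(n_layer=4, n_units=32, n_out=1)    # approximates V(s)

states = np.random.randn(batch, obs_dim).astype(np.float32)
next_states = np.random.randn(batch, obs_dim).astype(np.float32)
log_pi = np.random.randn(batch).astype(np.float32)  # stands in for log pi(a|s)

# Discriminator logit f(s, a, s') = r(s) + gamma * V(s') - V(s) - log pi(a|s)
f = reward_net(states) + gamma * value_net(next_states) - value_net(states) - log_pi[:, None]

# The two logistic loss terms used in Discriminator.train(), here on the same dummy batch:
expert_loss = F.mean(F.softplus(-f))   # -log D on expert transitions
policy_loss = F.mean(F.softplus(f))    # -log (1 - D) on policy transitions
print(float(expert_loss.array), float(policy_loss.array))
```

In actual training the expert term is evaluated on demonstration transitions and the policy term on PPO rollouts, as `AIRL._update` above shows.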
--------------------------------------------------------------------------------
/irl/common/utils/get_states_actions_next_states.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import copy
3 | from itertools import chain
4 | 
5 | 
6 | def get_states_actions_next_states(states, actions, xp=np):
7 |     # Prepare demonstrations
8 |     # a deep copy is necessary because demo_states can be a list of lists
9 |     next_states = copy.deepcopy(states)
10 | 
11 |     if states.ndim > 2:
12 |         # demo_states.shape = (n_episode, n_steps, *observation.shape)
13 |         # if every episode in the demo has the same length, demo_states and demo_actions are numpy arrays
14 |         # drop the last state and action because the last state has no next state
15 |         states = states[:, :-1, ...]
16 |         actions = actions[:, :-1, ...]
17 |         # drop the first state so that demo_next_states[:, i, ...] is the next state of demo_states[:, i, ...]
18 |         next_states = next_states[:, 1:, ...]
19 |     else:
20 |         # if episode lengths differ, drop the last state and action and the first next state per episode
21 |         for demo_states_epi, demo_action_epi, demo_next_state_epi \
22 |                 in zip(states, actions, next_states):
23 |             # drop the last state and action because the last state has no next state
24 |             del demo_states_epi[-1]
25 |             del demo_action_epi[-1]
26 |             # drop the first state so that demo_next_states_epi[i] is the next state of demo_states_epi[i]
27 |             del demo_next_state_epi[0]
28 | 
29 |     states = xp.asarray(np.array(list(chain(*states))).astype(dtype=np.float32))
30 |     next_states = xp.asarray(np.array(list(chain(*next_states))).astype(dtype=np.float32))
31 |     actions = xp.asarray(np.array(list(chain(*actions))).astype(dtype=np.float32))
32 |     return states, actions, next_states
--------------------------------------------------------------------------------
/irl/common/utils/mean_or_nan.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | 
4 | def mean_or_nan(xs):
5 |     """Return the mean of a non-empty sequence, or numpy.nan for an empty one."""
6 |     return np.mean(xs) if xs else np.nan
--------------------------------------------------------------------------------
/irl/gail/__init__.py:
--------------------------------------------------------------------------------
1 | from irl.gail.discriminator import Discriminator
2 | from irl.gail.gail import GAIL
3 | 
--------------------------------------------------------------------------------
/irl/gail/discriminator.py:
--------------------------------------------------------------------------------
1 | import chainer
2 | import chainer.functions as F
3 | from irl.common.model import MLP
4 | 
5 | 
6 | class Discriminator:
7 |     def __init__(self, n_layer=3, n_units=64, loss_type='wgangp', gpu=-1):
8 |         self.model = MLP(n_layer, n_units, 1)
9 | 
10 |         if gpu >= 0:
11 |             self.model.to_gpu(gpu)
12 | 
13 |         self.optimizer = chainer.optimizers.Adam(alpha=1e-5, eps=1e-5)
14 |         self.optimizer.setup(self.model)
15 |         self.loss_type = loss_type
16 |         self.loss = None
17 | 
18 |     def __call__(self, x):
19 |         return F.sigmoid(self.model(x))
20 | 
21 |     def train(self, expert_data, fake_data):
22 |         self.model.cleargrads()
23 | 
24 |         if self.loss_type == 'gan':
25 |             d_expert = self.model(expert_data)
26 |             d_fake = self.model(fake_data)
27 |             # the discriminator is trained to predict p(expert|x)
28 |             self.loss = F.mean(F.softplus(-d_expert))
29 |             self.loss += F.mean(F.softplus(d_fake))
30 |         elif self.loss_type == 'wgangp':
31 |             # sampling along straight 
lines 32 | xp = chainer.cuda.get_array_module(expert_data) 33 | e = xp.random.uniform(0., 1., len(expert_data))[:, None].astype(xp.float32) 34 | x_hat = chainer.Variable((e * expert_data + (1 - e) * fake_data).array, requires_grad=True) 35 | grad, = chainer.grad([self.model(x_hat)], [x_hat], enable_double_backprop=True) 36 | grad = F.sqrt(F.batch_l2_norm_squared(grad)) 37 | 38 | loss_grad = 1 * F.mean_squared_error(grad, xp.ones_like(grad.data)) 39 | loss_gan = F.mean(self.model(fake_data) - self.model(expert_data)) 40 | # discriminator is trained to predict a p(expert|x) 41 | self.loss = loss_gan + loss_grad 42 | else: 43 | raise NotImplementedError 44 | 45 | self.loss.backward() 46 | self.optimizer.update() 47 | 48 | return self.loss 49 | 50 | def get_rewards(self, x): 51 | # - log p(fake|x) == - (log 1 - p(expert|x)) is more stable than log(1 - p(fake|x)) and log(p(expert|x)) 52 | if self.loss_type == 'gan': 53 | return - F.log(1 - self(x)) 54 | return self.model(x) 55 | -------------------------------------------------------------------------------- /irl/gail/gail.py: -------------------------------------------------------------------------------- 1 | import chainer 2 | import numpy as np 3 | import chainer.functions as F 4 | from chainerrl.agents import PPO, TRPO 5 | from chainerrl.policies import SoftmaxPolicy 6 | from itertools import chain 7 | 8 | 9 | class GAIL(PPO): 10 | def __init__(self, discriminator, demonstrations, **kwargs): 11 | # super take arguments for dynamic inheritance 12 | super(self.__class__, self).__init__(**kwargs) 13 | self.discriminator = discriminator 14 | self.demo_states = self.xp.asarray(np.asarray(list(chain(*demonstrations['states']))).astype(np.float32)) 15 | self.demo_actions = self.xp.asarray(np.asarray(list(chain(*demonstrations['actions']))).astype(np.float32)) 16 | 17 | def _update(self, dataset): 18 | # override func 19 | if self.obs_normalizer: 20 | self._update_obs_normalizer(dataset) 21 | xp = self.xp 22 | 23 | dataset_iter = chainer.iterators.SerialIterator( 24 | dataset, self.minibatch_size, shuffle=True) 25 | loss_mean = 0 26 | while dataset_iter.epoch < self.epochs: 27 | batch = dataset_iter.__next__() 28 | states = self.batch_states([b['state'] for b in batch], xp, self.phi) 29 | actions = xp.array([b['action'] for b in batch]) 30 | 31 | demonstrations_indexes = np.random.permutation(len(self.demo_states))[:len(states)] 32 | demo_states, demo_actions = [d[demonstrations_indexes] for d in (self.demo_states, self.demo_actions)] 33 | 34 | if self.obs_normalizer: 35 | states = self.obs_normalizer(states, update=False) 36 | demo_states = self.obs_normalizer(demo_states, update=False) 37 | self.discriminator.train(self.convert_data_to_feed_discriminator(demo_states, demo_actions), 38 | self.convert_data_to_feed_discriminator(states, actions)) 39 | loss_mean += self.discriminator.loss / (self.epochs * self.minibatch_size) 40 | super(self.__class__, self)._update(dataset) 41 | 42 | def _update_if_dataset_is_ready(self): 43 | # override func 44 | dataset_size = ( 45 | sum(len(episode) for episode in self.memory) 46 | + len(self.last_episode) 47 | + (0 if self.batch_last_episode is None else 48 | sum(len(episode) for episode in self.batch_last_episode))) 49 | if dataset_size >= self.update_interval: 50 | # update reward in self.memory 51 | self._flush_last_episode() 52 | transitions = list(chain.from_iterable(self.memory)) 53 | states = self.xp.asarray(np.concatenate([transition['state'][None] for transition in transitions])) 54 | actions = 
self.xp.asarray(np.concatenate([transition['action'][None] for transition in transitions])) 55 | with chainer.configuration.using_config('train', False), chainer.no_backprop_mode(): 56 | rewards = self.discriminator.get_rewards(self.convert_data_to_feed_discriminator(states, actions)).array 57 | i = 0 58 | for episode in self.memory: 59 | for transition in episode: 60 | transition['reward'] = float(rewards[i]) 61 | i += 1 62 | dataset = self._make_dataset() 63 | assert len(dataset) == dataset_size 64 | self._update(dataset) 65 | self.memory = [] 66 | 67 | def convert_data_to_feed_discriminator(self, states, actions, noise_scale=0.1): 68 | xp = self.model.xp 69 | if isinstance(self.model.pi, SoftmaxPolicy): 70 | # if discrete action 71 | actions = xp.eye(self.model.pi.model.out_size, dtype=xp.float32)[actions.astype(xp.int32)] 72 | if noise_scale: 73 | actions += xp.random.normal(loc=0., scale=noise_scale, size=actions.shape) 74 | return F.concat((xp.array(states), xp.array(actions))) 75 | 76 | 77 | def gailtype_constructor(rl_algo=TRPO): 78 | _gail_parent = GAIL.mro()[1] 79 | _gail_func_dict = {func: getattr(GAIL, func) for func in dir(GAIL) if callable(getattr(GAIL, func)) 80 | and (not func.startswith("__") or func == '__init__') 81 | and (not hasattr(_gail_parent, func) 82 | or not getattr(GAIL, func) == getattr(_gail_parent, func))} 83 | return type("GAIL" + rl_algo.__name__.upper(), (rl_algo,), _gail_func_dict) 84 | 85 | 86 | # GAILTRPO do not work because TRPO's interface is not compatible with PPO 87 | GAILTRPO = gailtype_constructor(rl_algo=TRPO) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.16.1 2 | chainer==6.0.0b3 3 | gym==0.10.5 4 | chainerrl==0.6.0 5 | -------------------------------------------------------------------------------- /train_gym.py: -------------------------------------------------------------------------------- 1 | """An example of training PPO against OpenAI Gym Envs. 2 | 3 | This script is an example of training a PPO agent against OpenAI Gym envs. 4 | Both discrete and continuous action spaces are supported. 
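The first positional argument selects the algorithm (ppo, gail, or airl); ppo is also used to collect the expert demonstrations consumed by GAIL/AIRL (see the README for the full commands).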
5 | 6 | To solve CartPole-v0, run: 7 | python train_ppo_gym.py --env CartPole-v0 8 | """ 9 | import argparse 10 | 11 | import chainer 12 | from chainer import functions as F 13 | import gym 14 | import gym.wrappers 15 | 16 | import chainerrl 17 | from chainerrl.agents import a3c 18 | from chainerrl.agents import PPO 19 | from chainerrl import experiments 20 | from chainerrl import links 21 | from chainerrl import misc 22 | from chainerrl.optimizers.nonbias_weight_decay import NonbiasWeightDecay 23 | from chainerrl import policies 24 | 25 | 26 | class A3CFFSoftmax(chainer.ChainList, a3c.A3CModel): 27 | """An example of A3C feedforward softmax policy.""" 28 | 29 | def __init__(self, ndim_obs, n_actions, hidden_sizes=(200, 200)): 30 | self.pi = policies.SoftmaxPolicy( 31 | model=links.MLP(ndim_obs, n_actions, hidden_sizes)) 32 | self.v = links.MLP(ndim_obs, 1, hidden_sizes=hidden_sizes) 33 | super().__init__(self.pi, self.v) 34 | 35 | def pi_and_v(self, state): 36 | return self.pi(state), self.v(state) 37 | 38 | 39 | class A3CFFMellowmax(chainer.ChainList, a3c.A3CModel): 40 | """An example of A3C feedforward mellowmax policy.""" 41 | 42 | def __init__(self, ndim_obs, n_actions, hidden_sizes=(200, 200)): 43 | self.pi = policies.MellowmaxPolicy( 44 | model=links.MLP(ndim_obs, n_actions, hidden_sizes)) 45 | self.v = links.MLP(ndim_obs, 1, hidden_sizes=hidden_sizes) 46 | super().__init__(self.pi, self.v) 47 | 48 | def pi_and_v(self, state): 49 | return self.pi(state), self.v(state) 50 | 51 | 52 | class A3CFFGaussian(chainer.Chain, a3c.A3CModel): 53 | """An example of A3C feedforward Gaussian policy.""" 54 | 55 | def __init__(self, obs_size, action_space, 56 | n_hidden_layers=2, n_hidden_channels=64, 57 | bound_mean=None): 58 | assert bound_mean in [False, True] 59 | super().__init__() 60 | hidden_sizes = (n_hidden_channels,) * n_hidden_layers 61 | with self.init_scope(): 62 | self.pi = policies.FCGaussianPolicyWithStateIndependentCovariance( 63 | obs_size, action_space.low.size, 64 | n_hidden_layers, n_hidden_channels, 65 | var_type='diagonal', nonlinearity=F.tanh, 66 | bound_mean=bound_mean, 67 | min_action=action_space.low, max_action=action_space.high, 68 | mean_wscale=1e-2) 69 | self.v = links.MLP(obs_size, 1, hidden_sizes=hidden_sizes) 70 | 71 | def pi_and_v(self, state): 72 | return self.pi(state), self.v(state) 73 | 74 | 75 | def save_agent_demo(env, agent, out_dir, max_t=2000): 76 | import numpy as np 77 | r, t = 0, 0 78 | agent_observations = [] 79 | agent_actions = [] 80 | while t < max_t: 81 | agent_observations.append([]) 82 | agent_actions.append([]) 83 | obs = env.reset() 84 | while True: 85 | act = agent.act(obs) 86 | agent_observations[-1].append(obs) 87 | agent_actions[-1].append(act) 88 | obs, reward, done, _ = env.step(act) 89 | t += 1 90 | r += reward 91 | if done or t >= max_t: 92 | print(t) 93 | break 94 | 95 | # save numpy array consists of lists 96 | np.savez(out_dir+'/trajectories.npz', states=np.array(agent_observations, dtype=object), 97 | actions=np.array(agent_actions, dtype=object)) 98 | 99 | 100 | def main(): 101 | import logging 102 | 103 | parser = argparse.ArgumentParser() 104 | parser.add_argument('algo', default='ppo', choices=['ppo', 'gail', 'airl'], type=str) 105 | parser.add_argument('--gpu', type=int, default=0) 106 | parser.add_argument('--env', type=str, default='Hopper-v2') 107 | parser.add_argument('--arch', type=str, default='FFGaussian', 108 | choices=('FFSoftmax', 'FFMellowmax', 109 | 'FFGaussian')) 110 | parser.add_argument('--bound-mean', 
action='store_true') 111 | parser.add_argument('--seed', type=int, default=0, 112 | help='Random seed [0, 2 ** 32)') 113 | parser.add_argument('--outdir', type=str, default='results', 114 | help='Directory path to save output files.' 115 | ' If it does not exist, it will be created.') 116 | parser.add_argument('--steps', type=int, default=10 ** 6) 117 | parser.add_argument('--eval-interval', type=int, default=10000) 118 | parser.add_argument('--eval-n-runs', type=int, default=10) 119 | parser.add_argument('--reward-scale-factor', type=float, default=1e-2) 120 | parser.add_argument('--standardize-advantages', action='store_true') 121 | parser.add_argument('--render', action='store_true', default=False) 122 | parser.add_argument('--lr', type=float, default=3e-4) 123 | parser.add_argument('--weight-decay', type=float, default=0.0) 124 | parser.add_argument('--demo', action='store_true', default=False) 125 | parser.add_argument('--load', type=str, default='') 126 | parser.add_argument('--load_demo', type=str, default='') 127 | parser.add_argument('--logger-level', type=int, default=logging.DEBUG) 128 | parser.add_argument('--monitor', action='store_true') 129 | 130 | parser.add_argument('--update-interval', type=int, default=2048) 131 | parser.add_argument('--batchsize', type=int, default=64) 132 | parser.add_argument('--epochs', type=int, default=10) 133 | parser.add_argument('--entropy-coef', type=float, default=0.0) 134 | args = parser.parse_args() 135 | 136 | logging.basicConfig(level=args.logger_level) 137 | 138 | # Set a random seed used in ChainerRL 139 | misc.set_random_seed(args.seed, gpus=(args.gpu,)) 140 | 141 | if not (args.demo and args.load): 142 | args.outdir = experiments.prepare_output_dir(args, args.outdir) 143 | 144 | def make_env(test): 145 | env = gym.make(args.env) 146 | # Use different random seeds for train and test envs 147 | env_seed = 2 ** 32 - 1 - args.seed if test else args.seed 148 | env.seed(env_seed) 149 | # Cast observations to float32 because our model uses float32 150 | env = chainerrl.wrappers.CastObservationToFloat32(env) 151 | if args.monitor: 152 | env = gym.wrappers.Monitor(env, args.outdir) 153 | if not test: 154 | # Scale rewards (and thus returns) to a reasonable range so that 155 | # training is easier 156 | env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor) 157 | if args.render: 158 | env = chainerrl.wrappers.Render(env) 159 | return env 160 | 161 | sample_env = gym.make(args.env) 162 | timestep_limit = sample_env.spec.tags.get( 163 | 'wrapper_config.TimeLimit.max_episode_steps') 164 | obs_space = sample_env.observation_space 165 | action_space = sample_env.action_space 166 | 167 | # Normalize observations based on their empirical mean and variance 168 | obs_normalizer = chainerrl.links.EmpiricalNormalization( 169 | obs_space.low.size, clip_threshold=5) 170 | 171 | # Switch policy types accordingly to action space types 172 | if args.arch == 'FFSoftmax': 173 | model = A3CFFSoftmax(obs_space.low.size, action_space.n) 174 | elif args.arch == 'FFMellowmax': 175 | model = A3CFFMellowmax(obs_space.low.size, action_space.n) 176 | elif args.arch == 'FFGaussian': 177 | model = A3CFFGaussian(obs_space.low.size, action_space, 178 | bound_mean=args.bound_mean) 179 | 180 | opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5) 181 | opt.setup(model) 182 | if args.weight_decay > 0: 183 | opt.add_hook(NonbiasWeightDecay(args.weight_decay)) 184 | if args.algo == 'ppo': 185 | agent = PPO(model, opt, 186 | obs_normalizer=obs_normalizer, 187 | 
gpu=args.gpu, 188 | update_interval=args.update_interval, 189 | minibatch_size=args.batchsize, epochs=args.epochs, 190 | clip_eps_vf=None, entropy_coef=args.entropy_coef, 191 | standardize_advantages=args.standardize_advantages, 192 | ) 193 | elif args.algo == 'gail': 194 | import numpy as np 195 | from irl.gail import GAIL 196 | from irl.gail import Discriminator 197 | demonstrations = np.load(args.load_demo) 198 | D = Discriminator(gpu=args.gpu) 199 | agent = GAIL(demonstrations=demonstrations, discriminator=D, 200 | model=model, optimizer=opt, 201 | obs_normalizer=obs_normalizer, 202 | gpu=args.gpu, 203 | update_interval=args.update_interval, 204 | minibatch_size=args.batchsize, epochs=args.epochs, 205 | clip_eps_vf=None, entropy_coef=args.entropy_coef, 206 | standardize_advantages=args.standardize_advantages,) 207 | elif args.algo == 'airl': 208 | import numpy as np 209 | from irl.airl import AIRL as Agent 210 | from irl.airl import Discriminator 211 | # obs_normalizer = None 212 | demonstrations = np.load(args.load_demo) 213 | D = Discriminator(gpu=args.gpu) 214 | agent = Agent(demonstrations=demonstrations, discriminator=D, 215 | model=model, optimizer=opt, 216 | obs_normalizer=obs_normalizer, 217 | gpu=args.gpu, 218 | update_interval=args.update_interval, 219 | minibatch_size=args.batchsize, epochs=args.epochs, 220 | clip_eps_vf=None, entropy_coef=args.entropy_coef, 221 | standardize_advantages=args.standardize_advantages,) 222 | 223 | if args.load: 224 | agent.load(args.load) 225 | 226 | if args.demo: 227 | env = make_env(True) 228 | eval_stats = experiments.eval_performance( 229 | env=env, 230 | agent=agent, 231 | n_steps=None, 232 | n_episodes=args.eval_n_runs, 233 | max_episode_len=timestep_limit) 234 | print('n_runs: {} mean: {} median: {} stdev {}'.format( 235 | args.eval_n_runs, eval_stats['mean'], eval_stats['median'], 236 | eval_stats['stdev'])) 237 | outdir = args.load if args.load else args.outdir 238 | save_agent_demo(make_env(False), agent, outdir) 239 | else: 240 | # Linearly decay the learning rate to zero 241 | def lr_setter(env, agent, value): 242 | agent.optimizer.alpha = value 243 | 244 | lr_decay_hook = experiments.LinearInterpolationHook( 245 | args.steps, args.lr, 0, lr_setter) 246 | 247 | # Linearly decay the clipping parameter to zero 248 | def clip_eps_setter(env, agent, value): 249 | agent.clip_eps = max(value, 1e-8) 250 | 251 | clip_eps_decay_hook = experiments.LinearInterpolationHook( 252 | args.steps, 0.2, 0, clip_eps_setter) 253 | 254 | experiments.train_agent_with_evaluation( 255 | agent=agent, 256 | env=make_env(False), 257 | eval_env=make_env(True), 258 | outdir=args.outdir, 259 | steps=args.steps, 260 | eval_n_steps=None, 261 | eval_n_episodes=args.eval_n_runs, 262 | eval_interval=args.eval_interval, 263 | train_max_episode_len=timestep_limit, 264 | save_best_so_far_agent=False, 265 | step_hooks=[ 266 | lr_decay_hook, 267 | clip_eps_decay_hook, 268 | ], 269 | ) 270 | save_agent_demo(make_env(False), agent, args.outdir) 271 | 272 | 273 | if __name__ == '__main__': 274 | main() 275 | --------------------------------------------------------------------------------
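As a closing illustration of how GAIL feeds data to its discriminator, the snippet below mirrors the logic of `convert_data_to_feed_discriminator` in `irl/gail/gail.py` for a discrete-action environment, written in plain NumPy; the batch size, observation dimension, and noise scale are illustrative only:

```python
import numpy as np

n_actions, noise_scale = 2, 0.1                      # CartPole-v0 has two discrete actions
states = np.random.randn(8, 4).astype(np.float32)    # (batch, obs_dim)
actions = np.random.randint(n_actions, size=8)       # integer action ids from a rollout

# One-hot encode the discrete actions, jitter them with Gaussian noise, and
# concatenate them with the states; this is the input given to the GAIL discriminator.
one_hot = np.eye(n_actions, dtype=np.float32)[actions]
one_hot += np.random.normal(loc=0.0, scale=noise_scale, size=one_hot.shape).astype(np.float32)
discriminator_input = np.concatenate([states, one_hot], axis=1)
print(discriminator_input.shape)                     # (8, 6) = (batch, obs_dim + n_actions)
```

In the repository the same conversion runs on `self.model.xp` (NumPy or CuPy) with `F.concat`, and it is applied to both the demonstration batch and the policy batch before `Discriminator.train` is called; the Gaussian jitter on the one-hot actions is presumably there to keep the discriminator from keying on the exact one-hot vertices.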