├── .gitignore
├── LICENSE
├── README.md
├── irl
│   ├── __init__.py
│   ├── airl
│   │   ├── __init__.py
│   │   ├── airl.py
│   │   └── discriminator.py
│   ├── common
│   │   ├── __init__.py
│   │   ├── model.py
│   │   └── utils
│   │       ├── __init__.py
│   │       ├── get_states_actions_next_states.py
│   │       └── mean_or_nan.py
│   └── gail
│       ├── __init__.py
│       ├── discriminator.py
│       └── gail.py
├── requirements.txt
└── train_gym.py

/.gitignore:
--------------------------------------------------------------------------------
1 | results/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2019 Yusuke Nakata
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Deep Inverse Reinforcement Learning
2 | 
3 | ## Contents
4 | 
5 | Chainer implementation of Adversarial Inverse Reinforcement Learning (AIRL) and Generative Adversarial Imitation Learning (GAIL).
6 | The code depends heavily on the reinforcement learning package [ChainerRL](https://github.com/chainer/chainerrl).
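Both GAIL and AIRL learn from a demonstration file (`trajectories.npz`) that `train_gym.py` writes after expert PPO training: an `.npz` archive whose `states` and `actions` entries each hold one list of per-step values per episode. A minimal sketch of that layout with synthetic CartPole-like data (episode lengths, shapes, and the output path are illustrative only):

```python
import numpy as np

# Two synthetic episodes of different lengths; CartPole-v0 observations are 4-dimensional.
rng = np.random.RandomState(0)
episode_states = [[rng.randn(4).astype(np.float32) for _ in range(n)] for n in (17, 23)]
episode_actions = [[int(rng.randint(2)) for _ in range(n)] for n in (17, 23)]

# Same layout as save_agent_demo() in train_gym.py: object arrays of per-episode lists,
# so episodes may have different lengths.
np.savez('trajectories.npz',
         states=np.array(episode_states, dtype=object),
         actions=np.array(episode_actions, dtype=object))
```

Pass the path of the resulting file via `--load_demo` in the commands below.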
7 | 
8 | ## Commands
9 | 
10 | Train a PPO expert and sample expert trajectories
11 | ```bash
12 | python train_gym.py ppo --gpu $gpu_id --env CartPole-v0 --arch FFSoftmax --steps 50000
13 | ```
14 | 
15 | Run GAIL
16 | ```bash
17 | python train_gym.py gail --gpu $gpu_id --env CartPole-v0 --arch FFSoftmax --steps 100000 \
18 |     --load_demo ${PathOfDemonstrationNpzFile} --update-interval 128 --entropy-coef 0.01
19 | ```
20 | 
21 | Run AIRL
22 | ```bash
23 | python train_gym.py airl --gpu $gpu_id --env CartPole-v0 --arch FFSoftmax --steps 100000 \
24 |     --load_demo ${PathOfDemonstrationNpzFile} --update-interval 128 --entropy-coef 0.01
25 | ```
26 | 
27 | ## LICENSE
28 | MIT
29 | 
--------------------------------------------------------------------------------
/irl/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uidilr/deepirl_chainer/45f6134fe457bdae1484e4847ab0701f39940faa/irl/__init__.py
--------------------------------------------------------------------------------
/irl/airl/__init__.py:
--------------------------------------------------------------------------------
1 | from irl.airl.airl import AIRL
2 | from irl.airl.discriminator import Discriminator
--------------------------------------------------------------------------------
/irl/airl/airl.py:
--------------------------------------------------------------------------------
1 | import chainer
2 | import numpy as np
3 | from chainerrl.agents import PPO
4 | from itertools import chain
5 | import collections
6 | from chainerrl.policies import SoftmaxPolicy
7 | 
8 | from irl.airl.discriminator import Discriminator
9 | from irl.common.utils.mean_or_nan import mean_or_nan
10 | from irl.common.utils import get_states_actions_next_states
11 | 
12 | 
13 | 
14 | class AIRL(PPO):
15 |     def __init__(self, discriminator: Discriminator, demonstrations, discriminator_loss_stats_window=1000, **kwargs):
16 | 
17 |         super().__init__(**kwargs)
18 |         self.discriminator = discriminator
19 | 
20 |         self.demo_states, self.demo_actions, self.demo_next_states = \
21 |             get_states_actions_next_states(demonstrations['states'], demonstrations['actions'], xp=self.xp)
22 |         if isinstance(self.model.pi, SoftmaxPolicy):
23 |             # action space is discrete, so cast demonstration actions to integer ids
24 |             self.demo_actions = self.demo_actions.astype(dtype=self.xp.int32)
25 | 
26 |         self.discriminator_loss_record = collections.deque(maxlen=discriminator_loss_stats_window)
27 |         self.reward_mean_record = collections.deque(maxlen=discriminator_loss_stats_window)
28 | 
29 |     def _update(self, dataset):
30 |         # overridden from PPO
31 |         xp = self.xp
32 | 
33 |         if self.obs_normalizer:
34 |             self._update_obs_normalizer(dataset)
35 | 
36 |         dataset_iter = chainer.iterators.SerialIterator(dataset, self.minibatch_size, shuffle=True)
37 |         loss_mean = 0
38 |         while dataset_iter.epoch < self.epochs:
39 |             # create the policy batch for this iteration
40 |             batch = dataset_iter.__next__()
41 |             states = self.batch_states([b['state'] for b in batch], xp, self.phi)
42 |             next_states = self.batch_states([b['next_state'] for b in batch], xp, self.phi)
43 |             actions = xp.array([b['action'] for b in batch])
44 | 
45 |             # create the expert batch for this iteration
46 |             demonstrations_indexes = np.random.permutation(len(self.demo_states))[:self.minibatch_size]
47 |             demo_states, demo_actions, demo_next_states = [d[demonstrations_indexes]
48 |                                                            for d in (self.demo_states, self.demo_actions,
49 |                                                                      self.demo_next_states)]
50 | 
51 |             states, demo_states, next_states, demo_next_states = [(self.obs_normalizer(d, update=False)
52 |                                                                     if self.obs_normalizer else d)
53 |                                                                    for d in [states, demo_states,
54 |                                                                              next_states, demo_next_states]]
55 | 
56 |             with chainer.configuration.using_config('train', False), chainer.no_backprop_mode():
57 |                 action_log_probs = self.get_probs(states, actions)
58 |                 demo_action_log_probs = self.get_probs(demo_states, demo_actions)
59 | 
60 |             loss = self.discriminator.train(expert_states=demo_states, expert_next_states=demo_next_states,
61 |                                             expert_action_probs=demo_action_log_probs, fake_states=states,
62 |                                             fake_next_states=next_states, fake_action_probs=action_log_probs,
63 |                                             gamma=self.gamma)
64 |             loss_mean += loss / (self.epochs * self.minibatch_size)
65 |         self.discriminator_loss_record.append(float(loss_mean.array))
66 |         super()._update(dataset)
67 | 
68 |     def _update_if_dataset_is_ready(self):
69 |         # overridden from PPO
70 |         dataset_size = (
71 |             sum(len(episode) for episode in self.memory)
72 |             + len(self.last_episode)
73 |             + (0 if self.batch_last_episode is None else sum(
74 |                 len(episode) for episode in self.batch_last_episode)))
75 |         if dataset_size >= self.update_interval:
76 |             self._flush_last_episode()
77 | 
78 |             # replace the environment rewards stored in self.memory with discriminator rewards
79 |             transitions = list(chain(*self.memory))
80 |             with chainer.configuration.using_config('train', False), chainer.no_backprop_mode():
81 |                 rewards = self.discriminator.get_rewards(self.xp.asarray(np.concatenate([transition['state'][None]
82 |                                                                                          for transition in transitions]))).array
83 |             self.reward_mean_record.append(float(np.mean(rewards)))
84 |             i = 0
85 |             for episode in self.memory:
86 |                 for transition in episode:
87 |                     transition['reward'] = float(rewards[i])
88 |                     i += 1
89 |             assert self.memory[0][0]['reward'] == float(rewards[0]), 'rewards were not replaced.'
90 | 
91 |             dataset = self._make_dataset()
92 |             assert len(dataset) == dataset_size
93 |             self._update(dataset)
94 |             self.memory = []
95 | 
96 |     def get_probs(self, states, actions):
97 |         target_distribs, _ = self.model(states)
98 |         return target_distribs.log_prob(actions)
99 | 
100 |     def get_statistics(self):
101 |         return [('average_discriminator_loss', mean_or_nan(self.discriminator_loss_record)),
102 |                 ('average_rewards', mean_or_nan(self.reward_mean_record))] + super().get_statistics()
103 | 
104 | 
105 | 
106 | 
107 | 
--------------------------------------------------------------------------------
/irl/airl/discriminator.py:
--------------------------------------------------------------------------------
1 | import chainer
2 | import chainer.functions as F
3 | from irl.common.model import MLP
4 | from chainer.link_hooks.spectral_normalization import SpectralNormalization
5 | 
6 | 
7 | class Discriminator:
8 |     def __init__(self, n_layer=4, n_units=32, gpu=-1):
9 |         self.reward_net = MLP(n_layer, n_units, 1, hook=SpectralNormalization, hook_params=dict(factor=1))
10 |         self.value_net = MLP(n_layer, n_units, 1)  # , hook=SpectralNormalization, hook_params=dict(factor=10))
11 |         if gpu >= 0:
12 |             self.reward_net.to_gpu(gpu)
13 |             self.value_net.to_gpu(gpu)
14 |         # Adding spectral normalization with a small factor to the value net makes training unstable,
15 |         # but why does the adversarial loss decrease when we add it to the value net?
16 |         # Perhaps because the Lipschitz factor can then be bounded by the Lipschitz constant of the reward net?
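        # reward_net approximates the state-only reward r(s) and value_net the shaping term V(s);
        # train() below combines them into the discriminator logit
        # f(s, a, s') = r(s) + gamma * V(s') - V(s) - log pi(a|s), with p(expert|s, a, s') = sigmoid(f).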
17 |         self.reward_optimizer = chainer.optimizers.Adam()
18 |         self.value_optimizer = chainer.optimizers.Adam()
19 |         self.reward_optimizer.setup(self.reward_net)
20 |         self.value_optimizer.setup(self.value_net)
21 | 
22 |     def __call__(self, x):
23 |         return self.reward_net(x), self.value_net(x)
24 | 
25 |     def train(self, expert_states, expert_next_states, expert_action_probs, fake_states, fake_next_states,
26 |               fake_action_probs, gamma):
27 | 
28 |         def logits(states, next_states, log_action_probs):
29 |             # p(expert|state, action) = sigmoid(logits)
30 |             rewards = self.reward_net(states)
31 |             # print(F.mean(rewards))
32 |             state_values = self.value_net(states)
33 |             next_state_values = self.value_net(next_states)
34 |             return rewards + gamma * next_state_values - state_values - log_action_probs[:, None].array
35 | 
36 |         # Writing the logistic loss with softplus stabilises training:
37 |         # softplus(-logits) == -log(sigmoid(logits)) and softplus(logits) == -log(1 - sigmoid(logits))
38 |         # print('expert: ', end='')
39 |         loss = F.mean(F.softplus(-logits(expert_states, expert_next_states, expert_action_probs)))
40 |         # print('fake: ', end='')
41 |         loss += F.mean(F.softplus(logits(fake_states, fake_next_states, fake_action_probs)))
42 | 
43 |         # add gradient penalty for the reward net (kept here commented out)
44 |         # xp = chainer.cuda.get_array_module(expert_states)
45 |         # e = xp.random.uniform(0., 1., len(expert_states))[:, None].astype(xp.float32)
46 |         # x_hat = chainer.Variable((e * expert_states + (1 - e) * fake_states), requires_grad=True)
47 |         # grad, = chainer.grad([self.reward_net(x_hat)], [x_hat], enable_double_backprop=True)
48 |         # loss_grad = 0.1 * F.mean(F.sqrt(F.batch_l2_norm_squared(grad)))
49 |         # loss += loss_grad
50 | 
51 |         self.reward_net.cleargrads()
52 |         self.value_net.cleargrads()
53 |         loss.backward()
54 |         self.reward_optimizer.update()
55 |         self.value_optimizer.update()
56 |         return loss
57 | 
58 |     def get_rewards(self, x):
59 |         return self.reward_net(x)
60 | 
--------------------------------------------------------------------------------
/irl/common/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uidilr/deepirl_chainer/45f6134fe457bdae1484e4847ab0701f39940faa/irl/common/__init__.py
--------------------------------------------------------------------------------
/irl/common/model.py:
--------------------------------------------------------------------------------
1 | import chainer
2 | import chainer.functions as F
3 | import chainer.links as L
4 | 
5 | 
6 | def pass_fn(x):
7 |     return x
8 | 
9 | 
10 | class MLP(chainer.ChainList):
11 |     def __init__(self, n_layer, n_units, n_out, activation=F.leaky_relu, out_activation=pass_fn, hook=None, hook_params=None):
12 |         super().__init__()
13 | 
14 |         for _ in range(n_layer-1):
15 |             self.add_link(L.Linear(None, n_units))
16 |         self.add_link(L.Linear(None, n_out))
17 |         self.activations = [activation] * (n_layer - 1) + [out_activation]
18 | 
19 |         if hook:
20 |             hook_params = dict() if hook_params is None else hook_params
21 |             for link in self.children():
22 |                 link.add_hook(hook(**hook_params))
23 | 
24 |     def forward(self, x):
25 |         for link, act in zip(self.children(), self.activations):
26 |             x = act(link(x))
27 |         return x
28 | 
29 |     def __call__(self, x):
30 |         return self.forward(x)
--------------------------------------------------------------------------------
/irl/common/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from irl.common.utils.get_states_actions_next_states import get_states_actions_next_states
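To make the AIRL discriminator objective above concrete, here is a hypothetical smoke test that wires the `MLP` from `irl/common/model.py` into the logit computed inside `Discriminator.train()`; the batch size, observation dimension, and `gamma` value are illustrative, and the same dummy batch stands in for both expert and policy data:

```python
import numpy as np
import chainer.functions as F

from irl.common.model import MLP

batch, obs_dim, gamma = 8, 4, 0.995
reward_net = MLP(n_layer=4, n_units=32, n_out=1)   # approximates r(s)
value_net = MLP(n_layer=4, n_units=32, n_out=1)    # approximates V(s)

states = np.random.randn(batch, obs_dim).astype(np.float32)
next_states = np.random.randn(batch, obs_dim).astype(np.float32)
log_pi = np.random.randn(batch).astype(np.float32)  # stands in for log pi(a|s)

# Discriminator logit f(s, a, s') = r(s) + gamma * V(s') - V(s) - log pi(a|s)
f = reward_net(states) + gamma * value_net(next_states) - value_net(states) - log_pi[:, None]

# The two logistic loss terms used in Discriminator.train(), here on the same dummy batch:
expert_loss = F.mean(F.softplus(-f))   # -log D on expert transitions
policy_loss = F.mean(F.softplus(f))    # -log (1 - D) on policy transitions
print(float(expert_loss.array), float(policy_loss.array))
```

In actual training the expert term is evaluated on demonstration transitions and the policy term on PPO rollouts, as `AIRL._update` above shows.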
--------------------------------------------------------------------------------
/irl/common/utils/get_states_actions_next_states.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import copy
3 | from itertools import chain
4 | 
5 | 
6 | def get_states_actions_next_states(states, actions, xp=np):
7 |     # Prepare demonstrations
8 |     # a deep copy is necessary because demo_states can be a list of lists
9 |     next_states = copy.deepcopy(states)
10 | 
11 |     if states.ndim > 2:
12 |         # demo_states.shape = (n_episode, n_steps, *observation.shape)
13 |         # if every episode in the demo has the same length, demo_states and demo_actions are numpy arrays
14 |         # drop the last state and action because the last state has no next state
15 |         states = states[:, :-1, ...]
16 |         actions = actions[:, :-1, ...]
17 |         # drop the first state so that demo_next_states[:, i, ...] is the next state of demo_states[:, i, ...]
18 |         next_states = next_states[:, 1:, ...]
19 |     else:
20 |         # if episode lengths differ, drop the last state and action and the first next state per episode
21 |         for demo_states_epi, demo_action_epi, demo_next_state_epi \
22 |                 in zip(states, actions, next_states):
23 |             # drop the last state and action because the last state has no next state
24 |             del demo_states_epi[-1]
25 |             del demo_action_epi[-1]
26 |             # drop the first state so that demo_next_states_epi[i] is the next state of demo_states_epi[i]
27 |             del demo_next_state_epi[0]
28 | 
29 |     states = xp.asarray(np.array(list(chain(*states))).astype(dtype=np.float32))
30 |     next_states = xp.asarray(np.array(list(chain(*next_states))).astype(dtype=np.float32))
31 |     actions = xp.asarray(np.array(list(chain(*actions))).astype(dtype=np.float32))
32 |     return states, actions, next_states
--------------------------------------------------------------------------------
/irl/common/utils/mean_or_nan.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | 
4 | def mean_or_nan(xs):
5 |     """Return the mean of a non-empty sequence, or numpy.nan for an empty one."""
6 |     return np.mean(xs) if xs else np.nan
--------------------------------------------------------------------------------
/irl/gail/__init__.py:
--------------------------------------------------------------------------------
1 | from irl.gail.discriminator import Discriminator
2 | from irl.gail.gail import GAIL
3 | 
--------------------------------------------------------------------------------
/irl/gail/discriminator.py:
--------------------------------------------------------------------------------
1 | import chainer
2 | import chainer.functions as F
3 | from irl.common.model import MLP
4 | 
5 | 
6 | class Discriminator:
7 |     def __init__(self, n_layer=3, n_units=64, loss_type='wgangp', gpu=-1):
8 |         self.model = MLP(n_layer, n_units, 1)
9 | 
10 |         if gpu >= 0:
11 |             self.model.to_gpu(gpu)
12 | 
13 |         self.optimizer = chainer.optimizers.Adam(alpha=1e-5, eps=1e-5)
14 |         self.optimizer.setup(self.model)
15 |         self.loss_type = loss_type
16 |         self.loss = None
17 | 
18 |     def __call__(self, x):
19 |         return F.sigmoid(self.model(x))
20 | 
21 |     def train(self, expert_data, fake_data):
22 |         self.model.cleargrads()
23 | 
24 |         if self.loss_type == 'gan':
25 |             d_expert = self.model(expert_data)
26 |             d_fake = self.model(fake_data)
27 |             # the discriminator is trained to predict p(expert|x)
28 |             self.loss = F.mean(F.softplus(-d_expert))
29 |             self.loss += F.mean(F.softplus(d_fake))
30 |         elif self.loss_type == 'wgangp':
31 |             # sampling along straight 
lines 32 | xp = chainer.cuda.get_array_module(expert_data) 33 | e = xp.random.uniform(0., 1., len(expert_data))[:, None].astype(xp.float32) 34 | x_hat = chainer.Variable((e * expert_data + (1 - e) * fake_data).array, requires_grad=True) 35 | grad, = chainer.grad([self.model(x_hat)], [x_hat], enable_double_backprop=True) 36 | grad = F.sqrt(F.batch_l2_norm_squared(grad)) 37 | 38 | loss_grad = 1 * F.mean_squared_error(grad, xp.ones_like(grad.data)) 39 | loss_gan = F.mean(self.model(fake_data) - self.model(expert_data)) 40 | # discriminator is trained to predict a p(expert|x) 41 | self.loss = loss_gan + loss_grad 42 | else: 43 | raise NotImplementedError 44 | 45 | self.loss.backward() 46 | self.optimizer.update() 47 | 48 | return self.loss 49 | 50 | def get_rewards(self, x): 51 | # - log p(fake|x) == - (log 1 - p(expert|x)) is more stable than log(1 - p(fake|x)) and log(p(expert|x)) 52 | if self.loss_type == 'gan': 53 | return - F.log(1 - self(x)) 54 | return self.model(x) 55 | -------------------------------------------------------------------------------- /irl/gail/gail.py: -------------------------------------------------------------------------------- 1 | import chainer 2 | import numpy as np 3 | import chainer.functions as F 4 | from chainerrl.agents import PPO, TRPO 5 | from chainerrl.policies import SoftmaxPolicy 6 | from itertools import chain 7 | 8 | 9 | class GAIL(PPO): 10 | def __init__(self, discriminator, demonstrations, **kwargs): 11 | # super take arguments for dynamic inheritance 12 | super(self.__class__, self).__init__(**kwargs) 13 | self.discriminator = discriminator 14 | self.demo_states = self.xp.asarray(np.asarray(list(chain(*demonstrations['states']))).astype(np.float32)) 15 | self.demo_actions = self.xp.asarray(np.asarray(list(chain(*demonstrations['actions']))).astype(np.float32)) 16 | 17 | def _update(self, dataset): 18 | # override func 19 | if self.obs_normalizer: 20 | self._update_obs_normalizer(dataset) 21 | xp = self.xp 22 | 23 | dataset_iter = chainer.iterators.SerialIterator( 24 | dataset, self.minibatch_size, shuffle=True) 25 | loss_mean = 0 26 | while dataset_iter.epoch < self.epochs: 27 | batch = dataset_iter.__next__() 28 | states = self.batch_states([b['state'] for b in batch], xp, self.phi) 29 | actions = xp.array([b['action'] for b in batch]) 30 | 31 | demonstrations_indexes = np.random.permutation(len(self.demo_states))[:len(states)] 32 | demo_states, demo_actions = [d[demonstrations_indexes] for d in (self.demo_states, self.demo_actions)] 33 | 34 | if self.obs_normalizer: 35 | states = self.obs_normalizer(states, update=False) 36 | demo_states = self.obs_normalizer(demo_states, update=False) 37 | self.discriminator.train(self.convert_data_to_feed_discriminator(demo_states, demo_actions), 38 | self.convert_data_to_feed_discriminator(states, actions)) 39 | loss_mean += self.discriminator.loss / (self.epochs * self.minibatch_size) 40 | super(self.__class__, self)._update(dataset) 41 | 42 | def _update_if_dataset_is_ready(self): 43 | # override func 44 | dataset_size = ( 45 | sum(len(episode) for episode in self.memory) 46 | + len(self.last_episode) 47 | + (0 if self.batch_last_episode is None else 48 | sum(len(episode) for episode in self.batch_last_episode))) 49 | if dataset_size >= self.update_interval: 50 | # update reward in self.memory 51 | self._flush_last_episode() 52 | transitions = list(chain.from_iterable(self.memory)) 53 | states = self.xp.asarray(np.concatenate([transition['state'][None] for transition in transitions])) 54 | actions = 
self.xp.asarray(np.concatenate([transition['action'][None] for transition in transitions])) 55 | with chainer.configuration.using_config('train', False), chainer.no_backprop_mode(): 56 | rewards = self.discriminator.get_rewards(self.convert_data_to_feed_discriminator(states, actions)).array 57 | i = 0 58 | for episode in self.memory: 59 | for transition in episode: 60 | transition['reward'] = float(rewards[i]) 61 | i += 1 62 | dataset = self._make_dataset() 63 | assert len(dataset) == dataset_size 64 | self._update(dataset) 65 | self.memory = [] 66 | 67 | def convert_data_to_feed_discriminator(self, states, actions, noise_scale=0.1): 68 | xp = self.model.xp 69 | if isinstance(self.model.pi, SoftmaxPolicy): 70 | # if discrete action 71 | actions = xp.eye(self.model.pi.model.out_size, dtype=xp.float32)[actions.astype(xp.int32)] 72 | if noise_scale: 73 | actions += xp.random.normal(loc=0., scale=noise_scale, size=actions.shape) 74 | return F.concat((xp.array(states), xp.array(actions))) 75 | 76 | 77 | def gailtype_constructor(rl_algo=TRPO): 78 | _gail_parent = GAIL.mro()[1] 79 | _gail_func_dict = {func: getattr(GAIL, func) for func in dir(GAIL) if callable(getattr(GAIL, func)) 80 | and (not func.startswith("__") or func == '__init__') 81 | and (not hasattr(_gail_parent, func) 82 | or not getattr(GAIL, func) == getattr(_gail_parent, func))} 83 | return type("GAIL" + rl_algo.__name__.upper(), (rl_algo,), _gail_func_dict) 84 | 85 | 86 | # GAILTRPO do not work because TRPO's interface is not compatible with PPO 87 | GAILTRPO = gailtype_constructor(rl_algo=TRPO) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.16.1 2 | chainer==6.0.0b3 3 | gym==0.10.5 4 | chainerrl==0.6.0 5 | -------------------------------------------------------------------------------- /train_gym.py: -------------------------------------------------------------------------------- 1 | """An example of training PPO against OpenAI Gym Envs. 2 | 3 | This script is an example of training a PPO agent against OpenAI Gym envs. 4 | Both discrete and continuous action spaces are supported. 
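The first positional argument selects the algorithm (ppo, gail, or airl); ppo is also used to collect the expert demonstrations consumed by GAIL/AIRL (see the README for the full commands).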
5 | 6 | To solve CartPole-v0, run: 7 | python train_ppo_gym.py --env CartPole-v0 8 | """ 9 | import argparse 10 | 11 | import chainer 12 | from chainer import functions as F 13 | import gym 14 | import gym.wrappers 15 | 16 | import chainerrl 17 | from chainerrl.agents import a3c 18 | from chainerrl.agents import PPO 19 | from chainerrl import experiments 20 | from chainerrl import links 21 | from chainerrl import misc 22 | from chainerrl.optimizers.nonbias_weight_decay import NonbiasWeightDecay 23 | from chainerrl import policies 24 | 25 | 26 | class A3CFFSoftmax(chainer.ChainList, a3c.A3CModel): 27 | """An example of A3C feedforward softmax policy.""" 28 | 29 | def __init__(self, ndim_obs, n_actions, hidden_sizes=(200, 200)): 30 | self.pi = policies.SoftmaxPolicy( 31 | model=links.MLP(ndim_obs, n_actions, hidden_sizes)) 32 | self.v = links.MLP(ndim_obs, 1, hidden_sizes=hidden_sizes) 33 | super().__init__(self.pi, self.v) 34 | 35 | def pi_and_v(self, state): 36 | return self.pi(state), self.v(state) 37 | 38 | 39 | class A3CFFMellowmax(chainer.ChainList, a3c.A3CModel): 40 | """An example of A3C feedforward mellowmax policy.""" 41 | 42 | def __init__(self, ndim_obs, n_actions, hidden_sizes=(200, 200)): 43 | self.pi = policies.MellowmaxPolicy( 44 | model=links.MLP(ndim_obs, n_actions, hidden_sizes)) 45 | self.v = links.MLP(ndim_obs, 1, hidden_sizes=hidden_sizes) 46 | super().__init__(self.pi, self.v) 47 | 48 | def pi_and_v(self, state): 49 | return self.pi(state), self.v(state) 50 | 51 | 52 | class A3CFFGaussian(chainer.Chain, a3c.A3CModel): 53 | """An example of A3C feedforward Gaussian policy.""" 54 | 55 | def __init__(self, obs_size, action_space, 56 | n_hidden_layers=2, n_hidden_channels=64, 57 | bound_mean=None): 58 | assert bound_mean in [False, True] 59 | super().__init__() 60 | hidden_sizes = (n_hidden_channels,) * n_hidden_layers 61 | with self.init_scope(): 62 | self.pi = policies.FCGaussianPolicyWithStateIndependentCovariance( 63 | obs_size, action_space.low.size, 64 | n_hidden_layers, n_hidden_channels, 65 | var_type='diagonal', nonlinearity=F.tanh, 66 | bound_mean=bound_mean, 67 | min_action=action_space.low, max_action=action_space.high, 68 | mean_wscale=1e-2) 69 | self.v = links.MLP(obs_size, 1, hidden_sizes=hidden_sizes) 70 | 71 | def pi_and_v(self, state): 72 | return self.pi(state), self.v(state) 73 | 74 | 75 | def save_agent_demo(env, agent, out_dir, max_t=2000): 76 | import numpy as np 77 | r, t = 0, 0 78 | agent_observations = [] 79 | agent_actions = [] 80 | while t < max_t: 81 | agent_observations.append([]) 82 | agent_actions.append([]) 83 | obs = env.reset() 84 | while True: 85 | act = agent.act(obs) 86 | agent_observations[-1].append(obs) 87 | agent_actions[-1].append(act) 88 | obs, reward, done, _ = env.step(act) 89 | t += 1 90 | r += reward 91 | if done or t >= max_t: 92 | print(t) 93 | break 94 | 95 | # save numpy array consists of lists 96 | np.savez(out_dir+'/trajectories.npz', states=np.array(agent_observations, dtype=object), 97 | actions=np.array(agent_actions, dtype=object)) 98 | 99 | 100 | def main(): 101 | import logging 102 | 103 | parser = argparse.ArgumentParser() 104 | parser.add_argument('algo', default='ppo', choices=['ppo', 'gail', 'airl'], type=str) 105 | parser.add_argument('--gpu', type=int, default=0) 106 | parser.add_argument('--env', type=str, default='Hopper-v2') 107 | parser.add_argument('--arch', type=str, default='FFGaussian', 108 | choices=('FFSoftmax', 'FFMellowmax', 109 | 'FFGaussian')) 110 | parser.add_argument('--bound-mean', 
action='store_true') 111 | parser.add_argument('--seed', type=int, default=0, 112 | help='Random seed [0, 2 ** 32)') 113 | parser.add_argument('--outdir', type=str, default='results', 114 | help='Directory path to save output files.' 115 | ' If it does not exist, it will be created.') 116 | parser.add_argument('--steps', type=int, default=10 ** 6) 117 | parser.add_argument('--eval-interval', type=int, default=10000) 118 | parser.add_argument('--eval-n-runs', type=int, default=10) 119 | parser.add_argument('--reward-scale-factor', type=float, default=1e-2) 120 | parser.add_argument('--standardize-advantages', action='store_true') 121 | parser.add_argument('--render', action='store_true', default=False) 122 | parser.add_argument('--lr', type=float, default=3e-4) 123 | parser.add_argument('--weight-decay', type=float, default=0.0) 124 | parser.add_argument('--demo', action='store_true', default=False) 125 | parser.add_argument('--load', type=str, default='') 126 | parser.add_argument('--load_demo', type=str, default='') 127 | parser.add_argument('--logger-level', type=int, default=logging.DEBUG) 128 | parser.add_argument('--monitor', action='store_true') 129 | 130 | parser.add_argument('--update-interval', type=int, default=2048) 131 | parser.add_argument('--batchsize', type=int, default=64) 132 | parser.add_argument('--epochs', type=int, default=10) 133 | parser.add_argument('--entropy-coef', type=float, default=0.0) 134 | args = parser.parse_args() 135 | 136 | logging.basicConfig(level=args.logger_level) 137 | 138 | # Set a random seed used in ChainerRL 139 | misc.set_random_seed(args.seed, gpus=(args.gpu,)) 140 | 141 | if not (args.demo and args.load): 142 | args.outdir = experiments.prepare_output_dir(args, args.outdir) 143 | 144 | def make_env(test): 145 | env = gym.make(args.env) 146 | # Use different random seeds for train and test envs 147 | env_seed = 2 ** 32 - 1 - args.seed if test else args.seed 148 | env.seed(env_seed) 149 | # Cast observations to float32 because our model uses float32 150 | env = chainerrl.wrappers.CastObservationToFloat32(env) 151 | if args.monitor: 152 | env = gym.wrappers.Monitor(env, args.outdir) 153 | if not test: 154 | # Scale rewards (and thus returns) to a reasonable range so that 155 | # training is easier 156 | env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor) 157 | if args.render: 158 | env = chainerrl.wrappers.Render(env) 159 | return env 160 | 161 | sample_env = gym.make(args.env) 162 | timestep_limit = sample_env.spec.tags.get( 163 | 'wrapper_config.TimeLimit.max_episode_steps') 164 | obs_space = sample_env.observation_space 165 | action_space = sample_env.action_space 166 | 167 | # Normalize observations based on their empirical mean and variance 168 | obs_normalizer = chainerrl.links.EmpiricalNormalization( 169 | obs_space.low.size, clip_threshold=5) 170 | 171 | # Switch policy types accordingly to action space types 172 | if args.arch == 'FFSoftmax': 173 | model = A3CFFSoftmax(obs_space.low.size, action_space.n) 174 | elif args.arch == 'FFMellowmax': 175 | model = A3CFFMellowmax(obs_space.low.size, action_space.n) 176 | elif args.arch == 'FFGaussian': 177 | model = A3CFFGaussian(obs_space.low.size, action_space, 178 | bound_mean=args.bound_mean) 179 | 180 | opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5) 181 | opt.setup(model) 182 | if args.weight_decay > 0: 183 | opt.add_hook(NonbiasWeightDecay(args.weight_decay)) 184 | if args.algo == 'ppo': 185 | agent = PPO(model, opt, 186 | obs_normalizer=obs_normalizer, 187 | 
gpu=args.gpu, 188 | update_interval=args.update_interval, 189 | minibatch_size=args.batchsize, epochs=args.epochs, 190 | clip_eps_vf=None, entropy_coef=args.entropy_coef, 191 | standardize_advantages=args.standardize_advantages, 192 | ) 193 | elif args.algo == 'gail': 194 | import numpy as np 195 | from irl.gail import GAIL 196 | from irl.gail import Discriminator 197 | demonstrations = np.load(args.load_demo) 198 | D = Discriminator(gpu=args.gpu) 199 | agent = GAIL(demonstrations=demonstrations, discriminator=D, 200 | model=model, optimizer=opt, 201 | obs_normalizer=obs_normalizer, 202 | gpu=args.gpu, 203 | update_interval=args.update_interval, 204 | minibatch_size=args.batchsize, epochs=args.epochs, 205 | clip_eps_vf=None, entropy_coef=args.entropy_coef, 206 | standardize_advantages=args.standardize_advantages,) 207 | elif args.algo == 'airl': 208 | import numpy as np 209 | from irl.airl import AIRL as Agent 210 | from irl.airl import Discriminator 211 | # obs_normalizer = None 212 | demonstrations = np.load(args.load_demo) 213 | D = Discriminator(gpu=args.gpu) 214 | agent = Agent(demonstrations=demonstrations, discriminator=D, 215 | model=model, optimizer=opt, 216 | obs_normalizer=obs_normalizer, 217 | gpu=args.gpu, 218 | update_interval=args.update_interval, 219 | minibatch_size=args.batchsize, epochs=args.epochs, 220 | clip_eps_vf=None, entropy_coef=args.entropy_coef, 221 | standardize_advantages=args.standardize_advantages,) 222 | 223 | if args.load: 224 | agent.load(args.load) 225 | 226 | if args.demo: 227 | env = make_env(True) 228 | eval_stats = experiments.eval_performance( 229 | env=env, 230 | agent=agent, 231 | n_steps=None, 232 | n_episodes=args.eval_n_runs, 233 | max_episode_len=timestep_limit) 234 | print('n_runs: {} mean: {} median: {} stdev {}'.format( 235 | args.eval_n_runs, eval_stats['mean'], eval_stats['median'], 236 | eval_stats['stdev'])) 237 | outdir = args.load if args.load else args.outdir 238 | save_agent_demo(make_env(False), agent, outdir) 239 | else: 240 | # Linearly decay the learning rate to zero 241 | def lr_setter(env, agent, value): 242 | agent.optimizer.alpha = value 243 | 244 | lr_decay_hook = experiments.LinearInterpolationHook( 245 | args.steps, args.lr, 0, lr_setter) 246 | 247 | # Linearly decay the clipping parameter to zero 248 | def clip_eps_setter(env, agent, value): 249 | agent.clip_eps = max(value, 1e-8) 250 | 251 | clip_eps_decay_hook = experiments.LinearInterpolationHook( 252 | args.steps, 0.2, 0, clip_eps_setter) 253 | 254 | experiments.train_agent_with_evaluation( 255 | agent=agent, 256 | env=make_env(False), 257 | eval_env=make_env(True), 258 | outdir=args.outdir, 259 | steps=args.steps, 260 | eval_n_steps=None, 261 | eval_n_episodes=args.eval_n_runs, 262 | eval_interval=args.eval_interval, 263 | train_max_episode_len=timestep_limit, 264 | save_best_so_far_agent=False, 265 | step_hooks=[ 266 | lr_decay_hook, 267 | clip_eps_decay_hook, 268 | ], 269 | ) 270 | save_agent_demo(make_env(False), agent, args.outdir) 271 | 272 | 273 | if __name__ == '__main__': 274 | main() 275 | --------------------------------------------------------------------------------
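As a closing illustration of how GAIL feeds data to its discriminator, the snippet below mirrors the logic of `convert_data_to_feed_discriminator` in `irl/gail/gail.py` for a discrete-action environment, written in plain NumPy; the batch size, observation dimension, and noise scale are illustrative only:

```python
import numpy as np

n_actions, noise_scale = 2, 0.1                      # CartPole-v0 has two discrete actions
states = np.random.randn(8, 4).astype(np.float32)    # (batch, obs_dim)
actions = np.random.randint(n_actions, size=8)       # integer action ids from a rollout

# One-hot encode the discrete actions, jitter them with Gaussian noise, and
# concatenate them with the states; this is the input given to the GAIL discriminator.
one_hot = np.eye(n_actions, dtype=np.float32)[actions]
one_hot += np.random.normal(loc=0.0, scale=noise_scale, size=one_hot.shape).astype(np.float32)
discriminator_input = np.concatenate([states, one_hot], axis=1)
print(discriminator_input.shape)                     # (8, 6) = (batch, obs_dim + n_actions)
```

In the repository the same conversion runs on `self.model.xp` (NumPy or CuPy) with `F.concat`, and it is applied to both the demonstration batch and the policy batch before `Discriminator.train` is called; the Gaussian jitter on the one-hot actions is presumably there to keep the discriminator from keying on the exact one-hot vertices.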