├── .gitignore ├── README.md ├── core.py ├── ddpg ├── ddpg.py ├── losses.py ├── networks.py ├── run_bullet.py └── run_mujoco.py ├── dqn ├── dqn.py ├── losses.py ├── networks.py ├── run_atari.py └── run_gym.py ├── fqi ├── car_on_hill.py ├── dataset_0.800_4.000.pkl ├── dataset_0.850_4.000.pkl ├── dataset_0.900_4.000.pkl ├── dataset_0.950_4.000.pkl ├── dataset_1.000_4.000.pkl ├── dataset_1.000_4.125.pkl ├── dataset_1.000_4.250.pkl ├── dataset_1.000_4.375.pkl ├── dataset_1.000_4.500.pkl ├── dataset_1.050_4.500.pkl ├── dataset_1.100_4.500.pkl ├── dataset_1.150_4.500.pkl ├── dataset_1.200_4.375.pkl ├── dataset_1.200_4.500.pkl ├── dataset_1.200_4.625.pkl ├── dataset_1.200_4.750.pkl ├── fqi.py ├── losses.py ├── networks.py ├── run_coh.py ├── solver.py ├── test_q_0.800_4.000.npy ├── test_q_0.850_4.000.npy ├── test_q_0.900_4.000.npy ├── test_q_0.950_4.000.npy ├── test_q_1.000_4.000.npy ├── test_q_1.000_4.125.npy ├── test_q_1.000_4.250.npy ├── test_q_1.000_4.375.npy ├── test_q_1.000_4.500.npy ├── test_q_1.050_4.500.npy ├── test_q_1.100_4.500.npy ├── test_q_1.150_4.500.npy ├── test_q_1.200_4.375.npy ├── test_q_1.200_4.500.npy ├── test_q_1.200_4.625.npy └── test_q_1.200_4.750.npy ├── policy.py ├── replay_memory.py └── results ├── ddpg ├── multi_pendulum │ ├── noreg-sigmoid │ │ └── scores.npy │ └── transfer │ │ └── noreg │ │ ├── unfreeze0-noreg-sigmoid.npy │ │ ├── unfreeze101-noreg-sigmoid.npy │ │ └── w.pkl ├── multi_walker │ ├── noreg-sigmoid │ │ └── scores.npy │ └── transfer │ │ └── noreg │ │ ├── unfreeze0-noreg-sigmoid.npy │ │ ├── unfreeze101-noreg-sigmoid.npy │ │ └── w.pkl ├── scores-plot.py ├── single │ └── noreg-sigmoid │ │ ├── InvertedDoublePendulumBulletEnv-v0.npy │ │ ├── InvertedPendulumBulletEnv-v0.npy │ │ ├── InvertedPendulumSwingupBulletEnv-v0.npy │ │ ├── chee_run.npy │ │ ├── hop_stand.npy │ │ └── walk_walk.npy └── transfer-plot.py ├── dqn ├── dqn │ └── noreg-sigmoid │ │ └── scores.npy ├── multidqn │ └── noreg-sigmoid │ │ └── scores.npy ├── scores_plot.py ├── transfer-plot.py └── transfer │ └── acro-noreg │ ├── noreg-cart_mc_coh_pend.pkl │ ├── unfreeze0-noreg-sigmoid.npy │ ├── unfreeze10-noreg-sigmoid.npy │ └── unfreeze51-noreg-sigmoid.npy └── fqi ├── 0.800_4.000 ├── avi_diff.npy └── scores.npy ├── 1.000_4.000 ├── avi_diff.npy └── scores.npy ├── 1.000_4.0000.800_4.000 ├── avi_diff.npy └── scores.npy ├── 1.000_4.0000.800_4.0001.000_4.5001.200_4.500 ├── avi_diff.npy └── scores.npy ├── 1.000_4.0000.800_4.0001.000_4.5001.200_4.5001.000_4.1251.000_4.2501.000_4.3750.850_4.000 ├── avi_diff.npy └── scores.npy ├── 1.000_4.500 ├── avi_diff.npy └── scores.npy ├── 1.200_4.500 ├── avi_diff.npy └── scores.npy ├── avi_scores_plot.py └── multi_avi_scores_plot.py /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | dist/ 3 | mushroom.egg-info/ 4 | .idea/ 5 | *.pyc 6 | *.xml 7 | logs/ 8 | *.h5 9 | */logs 10 | *_raw.npy 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Code of the experimental evaluation of ICLR2020 paper: "Sharing Knowledge in Multi-Task Deep Reinforcement Learning" (https://openreview.net/forum?id=rkgpv2VFvr). 
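The experiments are launched through the `run_*.py` scripts in `dqn/`, `ddpg/` and `fqi/`. The DQN and DDPG scripts build one environment per task and a single multi-head agent, and train them with the shared multi-task loop in `core.py`; evaluation returns are saved as `.npy` arrays and collected under `results/`, where the `*plot.py` scripts read them. A quick way to inspect one of the provided score files (the array layout is assumed here to be runs × tasks × epochs, following how the run scripts assemble their `scores` lists):

```python
import numpy as np

# Load the evaluation returns of the multi-task DQN experiment shipped in results/.
scores = np.load('results/dqn/multidqn/noreg-sigmoid/scores.npy')

print(scores.shape)          # assumed layout: (n_runs, n_tasks, n_epochs)
print(scores.mean(axis=0))   # average return per task and epoch, over runs
```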
2 | -------------------------------------------------------------------------------- /core.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | 3 | 4 | class Core(object): 5 | def __init__(self, agent, mdp, callbacks=None): 6 | self.agent = agent 7 | self.mdp = mdp 8 | self._n_mdp = len(self.mdp) 9 | self.callbacks = callbacks if callbacks is not None else list() 10 | 11 | self._state = [None for _ in range(self._n_mdp)] 12 | 13 | self._total_steps_counter = 0 14 | self._current_steps_counter = 0 15 | self._episode_steps = [None for _ in range(self._n_mdp)] 16 | self._n_steps_per_fit = None 17 | 18 | def learn(self, n_steps=None, n_steps_per_fit=None, render=False, 19 | quiet=False): 20 | self._n_steps_per_fit = n_steps_per_fit 21 | 22 | fit_condition = \ 23 | lambda: self._current_steps_counter >= self._n_steps_per_fit 24 | 25 | self._run(n_steps, fit_condition, render, quiet) 26 | 27 | def evaluate(self, n_steps=None, render=False, 28 | quiet=False): 29 | fit_condition = lambda: False 30 | 31 | return self._run(n_steps, fit_condition, render, quiet) 32 | 33 | def _run(self, n_steps, fit_condition, render, quiet): 34 | move_condition = lambda: self._total_steps_counter < n_steps 35 | 36 | steps_progress_bar = tqdm(total=n_steps, 37 | dynamic_ncols=True, disable=quiet, 38 | leave=False) 39 | 40 | return self._run_impl(move_condition, fit_condition, steps_progress_bar, 41 | render) 42 | 43 | def _run_impl(self, move_condition, fit_condition, steps_progress_bar, 44 | render): 45 | self._total_steps_counter = 0 46 | self._current_steps_counter = 0 47 | 48 | dataset = list() 49 | last = [True] * self._n_mdp 50 | while move_condition(): 51 | for i in range(self._n_mdp): 52 | if last[i]: 53 | self.reset(i) 54 | 55 | sample = self._step(i, render) 56 | dataset.append(sample) 57 | 58 | last[i] = sample[-1] 59 | 60 | self._total_steps_counter += 1 61 | self._current_steps_counter += 1 62 | steps_progress_bar.update(1) 63 | 64 | if fit_condition(): 65 | self.agent.fit(dataset) 66 | self._current_steps_counter = 0 67 | 68 | for c in self.callbacks: 69 | callback_pars = dict(dataset=dataset) 70 | c(**callback_pars) 71 | 72 | dataset = list() 73 | 74 | self.agent.stop() 75 | for i in range(self._n_mdp): 76 | self.mdp[i].stop() 77 | 78 | return dataset 79 | 80 | def _step(self, i, render): 81 | action = self.agent.draw_action([i, self._state[i]]) 82 | next_state, reward, absorbing, _ = self.mdp[i].step(action) 83 | 84 | self._episode_steps[i] += 1 85 | 86 | if render: 87 | self.mdp[i].render() 88 | 89 | last = not( 90 | self._episode_steps[i] < self.mdp[i].info.horizon and not absorbing) 91 | 92 | state = self._state[i] 93 | self._state[i] = next_state.copy() 94 | 95 | return [i, state], action, reward, [i, next_state], absorbing, last 96 | 97 | def reset(self, i): 98 | self._state[i] = self.mdp[i].reset().copy() 99 | self.agent.episode_start() 100 | self.agent.next_action = None 101 | self._episode_steps[i] = 0 102 | -------------------------------------------------------------------------------- /ddpg/ddpg.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | import numpy as np 4 | 5 | import torch.nn as nn 6 | from mushroom_rl.algorithms import Agent 7 | from mushroom_rl.approximators import Regressor 8 | 9 | from replay_memory import ReplayMemory 10 | 11 | 12 | class ActorLoss(nn.Module): 13 | def __init__(self, critic): 14 | super().__init__() 15 | 16 | self._critic = 
critic 17 | 18 | def forward(self, arg, state, idxs): 19 | action = arg 20 | 21 | q = self._critic.model.network(state, action, idx=idxs) 22 | 23 | return -q.mean() 24 | 25 | 26 | class DDPG(Agent): 27 | def __init__(self, actor_approximator, critic_approximator, policy_class, 28 | mdp_info, batch_size, initial_replay_size, max_replay_size, 29 | tau, actor_params, critic_params, policy_params, 30 | n_actions_per_head, history_length=1, n_input_per_mdp=None, 31 | n_games=1, dtype=np.uint8): 32 | self._batch_size = batch_size 33 | self._n_games = n_games 34 | if n_input_per_mdp is None: 35 | self._n_input_per_mdp = [mdp_info.observation_space.shape 36 | for _ in range(self._n_games)] 37 | else: 38 | self._n_input_per_mdp = n_input_per_mdp 39 | self._n_actions_per_head = n_actions_per_head 40 | self._max_actions = max(n_actions_per_head)[0] 41 | self._history_length = history_length 42 | self._tau = tau 43 | 44 | self._replay_memory = [ 45 | ReplayMemory(initial_replay_size, 46 | max_replay_size) for _ in range(self._n_games) 47 | ] 48 | 49 | self._n_updates = 0 50 | 51 | target_critic_params = deepcopy(critic_params) 52 | self._critic_approximator = Regressor(critic_approximator, 53 | **critic_params) 54 | self._target_critic_approximator = Regressor(critic_approximator, 55 | **target_critic_params) 56 | 57 | if 'loss' not in actor_params: 58 | actor_params['loss'] = ActorLoss(self._critic_approximator) 59 | 60 | target_actor_params = deepcopy(actor_params) 61 | self._actor_approximator = Regressor(actor_approximator, 62 | n_fit_targets=2, **actor_params) 63 | self._target_actor_approximator = Regressor(actor_approximator, 64 | n_fit_targets=2, 65 | **target_actor_params) 66 | 67 | self._target_actor_approximator.model.set_weights( 68 | self._actor_approximator.model.get_weights()) 69 | self._target_critic_approximator.model.set_weights( 70 | self._critic_approximator.model.get_weights()) 71 | 72 | policy = policy_class(self._actor_approximator, **policy_params) 73 | 74 | super().__init__(mdp_info, policy) 75 | 76 | n_samples = self._batch_size * self._n_games 77 | self._state_idxs = np.zeros(n_samples, dtype=np.int) 78 | self._state = np.zeros( 79 | ((n_samples, 80 | self._history_length) + self.mdp_info.observation_space.shape), 81 | dtype=dtype 82 | ).squeeze() 83 | self._action = np.zeros((n_samples, self._max_actions)) 84 | self._reward = np.zeros(n_samples) 85 | self._next_state_idxs = np.zeros(n_samples, dtype=np.int) 86 | self._next_state = np.zeros( 87 | ((n_samples, 88 | self._history_length) + self.mdp_info.observation_space.shape), 89 | dtype=dtype 90 | ).squeeze() 91 | self._absorbing = np.zeros(n_samples) 92 | 93 | def fit(self, dataset): 94 | s = np.array([d[0][0] for d in dataset]).ravel() 95 | games = np.unique(s) 96 | for g in games: 97 | idxs = np.argwhere(s == g).ravel() 98 | d = list() 99 | for idx in idxs: 100 | d.append(dataset[idx]) 101 | 102 | self._replay_memory[g].add(d) 103 | 104 | fit_condition = np.all([rm.initialized for rm in self._replay_memory]) 105 | 106 | if fit_condition: 107 | for i in range(len(self._replay_memory)): 108 | game_state, game_action, game_reward, game_next_state,\ 109 | game_absorbing, _ = self._replay_memory[i].get( 110 | self._batch_size) 111 | 112 | start = self._batch_size * i 113 | stop = start + self._batch_size 114 | 115 | self._state_idxs[start:stop] = np.ones(self._batch_size) * i 116 | self._state[start:stop, :self._n_input_per_mdp[i][0]] = game_state 117 | self._action[start:stop, :self._n_actions_per_head[i][0]] = game_action 
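                # Each task's minibatch fills a fixed slice [start:stop] of the stacked
                # batch; observations/actions are written only into the leading
                # n_input/n_actions columns, so the remaining columns keep their zero
                # padding and all tasks are fitted in a single call with per-sample
                # head indices.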
118 | self._reward[start:stop] = game_reward 119 | self._next_state_idxs[start:stop] = np.ones(self._batch_size) * i 120 | self._next_state[start:stop, :self._n_input_per_mdp[i][0]] = game_next_state 121 | self._absorbing[start:stop] = game_absorbing 122 | 123 | q_next = self._next_q() 124 | q = self._reward + q_next 125 | 126 | self._critic_approximator.fit(self._state, self._action, q, 127 | idx=self._state_idxs) 128 | self._actor_approximator.fit(self._state, self._state, 129 | self._state_idxs, 130 | idx=self._state_idxs) 131 | 132 | self._n_updates += 1 133 | 134 | self._update_target() 135 | 136 | def get_shared_weights(self): 137 | cw = self._critic_approximator.model.network.get_shared_weights() 138 | aw = self._actor_approximator.model.network.get_shared_weights() 139 | 140 | return [cw, aw] 141 | 142 | def set_shared_weights(self, weights): 143 | self._critic_approximator.model.network.set_shared_weights(weights[0]) 144 | self._actor_approximator.model.network.set_shared_weights(weights[1]) 145 | 146 | def freeze_shared_weights(self): 147 | self._critic_approximator.model.network.freeze_shared_weights() 148 | self._actor_approximator.model.network.freeze_shared_weights() 149 | 150 | def unfreeze_shared_weights(self): 151 | self._critic_approximator.model.network.unfreeze_shared_weights() 152 | self._actor_approximator.model.network.unfreeze_shared_weights() 153 | 154 | def _update_target(self): 155 | """ 156 | Update the target networks. 157 | 158 | """ 159 | critic_weights = self._tau * self._critic_approximator.model.get_weights() 160 | critic_weights += (1 - self._tau) * self._target_critic_approximator.get_weights() 161 | self._target_critic_approximator.set_weights(critic_weights) 162 | 163 | actor_weights = self._tau * self._actor_approximator.model.get_weights() 164 | actor_weights += (1 - self._tau) * self._target_actor_approximator.get_weights() 165 | self._target_actor_approximator.set_weights(actor_weights) 166 | 167 | def _next_q(self): 168 | a = self._target_actor_approximator(self._next_state, 169 | idx=self._next_state_idxs) 170 | q = self._target_critic_approximator(self._next_state, a, 171 | idx=self._next_state_idxs).ravel() 172 | 173 | out_q = np.zeros(self._batch_size * self._n_games) 174 | for i in range(self._n_games): 175 | start = self._batch_size * i 176 | stop = start + self._batch_size 177 | 178 | out_q[start:stop] = q[start:stop] * self.mdp_info.gamma[i] 179 | if np.any(self._absorbing[start:stop]): 180 | out_q[start:stop] = out_q[start:stop] * ( 181 | 1 - self._absorbing[start:stop] 182 | ) 183 | 184 | return out_q 185 | -------------------------------------------------------------------------------- /ddpg/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | class LossFunction(object): 6 | def __init__(self, n_games, batch_size, eval_frequency): 7 | self._n_games = n_games 8 | self._batch_size = batch_size 9 | self._eval_frequency = eval_frequency 10 | 11 | self._losses = list() 12 | self._counter = 0 13 | 14 | def get_losses(self): 15 | return self._losses 16 | 17 | def __call__(self, yhat, y): 18 | loss = F.smooth_l1_loss(yhat, y, reduce=False) 19 | 20 | if self._need_log(): 21 | temp_losses = list() 22 | 23 | for i in range(self._n_games): 24 | start = i * self._batch_size 25 | stop = start + self._batch_size 26 | temp_losses.append(torch.mean(loss[start:stop]).item()) 27 | 28 | self._losses.append(temp_losses) 29 | 30 | loss = torch.mean(loss) 
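        # The per-task means appended to self._losses above are what the run scripts
        # save as critic_loss-exp-*.npy; the scalar mean is the value actually
        # returned for the gradient step.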
31 | 32 | return loss 33 | 34 | def _need_log(self): 35 | self._counter += 1 36 | if self._counter >= self._eval_frequency: 37 | self._counter = 0 38 | return True 39 | else: 40 | return False 41 | -------------------------------------------------------------------------------- /ddpg/networks.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | 7 | class ActorNetwork(nn.Module): 8 | def __init__(self, input_shape, _, n_actions_per_head, n_hidden_1, 9 | n_hidden_2, use_cuda, features, dropout): 10 | super().__init__() 11 | 12 | self._n_input = input_shape 13 | self._n_games = len(n_actions_per_head) 14 | self._max_actions = max(n_actions_per_head)[0] 15 | self._use_cuda = use_cuda 16 | self._features = features 17 | self._n_shared = 2 18 | 19 | self._h1 = nn.ModuleList( 20 | [nn.Linear(self._n_input[i][0], n_hidden_1) for i in range( 21 | len(input_shape))] 22 | ) 23 | self._h2 = nn.Linear(n_hidden_1, n_hidden_2) 24 | self._h3 = nn.ModuleList( 25 | [nn.Linear(n_hidden_2, self._max_actions) for _ in range( 26 | self._n_games)] 27 | ) 28 | 29 | fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self._h2.weight) 30 | nn.init.uniform_(self._h2.weight, a=-1 / np.sqrt(fan_in), 31 | b=1 / np.sqrt(fan_in)) 32 | for i in range(self._n_games): 33 | fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self._h1[i].weight) 34 | nn.init.uniform_(self._h1[i].weight, a=-1 / np.sqrt(fan_in), 35 | b=1 / np.sqrt(fan_in)) 36 | nn.init.uniform_(self._h3[i].weight, a=-3e-3, b=3e-3) 37 | nn.init.uniform_(self._h3[i].bias, a=-3e-3, b=3e-3) 38 | 39 | def forward(self, state, idx=None, get_features=False): 40 | state = state.float() 41 | 42 | h1 = list() 43 | for i in np.unique(idx): 44 | idxs = np.argwhere(idx == i).ravel() 45 | h1.append(F.relu(self._h1[i](state[idxs, :self._n_input[i][0]]))) 46 | cat_h1 = torch.cat(h1) 47 | 48 | h_f = F.relu(self._h2(cat_h1)) 49 | 50 | a = [torch.tanh(self._h3[i](h_f)) for i in range(self._n_games)] 51 | a = torch.stack(a, dim=1) 52 | 53 | if idx is not None: 54 | idx = torch.from_numpy(idx) 55 | if self._use_cuda: 56 | idx = idx.cuda() 57 | a_idx = a.gather(1, idx.view(-1, 1).repeat( 58 | 1, self._max_actions).unsqueeze(1) 59 | ) 60 | 61 | a = torch.squeeze(a_idx, 1) 62 | 63 | if get_features: 64 | return a, h_f 65 | else: 66 | return a 67 | 68 | def get_shared_weights(self): 69 | p2 = list() 70 | 71 | for p in self._h2.parameters(): 72 | p2.append(p.data.detach().cpu().numpy()) 73 | 74 | return p2 75 | 76 | def set_shared_weights(self, weights): 77 | w2 = weights 78 | 79 | for p, w in zip(self._h2.parameters(), w2): 80 | w_tensor = torch.from_numpy(w).type(p.data.dtype) 81 | if self._use_cuda: 82 | w_tensor = w_tensor.cuda() 83 | p.data = w_tensor 84 | 85 | def freeze_shared_weights(self): 86 | for p in self._h2.parameters(): 87 | p.requires_grad = False 88 | 89 | def unfreeze_shared_weights(self): 90 | for p in self._h2.parameters(): 91 | p.requires_grad = True 92 | 93 | 94 | class CriticNetwork(nn.Module): 95 | def __init__(self, input_shape, _, n_actions_per_head, n_hidden_1, 96 | n_hidden_2, use_cuda, features, dropout): 97 | super().__init__() 98 | 99 | self._n_input = input_shape 100 | self._n_games = len(n_actions_per_head) 101 | self._max_actions = max(n_actions_per_head)[0] 102 | self._n_actions_per_head = n_actions_per_head 103 | self._use_cuda = use_cuda 104 | self._features = features 105 | self._n_shared = 2 106 | 107 | self._h1 
= nn.ModuleList( 108 | [nn.Linear(self._n_input[i][0], n_hidden_1) for i in range( 109 | len(input_shape))] 110 | ) 111 | self._h2_s = nn.Linear(n_hidden_1, n_hidden_2) 112 | self._h3 = nn.ModuleList( 113 | [nn.Linear(n_hidden_2, 1) for _ in range( 114 | self._n_games)] 115 | ) 116 | self._h2_a = nn.ModuleList( 117 | [nn.Linear(n_actions_per_head[i][0], n_hidden_2, bias=False) for i in range( 118 | len(n_actions_per_head))] 119 | ) 120 | 121 | fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self._h2_s.weight) 122 | nn.init.uniform_(self._h2_s.weight, a=-1 / np.sqrt(fan_in), 123 | b=1 / np.sqrt(fan_in)) 124 | for i in range(self._n_games): 125 | fan_in, _ = nn.init._calculate_fan_in_and_fan_out( 126 | self._h2_a[i].weight) 127 | nn.init.uniform_(self._h2_a[i].weight, a=-1 / np.sqrt(fan_in), 128 | b=1 / np.sqrt(fan_in)) 129 | fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self._h1[i].weight) 130 | nn.init.uniform_(self._h1[i].weight, a=-1 / np.sqrt(fan_in), 131 | b=1 / np.sqrt(fan_in)) 132 | nn.init.uniform_(self._h3[i].weight, a=-3e-3, b=3e-3) 133 | nn.init.uniform_(self._h3[i].bias, a=-3e-3, b=3e-3) 134 | 135 | def forward(self, state, action, idx=None): 136 | state = state.float() 137 | action = action.float() 138 | if not isinstance(idx, np.ndarray): 139 | idx = idx.cpu().numpy().astype(np.int) 140 | 141 | h2 = list() 142 | for i in np.unique(idx): 143 | idxs = np.argwhere(idx == i).ravel() 144 | h1 = F.relu(self._h1[i](state[idxs, :self._n_input[i][0]])) 145 | a = action[idxs, :self._n_actions_per_head[i][0]] 146 | h2.append(self._h2_s(h1) + self._h2_a[i](a)) 147 | 148 | cat_h2 = torch.cat(h2) 149 | 150 | if self._features == 'relu': 151 | h_f = F.relu(cat_h2) 152 | elif self._features == 'sigmoid': 153 | h_f = torch.sigmoid(cat_h2) 154 | else: 155 | raise ValueError 156 | 157 | q = [self._h3[i](h_f) for i in range(self._n_games)] 158 | q = torch.stack(q, dim=1).squeeze(-1) 159 | 160 | if idx is not None: 161 | idx = torch.from_numpy(idx) 162 | if self._use_cuda: 163 | idx = idx.cuda() 164 | 165 | q_idx = q.gather(1, idx.unsqueeze(-1)) 166 | q = torch.squeeze(q_idx, 1) 167 | 168 | return q 169 | 170 | def get_shared_weights(self): 171 | p2 = list() 172 | 173 | for p in self._h2_s.parameters(): 174 | p2.append(p.data.detach().cpu().numpy()) 175 | 176 | return p2 177 | 178 | def set_shared_weights(self, weights): 179 | w2 = weights 180 | 181 | for p, w in zip(self._h2_s.parameters(), w2): 182 | w_tensor = torch.from_numpy(w).type(p.data.dtype) 183 | if self._use_cuda: 184 | w_tensor = w_tensor.cuda() 185 | p.data = w_tensor 186 | 187 | def freeze_shared_weights(self): 188 | for p in self._h2_s.parameters(): 189 | p.requires_grad = False 190 | 191 | def unfreeze_shared_weights(self): 192 | for p in self._h2_s.parameters(): 193 | p.requires_grad = True 194 | -------------------------------------------------------------------------------- /ddpg/run_bullet.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import pathlib 4 | import sys 5 | 6 | from joblib import delayed, Parallel 7 | import numpy as np 8 | import torch.optim as optim 9 | 10 | import pickle 11 | 12 | sys.path.append('..') 13 | 14 | from mushroom_rl.approximators.parametric import TorchApproximator 15 | from mushroom_rl.environments import * 16 | from mushroom_rl.utils.dataset import compute_J 17 | 18 | from core import Core 19 | from ddpg import DDPG 20 | from policy import OrnsteinUhlenbeckPolicy 21 | 22 | from networks import ActorNetwork, 
CriticNetwork 23 | from losses import LossFunction 24 | 25 | 26 | def print_epoch(epoch): 27 | print('################################################################') 28 | print('Epoch: ', epoch) 29 | print('----------------------------------------------------------------') 30 | 31 | 32 | def get_stats(dataset, gamma, idx, domains): 33 | J = np.mean(compute_J(dataset, gamma[idx])) 34 | print(domains[idx] + ': J: %f' % J) 35 | 36 | return J 37 | 38 | 39 | def experiment(idx, args): 40 | np.random.seed() 41 | 42 | domains = [''.join(g) for g in args.games] 43 | 44 | scores = list() 45 | for _ in range(len(domains)): 46 | scores.append(list()) 47 | 48 | optimizer_actor = dict() 49 | optimizer_actor['class'] = optim.Adam 50 | optimizer_actor['params'] = dict(lr=args.learning_rate_actor) 51 | 52 | optimizer_critic = dict() 53 | optimizer_critic['class'] = optim.Adam 54 | optimizer_critic['params'] = dict(lr=args.learning_rate_critic, 55 | weight_decay=1e-2) 56 | 57 | # MDP 58 | mdp = list() 59 | gamma_eval = list() 60 | for i, g in enumerate(domains): 61 | mdp.append(Gym(g, args.horizon[i], args.gamma[i])) 62 | gamma_eval.append(args.gamma[i]) 63 | if args.render: 64 | mdp[0].render(mode='human') 65 | 66 | n_input_per_mdp = [m.info.observation_space.shape for m in mdp] 67 | n_actions_per_head = [(m.info.action_space.shape[0],) for m in mdp] 68 | 69 | max_obs_dim = 0 70 | max_act_n = 0 71 | for i in range(len(domains)): 72 | n = mdp[i].info.observation_space.shape[0] 73 | m = len(mdp[i].info.action_space.shape) 74 | if n > max_obs_dim: 75 | max_obs_dim = n 76 | max_obs_idx = i 77 | if m > max_act_n: 78 | max_act_n = m 79 | max_act_idx = i 80 | gammas = [m.info.gamma for m in mdp] 81 | horizons = [m.info.horizon for m in mdp] 82 | mdp_info = MDPInfo(mdp[max_obs_idx].info.observation_space, 83 | mdp[max_act_idx].info.action_space, gammas, horizons) 84 | max_action_value = list() 85 | for m in mdp: 86 | assert len(np.unique(m.info.action_space.low)) == 1 87 | assert len(np.unique(m.info.action_space.high)) == 1 88 | assert abs(m.info.action_space.low[0]) == m.info.action_space.high[0] 89 | 90 | max_action_value.append(m.info.action_space.high[0]) 91 | 92 | # DQN learning run 93 | 94 | # Settings 95 | if args.debug: 96 | initial_replay_size = args.batch_size 97 | max_replay_size = 500 98 | test_samples = 20 99 | evaluation_frequency = 50 100 | max_steps = 1000 101 | else: 102 | initial_replay_size = args.initial_replay_size 103 | max_replay_size = args.max_replay_size 104 | test_samples = args.test_samples 105 | evaluation_frequency = args.evaluation_frequency 106 | max_steps = args.max_steps 107 | 108 | # Policy 109 | policy_class = OrnsteinUhlenbeckPolicy 110 | policy_params = dict(sigma=np.ones(1) * .2, theta=.15, dt=1e-2, 111 | n_actions_per_head=n_actions_per_head, 112 | max_action_value=max_action_value) 113 | 114 | # Approximator 115 | n_games = len(args.games) 116 | loss = LossFunction(n_games, args.batch_size, evaluation_frequency) 117 | 118 | actor_approximator = TorchApproximator 119 | actor_input_shape = [m.info.observation_space.shape for m in mdp] 120 | 121 | actor_approximator_params = dict( 122 | network=ActorNetwork, 123 | input_shape=actor_input_shape, 124 | output_shape=(max(n_actions_per_head)[0],), 125 | n_actions_per_head=n_actions_per_head, 126 | n_hidden_1=args.hidden_neurons[0], 127 | n_hidden_2=args.hidden_neurons[1], 128 | optimizer=optimizer_actor, 129 | use_cuda=args.use_cuda, 130 | features=args.features 131 | ) 132 | 133 | critic_approximator = TorchApproximator 134 
| critic_input_shape = [m.info.observation_space.shape for m in mdp] 135 | critic_approximator_params = dict( 136 | network=CriticNetwork, 137 | input_shape=critic_input_shape, 138 | output_shape=(1,), 139 | n_actions_per_head=n_actions_per_head, 140 | n_hidden_1=args.hidden_neurons[0], 141 | n_hidden_2=args.hidden_neurons[1], 142 | optimizer=optimizer_actor, 143 | loss=loss, 144 | use_cuda=args.use_cuda, 145 | features=args.features 146 | ) 147 | 148 | # Agent 149 | algorithm_params = dict( 150 | batch_size=args.batch_size, 151 | initial_replay_size=initial_replay_size, 152 | max_replay_size=max_replay_size, 153 | tau=args.tau, 154 | actor_params=actor_approximator_params, 155 | critic_params=critic_approximator_params, 156 | policy_params=policy_params, 157 | n_games=len(domains), 158 | n_input_per_mdp=n_input_per_mdp, 159 | n_actions_per_head=n_actions_per_head, 160 | dtype=np.float32 161 | ) 162 | 163 | agent = DDPG(actor_approximator, critic_approximator, policy_class, 164 | mdp_info, **algorithm_params) 165 | 166 | # Algorithm 167 | core = Core(agent, mdp) 168 | 169 | # RUN 170 | 171 | # Fill replay memory with random dataset 172 | print_epoch(0) 173 | core.learn(n_steps=initial_replay_size, 174 | n_steps_per_fit=initial_replay_size, quiet=args.quiet) 175 | 176 | if args.transfer: 177 | weights = pickle.load(open(args.transfer, 'rb')) 178 | agent.set_shared_weights(weights) 179 | 180 | if args.load: 181 | weights = np.load(args.load) 182 | agent.policy.set_weights(weights) 183 | 184 | # Evaluate initial policy 185 | agent.policy.eval = True 186 | dataset = core.evaluate(n_steps=test_samples, render=args.render, 187 | quiet=args.quiet) 188 | agent.policy.eval = False 189 | for i in range(len(mdp)): 190 | d = dataset[i::len(mdp)] 191 | scores[i].append(get_stats(d, gamma_eval, i, domains)) 192 | 193 | if args.unfreeze_epoch > 0: 194 | agent.freeze_shared_weights() 195 | 196 | best_score_sum = -np.inf 197 | best_weights = None 198 | 199 | np.save(folder_name + 'scores-exp-%d.npy' % idx, scores) 200 | np.save(folder_name + 'critic_loss-exp-%d.npy' % idx, 201 | agent._critic_approximator.model._loss.get_losses()) 202 | for n_epoch in range(1, max_steps // evaluation_frequency + 1): 203 | if n_epoch >= args.unfreeze_epoch > 0: 204 | agent.unfreeze_shared_weights() 205 | 206 | print_epoch(n_epoch) 207 | print('- Learning:') 208 | # learning step 209 | core.learn(n_steps=evaluation_frequency, 210 | n_steps_per_fit=1, quiet=args.quiet) 211 | 212 | print('- Evaluation:') 213 | # evaluation step 214 | agent.policy.eval = True 215 | dataset = core.evaluate(n_steps=test_samples, 216 | render=args.render, quiet=args.quiet) 217 | agent.policy.eval = False 218 | 219 | current_score_sum = 0 220 | for i in range(len(mdp)): 221 | d = dataset[i::len(mdp)] 222 | current_score = get_stats(d, gamma_eval, i, domains) 223 | scores[i].append(current_score) 224 | current_score_sum += current_score 225 | 226 | # Save shared weights if best score 227 | if args.save_shared and current_score_sum >= best_score_sum: 228 | best_score_sum = current_score_sum 229 | best_weights = agent.get_shared_weights() 230 | 231 | if args.save: 232 | np.save(folder_name + 'best_weights-exp-%d.npy' % idx, 233 | agent.policy.get_weights()) 234 | 235 | np.save(folder_name + 'scores-exp-%d.npy' % idx, scores) 236 | np.save(folder_name + 'critic_loss-exp-%d.npy' % idx, 237 | agent._critic_approximator.model._loss.get_losses()) 238 | 239 | if args.save_shared: 240 | pickle.dump(best_weights, open(args.save_shared, 'wb')) 241 | 242 | 
return scores, agent._critic_approximator.model._loss.get_losses() 243 | 244 | 245 | if __name__ == '__main__': 246 | # Argument parser 247 | parser = argparse.ArgumentParser() 248 | 249 | arg_game = parser.add_argument_group('Game') 250 | arg_game.add_argument("--games", type=list, nargs='+', 251 | default=['AntBulletEnv-v0']) 252 | arg_game.add_argument("--horizon", type=int, nargs='+') 253 | arg_game.add_argument("--gamma", type=float, nargs='+') 254 | arg_game.add_argument("--n-exp", type=int) 255 | 256 | arg_mem = parser.add_argument_group('Replay Memory') 257 | arg_mem.add_argument("--initial-replay-size", type=int, default=64, 258 | help='Initial size of the replay memory.') 259 | arg_mem.add_argument("--max-replay-size", type=int, default=50000, 260 | help='Max size of the replay memory.') 261 | 262 | arg_net = parser.add_argument_group('Deep Q-Network') 263 | arg_net.add_argument("--hidden-neurons", type=int, nargs=2, 264 | default=[600, 500]) 265 | arg_net.add_argument("--learning-rate-actor", type=float, default=1e-4, 266 | help='Learning rate value of the optimizer. Only used' 267 | 'in rmspropcentered') 268 | arg_net.add_argument("--learning-rate-critic", type=float, default=1e-3, 269 | help='Learning rate value of the optimizer. Only used' 270 | 'in rmspropcentered') 271 | 272 | arg_alg = parser.add_argument_group('Algorithm') 273 | arg_alg.add_argument("--features", choices=['relu', 'sigmoid']) 274 | arg_alg.add_argument("--batch-size", type=int, default=64, 275 | help='Batch size for each fit of the network.') 276 | arg_alg.add_argument("--tau", type=float, default=1e-3) 277 | arg_alg.add_argument("--history-length", type=int, default=1, 278 | help='Number of frames composing a state.') 279 | arg_alg.add_argument("--evaluation-frequency", type=int, default=10000, 280 | help='Number of learning step before each evaluation.' 
281 | 'This number represents an epoch.') 282 | arg_alg.add_argument("--max-steps", type=int, default=1000000, 283 | help='Total number of learning steps.') 284 | arg_alg.add_argument("--test-samples", type=int, default=5000, 285 | help='Number of steps for each evaluation.') 286 | arg_alg.add_argument("--transfer", type=str, default='', 287 | help='Path to the file of the weights of the common ' 288 | 'layers to be loaded') 289 | arg_alg.add_argument("--save-shared", type=str, default='', 290 | help='filename where to save the shared weights') 291 | arg_alg.add_argument("--unfreeze-epoch", type=int, default=0, 292 | help="Number of epoch where to unfreeze shared weights.") 293 | 294 | arg_utils = parser.add_argument_group('Utils') 295 | arg_utils.add_argument('--use-cuda', action='store_true', 296 | help='Flag specifying whether to use the GPU.') 297 | arg_utils.add_argument('--load', type=str, 298 | help='Path of the model to be loaded.') 299 | arg_utils.add_argument('--save', action='store_true', 300 | help='Flag specifying whether to save the model.') 301 | arg_utils.add_argument('--render', action='store_true', 302 | help='Flag specifying whether to render the game.') 303 | arg_utils.add_argument('--quiet', action='store_true', 304 | help='Flag specifying whether to hide the progress' 305 | 'bar.') 306 | arg_utils.add_argument('--debug', action='store_true', 307 | help='Flag specifying whether the script has to be' 308 | 'run in debug mode.') 309 | arg_utils.add_argument('--postfix', type=str, default='', 310 | help='Flag used to add a postfix to the folder name') 311 | 312 | args = parser.parse_args() 313 | 314 | folder_name = './logs/bullet_' + datetime.datetime.now().strftime( 315 | '%Y-%m-%d_%H-%M-%S') + args.postfix + '/' 316 | pathlib.Path(folder_name).mkdir(parents=True) 317 | with open(folder_name + 'args.pkl', 'wb') as f: 318 | pickle.dump(args, f) 319 | 320 | out = Parallel(n_jobs=4)(delayed(experiment)(i, args) 321 | for i in range(args.n_exp)) 322 | 323 | scores = np.array([o[0] for o in out]) 324 | critic_loss = np.array([o[1] for o in out]) 325 | 326 | np.save(folder_name + 'scores.npy', scores) 327 | np.save(folder_name + 'critic_loss.npy', critic_loss) 328 | -------------------------------------------------------------------------------- /ddpg/run_mujoco.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import pathlib 4 | import sys 5 | 6 | from joblib import delayed, Parallel 7 | import numpy as np 8 | import torch.optim as optim 9 | 10 | import pickle 11 | 12 | sys.path.append('..') 13 | 14 | from mushroom_rl.approximators.parametric import TorchApproximator 15 | from mushroom_rl.environments import * 16 | from mushroom_rl.utils.dataset import compute_J 17 | 18 | from core import Core 19 | from ddpg import DDPG 20 | from policy import OrnsteinUhlenbeckPolicy 21 | 22 | from networks import ActorNetwork, CriticNetwork 23 | from losses import LossFunction 24 | 25 | 26 | def print_epoch(epoch): 27 | print('################################################################') 28 | print('Epoch: ', epoch) 29 | print('----------------------------------------------------------------') 30 | 31 | 32 | def get_stats(dataset, gamma, idx, domains, tasks): 33 | J = np.mean(compute_J(dataset, gamma[idx])) 34 | print(domains[idx] + '-' + tasks[idx] + ': J: %f' % J) 35 | 36 | return J 37 | 38 | 39 | def experiment(idx, args): 40 | np.random.seed() 41 | 42 | args.games = [''.join(g) for g in args.games] 43 | 
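    # --games is an interleaved list of DeepMind Control (domain, task) pairs,
    # e.g. cartpole swingup walker walk: even entries are domains, odd ones tasks.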
44 | domains = args.games[::2] 45 | tasks = args.games[1::2] 46 | 47 | scores = list() 48 | for _ in range(len(domains)): 49 | scores.append(list()) 50 | 51 | optimizer_actor = dict() 52 | optimizer_actor['class'] = optim.Adam 53 | optimizer_actor['params'] = dict(lr=args.learning_rate_actor) 54 | 55 | optimizer_critic = dict() 56 | optimizer_critic['class'] = optim.Adam 57 | optimizer_critic['params'] = dict(lr=args.learning_rate_critic, 58 | weight_decay=1e-2) 59 | 60 | # MDP 61 | mdp = list() 62 | gamma_eval = list() 63 | for i, g in enumerate(zip(domains, tasks)): 64 | mdp.append(DMControl(g[0], g[1], args.horizon[i], args.gamma[i])) 65 | gamma_eval.append(args.gamma[i]) 66 | if args.render: 67 | mdp[0].render() 68 | 69 | n_input_per_mdp = [m.info.observation_space.shape for m in mdp] 70 | n_actions_per_head = [(m.info.action_space.shape[0],) for m in mdp] 71 | 72 | max_obs_dim = 0 73 | max_act_n = 0 74 | for i in range(len(domains)): 75 | n = mdp[i].info.observation_space.shape[0] 76 | m = len(mdp[i].info.action_space.shape) 77 | if n > max_obs_dim: 78 | max_obs_dim = n 79 | max_obs_idx = i 80 | if m > max_act_n: 81 | max_act_n = m 82 | max_act_idx = i 83 | gammas = [m.info.gamma for m in mdp] 84 | horizons = [m.info.horizon for m in mdp] 85 | mdp_info = MDPInfo(mdp[max_obs_idx].info.observation_space, 86 | mdp[max_act_idx].info.action_space, gammas, horizons) 87 | max_action_value = list() 88 | for m in mdp: 89 | assert len(np.unique(m.info.action_space.low)) == 1 90 | assert len(np.unique(m.info.action_space.high)) == 1 91 | assert abs(m.info.action_space.low[0]) == m.info.action_space.high[0] 92 | 93 | max_action_value.append(m.info.action_space.high[0]) 94 | 95 | # DQN learning run 96 | 97 | # Settings 98 | if args.debug: 99 | initial_replay_size = args.batch_size 100 | max_replay_size = 500 101 | test_samples = 20 102 | evaluation_frequency = 50 103 | max_steps = 1000 104 | else: 105 | initial_replay_size = args.initial_replay_size 106 | max_replay_size = args.max_replay_size 107 | test_samples = args.test_samples 108 | evaluation_frequency = args.evaluation_frequency 109 | max_steps = args.max_steps 110 | 111 | # Policy 112 | policy_class = OrnsteinUhlenbeckPolicy 113 | policy_params = dict(sigma=np.ones(1) * .2, theta=.15, dt=1e-2, 114 | n_actions_per_head=n_actions_per_head, 115 | max_action_value=max_action_value) 116 | 117 | # Approximator 118 | n_games = len(args.games) 119 | loss = LossFunction(n_games, args.batch_size, evaluation_frequency) 120 | 121 | actor_approximator = TorchApproximator 122 | actor_input_shape = [m.info.observation_space.shape for m in mdp] 123 | 124 | actor_approximator_params = dict( 125 | network=ActorNetwork, 126 | input_shape=actor_input_shape, 127 | output_shape=(max(n_actions_per_head)[0],), 128 | n_actions_per_head=n_actions_per_head, 129 | n_hidden_1=args.hidden_neurons[0], 130 | n_hidden_2=args.hidden_neurons[1], 131 | optimizer=optimizer_actor, 132 | use_cuda=args.use_cuda, 133 | features=args.features 134 | ) 135 | 136 | critic_approximator = TorchApproximator 137 | critic_input_shape = [m.info.observation_space.shape for m in mdp] 138 | critic_approximator_params = dict( 139 | network=CriticNetwork, 140 | input_shape=critic_input_shape, 141 | output_shape=(1,), 142 | n_actions_per_head=n_actions_per_head, 143 | n_hidden_1=args.hidden_neurons[0], 144 | n_hidden_2=args.hidden_neurons[1], 145 | optimizer=optimizer_actor, 146 | loss=loss, 147 | use_cuda=args.use_cuda, 148 | features=args.features 149 | ) 150 | 151 | # Agent 152 | 
algorithm_params = dict( 153 | batch_size=args.batch_size, 154 | initial_replay_size=initial_replay_size, 155 | max_replay_size=max_replay_size, 156 | tau=args.tau, 157 | actor_params=actor_approximator_params, 158 | critic_params=critic_approximator_params, 159 | policy_params=policy_params, 160 | n_games=len(domains), 161 | n_input_per_mdp=n_input_per_mdp, 162 | n_actions_per_head=n_actions_per_head, 163 | dtype=np.float32 164 | ) 165 | 166 | agent = DDPG(actor_approximator, critic_approximator, policy_class, 167 | mdp_info, **algorithm_params) 168 | 169 | # Algorithm 170 | core = Core(agent, mdp) 171 | 172 | # RUN 173 | 174 | # Fill replay memory with random dataset 175 | print_epoch(0) 176 | core.learn(n_steps=initial_replay_size, 177 | n_steps_per_fit=initial_replay_size, quiet=args.quiet) 178 | 179 | if args.transfer: 180 | weights = pickle.load(open(args.transfer, 'rb')) 181 | agent.set_shared_weights(weights) 182 | 183 | if args.load: 184 | weights = np.load(args.load) 185 | agent.policy.set_weights(weights) 186 | 187 | # Evaluate initial policy 188 | agent.policy.eval = True 189 | dataset = core.evaluate(n_steps=test_samples, render=args.render, 190 | quiet=args.quiet) 191 | agent.policy.eval = False 192 | for i in range(len(mdp)): 193 | d = dataset[i::len(mdp)] 194 | scores[i].append(get_stats(d, gamma_eval, i, domains, tasks)) 195 | 196 | if args.unfreeze_epoch > 0: 197 | agent.freeze_shared_weights() 198 | 199 | best_score_sum = -np.inf 200 | best_weights = None 201 | 202 | np.save(folder_name + 'scores-exp-%d.npy' % idx, scores) 203 | np.save(folder_name + 'critic_loss-exp-%d.npy' % idx, 204 | agent._critic_approximator.model._loss.get_losses()) 205 | for n_epoch in range(1, max_steps // evaluation_frequency + 1): 206 | if n_epoch >= args.unfreeze_epoch > 0: 207 | agent.unfreeze_shared_weights() 208 | 209 | print_epoch(n_epoch) 210 | print('- Learning:') 211 | # learning step 212 | core.learn(n_steps=evaluation_frequency, 213 | n_steps_per_fit=1, quiet=args.quiet) 214 | 215 | print('- Evaluation:') 216 | # evaluation step 217 | agent.policy.eval = True 218 | dataset = core.evaluate(n_steps=test_samples, 219 | render=args.render, quiet=args.quiet) 220 | agent.policy.eval = False 221 | 222 | current_score_sum = 0 223 | for i in range(len(mdp)): 224 | d = dataset[i::len(mdp)] 225 | current_score = get_stats(d, gamma_eval, i, domains, tasks) 226 | scores[i].append(current_score) 227 | current_score_sum += current_score 228 | 229 | # Save shared weights if best score 230 | if args.save_shared and current_score_sum >= best_score_sum: 231 | best_score_sum = current_score_sum 232 | best_weights = agent.get_shared_weights() 233 | 234 | if args.save: 235 | np.save(folder_name + 'best_weights-exp-%d.npy' % idx, 236 | agent.policy.get_weights()) 237 | 238 | np.save(folder_name + 'scores-exp-%d.npy' % idx, scores) 239 | np.save(folder_name + 'critic_loss-exp-%d.npy' % idx, 240 | agent._critic_approximator.model._loss.get_losses()) 241 | 242 | if args.save_shared: 243 | pickle.dump(best_weights, open(args.save_shared, 'wb')) 244 | 245 | return scores, agent._critic_approximator.model._loss.get_losses() 246 | 247 | 248 | if __name__ == '__main__': 249 | # Argument parser 250 | parser = argparse.ArgumentParser() 251 | 252 | arg_game = parser.add_argument_group('Game') 253 | arg_game.add_argument("--games", type=list, nargs='+', 254 | default=['cartpole', 'swingup']) 255 | arg_game.add_argument("--horizon", type=int, nargs='+') 256 | arg_game.add_argument("--gamma", type=float, nargs='+') 257 
| arg_game.add_argument("--n-exp", type=int) 258 | 259 | arg_mem = parser.add_argument_group('Replay Memory') 260 | arg_mem.add_argument("--initial-replay-size", type=int, default=64, 261 | help='Initial size of the replay memory.') 262 | arg_mem.add_argument("--max-replay-size", type=int, default=50000, 263 | help='Max size of the replay memory.') 264 | 265 | arg_net = parser.add_argument_group('Deep Q-Network') 266 | arg_net.add_argument("--hidden-neurons", type=int, nargs=2, 267 | default=[600, 500]) 268 | arg_net.add_argument("--learning-rate-actor", type=float, default=1e-4, 269 | help='Learning rate value of the optimizer. Only used' 270 | 'in rmspropcentered') 271 | arg_net.add_argument("--learning-rate-critic", type=float, default=1e-3, 272 | help='Learning rate value of the optimizer. Only used' 273 | 'in rmspropcentered') 274 | 275 | arg_alg = parser.add_argument_group('Algorithm') 276 | arg_alg.add_argument("--features", choices=['relu', 'sigmoid']) 277 | arg_alg.add_argument("--batch-size", type=int, default=64, 278 | help='Batch size for each fit of the network.') 279 | arg_alg.add_argument("--tau", type=float, default=1e-3) 280 | arg_alg.add_argument("--history-length", type=int, default=1, 281 | help='Number of frames composing a state.') 282 | arg_alg.add_argument("--evaluation-frequency", type=int, default=10000, 283 | help='Number of learning step before each evaluation.' 284 | 'This number represents an epoch.') 285 | arg_alg.add_argument("--max-steps", type=int, default=1000000, 286 | help='Total number of learning steps.') 287 | arg_alg.add_argument("--test-samples", type=int, default=5000, 288 | help='Number of steps for each evaluation.') 289 | arg_alg.add_argument("--transfer", type=str, default='', 290 | help='Path to the file of the weights of the common ' 291 | 'layers to be loaded') 292 | arg_alg.add_argument("--save-shared", type=str, default='', 293 | help='filename where to save the shared weights') 294 | arg_alg.add_argument("--unfreeze-epoch", type=int, default=0, 295 | help="Number of epoch where to unfreeze shared weights.") 296 | 297 | arg_utils = parser.add_argument_group('Utils') 298 | arg_utils.add_argument('--use-cuda', action='store_true', 299 | help='Flag specifying whether to use the GPU.') 300 | arg_utils.add_argument('--load', type=str, 301 | help='Path of the model to be loaded.') 302 | arg_utils.add_argument('--save', action='store_true', 303 | help='Flag specifying whether to save the model.') 304 | arg_utils.add_argument('--render', action='store_true', 305 | help='Flag specifying whether to render the game.') 306 | arg_utils.add_argument('--quiet', action='store_true', 307 | help='Flag specifying whether to hide the progress' 308 | 'bar.') 309 | arg_utils.add_argument('--debug', action='store_true', 310 | help='Flag specifying whether the script has to be' 311 | 'run in debug mode.') 312 | arg_utils.add_argument('--postfix', type=str, default='', 313 | help='Flag used to add a postfix to the folder name') 314 | 315 | args = parser.parse_args() 316 | 317 | folder_name = './logs/mujoco_' + datetime.datetime.now().strftime( 318 | '%Y-%m-%d_%H-%M-%S') + args.postfix + '/' 319 | pathlib.Path(folder_name).mkdir(parents=True) 320 | 321 | with open(folder_name + 'args.pkl', 'wb') as f: 322 | pickle.dump(args, f) 323 | 324 | out = Parallel(n_jobs=-1)(delayed(experiment)(i, args) 325 | for i in range(args.n_exp)) 326 | 327 | scores = np.array([o[0] for o in out]) 328 | critic_loss = np.array([o[1] for o in out]) 329 | 330 | np.save(folder_name + 
'scores.npy', scores) 331 | np.save(folder_name + 'critic_loss_raw.npy', critic_loss) 332 | -------------------------------------------------------------------------------- /dqn/dqn.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | from mushroom_rl.core.agent import Agent 7 | from mushroom_rl.approximators.regressor import Regressor 8 | from mushroom_rl.approximators.parametric.torch_approximator import * 9 | 10 | from replay_memory import PrioritizedReplayMemory, ReplayMemory 11 | 12 | 13 | class DQN(Agent): 14 | """ 15 | Deep Q-Network algorithm. 16 | "Human-Level Control Through Deep Reinforcement Learning". 17 | Mnih V. et al.. 2015. 18 | 19 | """ 20 | def __init__(self, approximator, policy, mdp_info, batch_size, 21 | initial_replay_size, max_replay_size, n_actions_per_head, 22 | history_length=4, n_input_per_mdp=None, replay_memory=None, 23 | target_update_frequency=2500, fit_params=None, 24 | approximator_params=None, n_games=1, clip_reward=True, 25 | dtype=np.uint8): 26 | self._fit_params = dict() if fit_params is None else fit_params 27 | 28 | self._batch_size = batch_size 29 | self._n_games = n_games 30 | self._clip_reward = clip_reward 31 | if n_input_per_mdp is None: 32 | self._n_input_per_mdp = [mdp_info.observation_space.shape 33 | for _ in range(self._n_games)] 34 | else: 35 | self._n_input_per_mdp = n_input_per_mdp 36 | self._n_action_per_head = n_actions_per_head 37 | self._history_length = history_length 38 | self._max_actions = max(n_actions_per_head)[0] 39 | self._target_update_frequency = target_update_frequency 40 | 41 | if replay_memory is not None: 42 | self._replay_memory = replay_memory 43 | if isinstance(replay_memory[0], PrioritizedReplayMemory): 44 | self._fit = self._fit_prioritized 45 | else: 46 | self._fit = self._fit_standard 47 | else: 48 | self._replay_memory = [ReplayMemory( 49 | initial_replay_size, max_replay_size) for _ in range(self._n_games) 50 | ] 51 | self._fit = self._fit_standard 52 | 53 | self._n_updates = 0 54 | 55 | apprx_params_train = deepcopy(approximator_params) 56 | apprx_params_target = deepcopy(approximator_params) 57 | self.approximator = Regressor(approximator, **apprx_params_train) 58 | self.target_approximator = Regressor(approximator, 59 | **apprx_params_target) 60 | policy.set_q(self.approximator) 61 | 62 | self.target_approximator.model.set_weights( 63 | self.approximator.model.get_weights()) 64 | 65 | super().__init__(mdp_info, policy) 66 | 67 | n_samples = self._batch_size * self._n_games 68 | self._state_idxs = np.zeros(n_samples, dtype=np.int) 69 | self._state = np.zeros( 70 | ((n_samples, 71 | self._history_length) + self.mdp_info.observation_space.shape), 72 | dtype=dtype 73 | ).squeeze() 74 | self._action = np.zeros((n_samples, 1), dtype=np.int) 75 | self._reward = np.zeros(n_samples) 76 | self._next_state_idxs = np.zeros(n_samples, dtype=np.int) 77 | self._next_state = np.zeros( 78 | ((n_samples, 79 | self._history_length) + self.mdp_info.observation_space.shape), 80 | dtype=dtype 81 | ).squeeze() 82 | self._absorbing = np.zeros(n_samples) 83 | self._idxs = np.zeros(n_samples, dtype=np.int) 84 | self._is_weight = np.zeros(n_samples) 85 | 86 | def fit(self, dataset): 87 | self._fit(dataset) 88 | 89 | self._n_updates += 1 90 | if self._n_updates % self._target_update_frequency == 0: 91 | self._update_target() 92 | 93 | def _fit_standard(self, dataset): 94 | s = np.array([d[0][0] for d 
in dataset]).ravel() 95 | games = np.unique(s) 96 | for g in games: 97 | idxs = np.argwhere(s == g).ravel() 98 | d = list() 99 | for idx in idxs: 100 | d.append(dataset[idx]) 101 | 102 | self._replay_memory[g].add(d) 103 | 104 | fit_condition = np.all([rm.initialized for rm in self._replay_memory]) 105 | 106 | if fit_condition: 107 | for i in range(len(self._replay_memory)): 108 | game_state, game_action, game_reward, game_next_state,\ 109 | game_absorbing, _ = self._replay_memory[i].get( 110 | self._batch_size) 111 | 112 | start = self._batch_size * i 113 | stop = start + self._batch_size 114 | 115 | self._state_idxs[start:stop] = np.ones(self._batch_size) * i 116 | self._state[start:stop, :self._n_input_per_mdp[i][0]] = game_state 117 | self._action[start:stop] = game_action 118 | self._reward[start:stop] = game_reward 119 | self._next_state_idxs[start:stop] = np.ones(self._batch_size) * i 120 | self._next_state[start:stop, :self._n_input_per_mdp[i][0]] = game_next_state 121 | self._absorbing[start:stop] = game_absorbing 122 | 123 | if self._clip_reward: 124 | reward = np.clip(self._reward, -1, 1) 125 | else: 126 | reward = self._reward 127 | 128 | q_next = self._next_q() 129 | q = reward + q_next 130 | 131 | self.approximator.fit(self._state, self._action, q, 132 | idx=self._state_idxs, **self._fit_params) 133 | 134 | def _fit_prioritized(self, dataset): 135 | s = np.array([d[0][0] for d in dataset]).ravel() 136 | games = np.unique(s) 137 | for g in games: 138 | idxs = np.argwhere(s == g).ravel() 139 | d = list() 140 | for idx in idxs: 141 | d.append(dataset[idx]) 142 | 143 | self._replay_memory[g].add( 144 | d, np.ones(len(d)) * self._replay_memory[g].max_priority 145 | ) 146 | 147 | fit_condition = np.all([rm.initialized for rm in self._replay_memory]) 148 | 149 | if fit_condition: 150 | for i in range(len(self._replay_memory)): 151 | game_state, game_action, game_reward, game_next_state,\ 152 | game_absorbing, _, game_idxs, game_is_weight =\ 153 | self._replay_memory[i].get(self._batch_size) 154 | 155 | start = self._batch_size * i 156 | stop = start + self._batch_size 157 | 158 | self._state_idxs[start:stop] = np.ones(self._batch_size) * i 159 | self._state[start:stop, :self._n_input_per_mdp[i][0]] = game_state 160 | self._action[start:stop] = game_action 161 | self._reward[start:stop] = game_reward 162 | self._next_state_idxs[start:stop] = np.ones(self._batch_size) * i 163 | self._next_state[start:stop, :self._n_input_per_mdp[i][0]] = game_next_state 164 | self._absorbing[start:stop] = game_absorbing 165 | self._idxs[start:stop] = game_idxs 166 | self._is_weight[start:stop] = game_is_weight 167 | 168 | if self._clip_reward: 169 | reward = np.clip(self._reward, -1, 1) 170 | else: 171 | reward = self._reward 172 | 173 | q_next = self._next_q() 174 | q = reward + q_next 175 | q_current = self.approximator.predict(self._state, self._action, 176 | idx=self._state_idxs) 177 | td_error = q - q_current 178 | 179 | for er in self._replay_memory: 180 | er.update(td_error, self._idxs) 181 | 182 | self.approximator.fit(self._state, self._action, q, 183 | weights=self._is_weight, 184 | idx=self._state_idxs, 185 | **self._fit_params) 186 | 187 | def get_shared_weights(self): 188 | return self.approximator.model.network.get_shared_weights() 189 | 190 | def set_shared_weights(self, weights): 191 | self.approximator.model.network.set_shared_weights(weights) 192 | 193 | def freeze_shared_weights(self): 194 | self.approximator.model.network.freeze_shared_weights() 195 | 196 | def 
unfreeze_shared_weights(self): 197 | self.approximator.model.network.unfreeze_shared_weights() 198 | 199 | def _update_target(self): 200 | """ 201 | Update the target network. 202 | 203 | """ 204 | self.target_approximator.model.set_weights( 205 | self.approximator.model.get_weights()) 206 | 207 | def _next_q(self): 208 | q = self.target_approximator.predict(self._next_state, 209 | idx=self._next_state_idxs) 210 | 211 | out_q = np.zeros(self._batch_size * self._n_games) 212 | 213 | for i in range(self._n_games): 214 | start = self._batch_size * i 215 | stop = start + self._batch_size 216 | if np.any(self._absorbing[start:stop]): 217 | q[start:stop] *= 1 - self._absorbing[start:stop].reshape(-1, 1) 218 | 219 | n_actions = self._n_action_per_head[i][0] 220 | out_q[start:stop] = np.max(q[start:stop, :n_actions], axis=1) 221 | out_q[start:stop] *= self.mdp_info.gamma[i] 222 | 223 | return out_q 224 | 225 | 226 | class DoubleDQN(DQN): 227 | """ 228 | Double DQN algorithm. 229 | "Deep Reinforcement Learning with Double Q-Learning". 230 | Hasselt H. V. et al.. 2016. 231 | 232 | """ 233 | def _next_q(self): 234 | q = self.approximator.predict(self._next_state, 235 | idx=self._next_state_idxs) 236 | out_q = np.zeros(self._batch_size * self._n_games) 237 | 238 | for i in range(self._n_games): 239 | start = self._batch_size * i 240 | stop = start + self._batch_size 241 | n_actions = self._n_action_per_head[i][0] 242 | max_a = np.argmax(q[start:stop, :n_actions], axis=1) 243 | 244 | double_q = self.target_approximator.predict( 245 | self._next_state[start:stop], max_a, 246 | idx=self._next_state_idxs[start:stop] 247 | ) 248 | if np.any(self._absorbing[start:stop]): 249 | double_q *= 1 - self._absorbing[start:stop].reshape(-1, 1) 250 | 251 | out_q[start:stop] = double_q * self.mdp_info.gamma[i] 252 | 253 | return out_q 254 | -------------------------------------------------------------------------------- /dqn/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | class LossFunction(object): 6 | def __init__(self, n_games, batch_size, eval_frequency): 7 | self._n_games = n_games 8 | self._batch_size = batch_size 9 | self._eval_frequency = eval_frequency 10 | 11 | self._losses = list() 12 | self._reg_losses = list() 13 | self._counter = 0 14 | 15 | def get_losses(self): 16 | return self._losses 17 | 18 | def get_reg_losses(self): 19 | return self._reg_losses 20 | 21 | def __call__(self, yhat, y, reduction='mean'): 22 | loss = F.smooth_l1_loss(yhat, y, reduce=False) 23 | 24 | if self._need_log(): 25 | temp_losses = list() 26 | 27 | for i in range(self._n_games): 28 | start = i * self._batch_size 29 | stop = start + self._batch_size 30 | temp_losses.append(torch.mean(loss[start:stop]).item()) 31 | 32 | self._losses.append(temp_losses) 33 | 34 | if reduction is 'none': 35 | return loss 36 | elif reduction is 'mean': 37 | return loss.mean() 38 | else: 39 | raise NotImplementedError 40 | 41 | def _need_log(self): 42 | self._counter += 1 43 | if self._counter >= self._eval_frequency: 44 | self._counter = 0 45 | return True 46 | else: 47 | return False 48 | -------------------------------------------------------------------------------- /dqn/networks.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | import numpy as np 6 | 7 | 8 | class AtariNetwork(nn.Module): 9 | n_features = 512 10 | 11 | 
def __init__(self, input_shape, _, n_actions_per_head, use_cuda, n_games, 12 | features, dropout): 13 | super().__init__() 14 | 15 | self._n_input = input_shape 16 | self._n_games = n_games 17 | self._max_actions = max(n_actions_per_head)[0] 18 | self._features = features 19 | self._use_cuda = use_cuda 20 | self._n_shared = 2 21 | 22 | self._h1 = nn.ModuleList( 23 | [nn.Conv2d(self._n_input[0], 32, kernel_size=8, stride=4) for _ in range( 24 | self._n_games)] 25 | ) 26 | self._h2 = nn.ModuleList( 27 | [nn.Conv2d(32, 64, kernel_size=4, stride=2) for _ in range( 28 | self._n_games)] 29 | ) 30 | self._h3 = nn.ModuleList( 31 | [nn.Conv2d(64, 64, kernel_size=3, stride=1) for _ in range( 32 | self._n_games)] 33 | ) 34 | self._h4 = nn.Linear(3136, self.n_features) 35 | self._h5 = nn.ModuleList( 36 | [nn.Linear(self.n_features, self._max_actions) for _ in range( 37 | self._n_games)] 38 | ) 39 | 40 | nn.init.xavier_uniform_(self._h4.weight, 41 | gain=nn.init.calculate_gain('relu')) 42 | for i in range(self._n_games): 43 | nn.init.xavier_uniform_(self._h1[i].weight, 44 | gain=nn.init.calculate_gain('relu')) 45 | nn.init.xavier_uniform_(self._h2[i].weight, 46 | gain=nn.init.calculate_gain('relu')) 47 | nn.init.xavier_uniform_(self._h3[i].weight, 48 | gain=nn.init.calculate_gain('relu')) 49 | nn.init.xavier_uniform_(self._h5[i].weight, 50 | gain=nn.init.calculate_gain('linear')) 51 | 52 | def forward(self, state, action=None, idx=None): 53 | state = state.float() / 255. 54 | 55 | h = list() 56 | for i in np.unique(idx): 57 | idxs = np.argwhere(idx == i).ravel() 58 | h_f = F.relu( 59 | self._h1[i](state[idxs, :self._n_input[0]]) 60 | ) 61 | h_f = F.relu(self._h2[i](h_f)) 62 | h.append(F.relu(self._h3[i](h_f))) 63 | cat_h3 = torch.cat(h) 64 | 65 | if self._features == 'relu': 66 | h_f = F.relu(self._h4(cat_h3.view(-1, 3136))) 67 | elif self._features == 'sigmoid': 68 | h_f = torch.sigmoid(self._h4(cat_h3.view(-1, 3136))) 69 | else: 70 | raise ValueError 71 | 72 | q = [self._h5[i](h_f) for i in range(self._n_games)] 73 | q = torch.stack(q, dim=1) 74 | 75 | if action is not None: 76 | action = action.long() 77 | q_acted = torch.squeeze( 78 | q.gather(2, action.repeat(1, self._n_games).unsqueeze(-1)), -1) 79 | 80 | q = q_acted 81 | 82 | if idx is not None: 83 | idx = torch.from_numpy(idx) 84 | if self._use_cuda: 85 | idx = idx.cuda() 86 | if q.dim() == 2: 87 | q_idx = q.gather(1, idx.unsqueeze(-1)) 88 | else: 89 | q_idx = q.gather(1, idx.view(-1, 1).repeat( 90 | 1, self._max_actions).unsqueeze(1)) 91 | 92 | q = torch.squeeze(q_idx, 1) 93 | 94 | return q 95 | 96 | def get_shared_weights(self): 97 | p1 = list() 98 | 99 | for p in self._h4.parameters(): 100 | p1.append(p.data.detach().cpu().numpy()) 101 | 102 | return p1 103 | 104 | def set_shared_weights(self, weights): 105 | w1 = weights 106 | 107 | for p, w in zip(self._h4.parameters(), w1): 108 | w_tensor = torch.from_numpy(w).type(p.data.dtype) 109 | if self._use_cuda: 110 | w_tensor = w_tensor.cuda() 111 | p.data = w_tensor 112 | 113 | def freeze_shared_weights(self): 114 | for p in self._h4.parameters(): 115 | p.requires_grad = False 116 | 117 | def unfreeze_shared_weights(self): 118 | for p in self._h4.parameters(): 119 | p.requires_grad = True 120 | 121 | 122 | class GymNetwork(nn.Module): 123 | def __init__(self, input_shape, _, n_actions_per_head, use_cuda, features, 124 | dropout, n_features=80): 125 | super().__init__() 126 | 127 | self._n_input = input_shape 128 | self._n_games = len(n_actions_per_head) 129 | self._max_actions = 
max(n_actions_per_head)[0] 130 | self._use_cuda = use_cuda 131 | self._n_shared = 4 132 | self._features = features 133 | 134 | self._h1 = nn.ModuleList( 135 | [nn.Linear(self._n_input[i][0], n_features) for i in range( 136 | len(input_shape))] 137 | ) 138 | self._h2 = nn.Linear(n_features, n_features) 139 | self._h3 = nn.Linear(n_features, n_features) 140 | self._h4 = nn.ModuleList( 141 | [nn.Linear(n_features, self._max_actions) for _ in range( 142 | self._n_games)] 143 | ) 144 | 145 | nn.init.xavier_uniform_(self._h2.weight, 146 | gain=nn.init.calculate_gain('relu')) 147 | nn.init.xavier_uniform_(self._h3.weight, 148 | gain=nn.init.calculate_gain('relu')) 149 | for i in range(self._n_games): 150 | nn.init.xavier_uniform_(self._h1[i].weight, 151 | gain=nn.init.calculate_gain('relu')) 152 | nn.init.xavier_uniform_(self._h4[i].weight, 153 | gain=nn.init.calculate_gain('linear')) 154 | 155 | def forward(self, state, action=None, idx=None): 156 | state = state.float() 157 | 158 | h1 = list() 159 | for i in np.unique(idx): 160 | idxs = np.argwhere(idx == i).ravel() 161 | h1.append(F.relu(self._h1[i](state[idxs, :self._n_input[i][0]]))) 162 | cat_h1 = torch.cat(h1) 163 | 164 | h_f = F.relu(self._h2(cat_h1)) 165 | 166 | if self._features == 'relu': 167 | h_f = F.relu(self._h3(h_f)) 168 | elif self._features == 'sigmoid': 169 | h_f = torch.sigmoid(self._h3(h_f)) 170 | else: 171 | raise ValueError 172 | 173 | q = [self._h4[i](h_f) for i in range(self._n_games)] 174 | q = torch.stack(q, dim=1) 175 | 176 | if action is not None: 177 | action = action.long() 178 | q_acted = torch.squeeze( 179 | q.gather(2, action.repeat(1, self._n_games).unsqueeze(-1)), -1) 180 | 181 | q = q_acted 182 | 183 | if idx is not None: 184 | idx = torch.from_numpy(idx) 185 | if self._use_cuda: 186 | idx = idx.cuda() 187 | if q.dim() == 2: 188 | q_idx = q.gather(1, idx.unsqueeze(-1)) 189 | else: 190 | q_idx = q.gather(1, idx.view(-1, 1).repeat( 191 | 1, self._max_actions).unsqueeze(1)) 192 | 193 | q = torch.squeeze(q_idx, 1) 194 | 195 | return q 196 | 197 | def get_shared_weights(self): 198 | p2 = list() 199 | p3 = list() 200 | 201 | for p in self._h2.parameters(): 202 | p2.append(p.data.detach().cpu().numpy()) 203 | 204 | for p in self._h3.parameters(): 205 | p3.append(p.data.detach().cpu().numpy()) 206 | 207 | return p2, p3 208 | 209 | def set_shared_weights(self, weights): 210 | w2, w3 = weights 211 | 212 | for p, w in zip(self._h2.parameters(), w2): 213 | w_tensor = torch.from_numpy(w).type(p.data.dtype) 214 | if self._use_cuda: 215 | w_tensor = w_tensor.cuda() 216 | p.data = w_tensor 217 | 218 | for p, w in zip(self._h3.parameters(), w3): 219 | w_tensor = torch.from_numpy(w).type(p.data.dtype) 220 | if self._use_cuda: 221 | w_tensor = w_tensor.cuda() 222 | p.data = w_tensor 223 | 224 | def freeze_shared_weights(self): 225 | for p in self._h2.parameters(): 226 | p.requires_grad = False 227 | for p in self._h3.parameters(): 228 | p.requires_grad = False 229 | 230 | def unfreeze_shared_weights(self): 231 | for p in self._h2.parameters(): 232 | p.requires_grad = True 233 | for p in self._h3.parameters(): 234 | p.requires_grad = True 235 | -------------------------------------------------------------------------------- /dqn/run_atari.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import pathlib 4 | import sys 5 | 6 | from joblib import delayed, Parallel 7 | import numpy as np 8 | import torch.optim as optim 9 | 10 | import pickle 11 | 12 | 
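# The parent directory is put on the path so that the modules shared by all
# experiments at the repository root (core.py, policy.py, replay_memory.py)
# can be imported below.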
sys.path.append('..') 13 | 14 | from mushroom_rl.approximators.parametric.torch_approximator import TorchApproximator 15 | from mushroom_rl.environments import * 16 | from mushroom_rl.utils.dataset import compute_metrics 17 | from mushroom_rl.utils.parameters import LinearParameter, Parameter 18 | 19 | from core import Core 20 | from dqn import DQN, DoubleDQN 21 | from policy import EpsGreedyMultiple 22 | from networks import AtariNetwork 23 | from losses import LossFunction 24 | from replay_memory import PrioritizedReplayMemory 25 | 26 | """ 27 | This script runs Atari experiments with DQN as presented in: 28 | "Human-Level Control Through Deep Reinforcement Learning". Mnih V. et al.. 2015. 29 | 30 | """ 31 | 32 | 33 | def print_epoch(epoch): 34 | print('################################################################') 35 | print('Epoch: ', epoch) 36 | print('----------------------------------------------------------------') 37 | 38 | 39 | def get_stats(dataset, idx, games): 40 | score = compute_metrics(dataset) 41 | print(games[idx] + ': min_reward: %f, max_reward: %f, mean_reward: %f,' 42 | ' games_completed: %d' % score) 43 | 44 | return score 45 | 46 | 47 | def experiment(args, idx): 48 | np.random.seed() 49 | 50 | args.games = [''.join(g) for g in args.games] 51 | 52 | # MDP 53 | mdp = list() 54 | for i, g in enumerate(args.games): 55 | mdp.append(Atari(g)) 56 | 57 | n_actions_per_head = [(m.info.action_space.n,) for m in mdp] 58 | 59 | max_obs_dim = 0 60 | max_act_n = 0 61 | for i in range(len(args.games)): 62 | n = mdp[i].info.observation_space.shape[0] 63 | m = mdp[i].info.action_space.n 64 | if n > max_obs_dim: 65 | max_obs_dim = n 66 | max_obs_idx = i 67 | if m > max_act_n: 68 | max_act_n = m 69 | max_act_idx = i 70 | gammas = [m.info.gamma for m in mdp] 71 | horizons = [m.info.horizon for m in mdp] 72 | mdp_info = MDPInfo(mdp[max_obs_idx].info.observation_space, 73 | mdp[max_act_idx].info.action_space, gammas, horizons) 74 | 75 | scores = list() 76 | for _ in range(len(args.games)): 77 | scores.append(list()) 78 | 79 | optimizer = dict() 80 | if args.optimizer == 'adam': 81 | optimizer['class'] = optim.Adam 82 | optimizer['params'] = dict(lr=args.learning_rate, 83 | eps=args.epsilon) 84 | elif args.optimizer == 'adadelta': 85 | optimizer['class'] = optim.Adadelta 86 | optimizer['params'] = dict(lr=args.learning_rate, 87 | eps=args.epsilon) 88 | elif args.optimizer == 'rmsprop': 89 | optimizer['class'] = optim.RMSprop 90 | optimizer['params'] = dict(lr=args.learning_rate, 91 | alpha=args.decay, 92 | eps=args.epsilon) 93 | elif args.optimizer == 'rmspropcentered': 94 | optimizer['class'] = optim.RMSprop 95 | optimizer['params'] = dict(lr=args.learning_rate, 96 | alpha=args.decay, 97 | eps=args.epsilon, 98 | centered=True) 99 | else: 100 | raise ValueError 101 | 102 | # DQN learning run 103 | 104 | # Settings 105 | if args.debug: 106 | initial_replay_size = args.batch_size 107 | max_replay_size = 500 108 | train_frequency = 5 109 | target_update_frequency = 10 110 | test_samples = 20 111 | evaluation_frequency = 50 112 | max_steps = 1000 113 | else: 114 | initial_replay_size = args.initial_replay_size 115 | max_replay_size = args.max_replay_size 116 | train_frequency = args.train_frequency 117 | target_update_frequency = args.target_update_frequency 118 | test_samples = args.test_samples 119 | evaluation_frequency = args.evaluation_frequency 120 | max_steps = args.max_steps 121 | 122 | # Policy 123 | epsilon = LinearParameter(value=args.initial_exploration_rate, 124 | 
threshold_value=args.final_exploration_rate, 125 | n=args.final_exploration_frame) 126 | epsilon_test = Parameter(value=args.test_exploration_rate) 127 | epsilon_random = Parameter(value=1) 128 | pi = EpsGreedyMultiple(parameter=epsilon, 129 | n_actions_per_head=n_actions_per_head) 130 | 131 | # Approximator 132 | n_games = len(args.games) 133 | loss = LossFunction(n_games, args.batch_size, 134 | args.evaluation_frequency) 135 | 136 | input_shape = (args.history_length, args.screen_height, 137 | args.screen_width) 138 | approximator_params = dict( 139 | network=AtariNetwork, 140 | input_shape=input_shape, 141 | output_shape=(max(n_actions_per_head)[0],), 142 | n_actions=max(n_actions_per_head)[0], 143 | n_actions_per_head=n_actions_per_head, 144 | n_games=len(args.games), 145 | optimizer=optimizer, 146 | loss=loss, 147 | use_cuda=args.use_cuda, 148 | features=args.features 149 | ) 150 | 151 | approximator = TorchApproximator 152 | 153 | if args.prioritized: 154 | replay_memory = [PrioritizedReplayMemory( 155 | initial_replay_size, max_replay_size, alpha=.6, 156 | beta=LinearParameter(.4, threshold_value=1, 157 | n=max_steps // train_frequency) 158 | ) for _ in range(n_games)] 159 | else: 160 | replay_memory = None 161 | 162 | # Agent 163 | algorithm_params = dict( 164 | batch_size=args.batch_size, 165 | n_games=len(args.games), 166 | initial_replay_size=initial_replay_size, 167 | max_replay_size=max_replay_size, 168 | target_update_frequency=target_update_frequency // train_frequency, 169 | replay_memory=replay_memory, 170 | n_actions_per_head=n_actions_per_head, 171 | clip_reward=True, 172 | history_length=args.history_length 173 | ) 174 | 175 | if args.algorithm == 'dqn': 176 | agent = DQN(approximator, pi, mdp_info, 177 | approximator_params=approximator_params, 178 | **algorithm_params) 179 | elif args.algorithm == 'ddqn': 180 | agent = DoubleDQN(approximator, pi, mdp_info, 181 | approximator_params=approximator_params, 182 | **algorithm_params) 183 | 184 | # Algorithm 185 | core = Core(agent, mdp) 186 | 187 | # RUN 188 | 189 | # Fill replay memory with random dataset 190 | print_epoch(0) 191 | pi.set_parameter(epsilon_random) 192 | core.learn(n_steps=initial_replay_size, 193 | n_steps_per_fit=initial_replay_size, quiet=args.quiet) 194 | 195 | if args.transfer: 196 | weights = pickle.load(open(args.transfer, 'rb')) 197 | agent.set_shared_weights(weights) 198 | 199 | if args.load: 200 | weights = np.load(args.load) 201 | agent.approximator.set_weights(weights) 202 | 203 | # Evaluate initial policy 204 | pi.set_parameter(epsilon_test) 205 | dataset = core.evaluate(n_steps=test_samples, render=args.render, 206 | quiet=args.quiet) 207 | for i in range(len(mdp)): 208 | d = dataset[i::len(mdp)] 209 | scores[i].append(get_stats(d, i, args.games)[2]) 210 | 211 | if args.unfreeze_epoch > 0: 212 | agent.freeze_shared_weights() 213 | 214 | best_score_sum = -np.inf 215 | best_weights = None 216 | 217 | np.save(folder_name + 'scores-exp-%d.npy' % idx, scores) 218 | np.save(folder_name + 'loss-exp-%d.npy' % idx, 219 | agent.approximator.model._loss.get_losses()) 220 | 221 | for n_epoch in range(1, max_steps // evaluation_frequency + 1): 222 | if n_epoch >= args.unfreeze_epoch > 0: 223 | agent.unfreeze_shared_weights() 224 | 225 | print_epoch(n_epoch) 226 | print('- Learning:') 227 | # learning step 228 | pi.set_parameter(None) 229 | core.learn(n_steps=evaluation_frequency, 230 | n_steps_per_fit=train_frequency, quiet=args.quiet) 231 | 232 | print('- Evaluation:') 233 | # evaluation step 234 | 
pi.set_parameter(epsilon_test) 235 | dataset = core.evaluate(n_steps=test_samples, 236 | render=args.render, quiet=args.quiet) 237 | 238 | current_score_sum = 0 239 | for i in range(len(mdp)): 240 | d = dataset[i::len(mdp)] 241 | current_score = get_stats(d, i, args.games)[2] 242 | scores[i].append(current_score) 243 | current_score_sum += current_score 244 | 245 | # Save shared weights if best score 246 | if args.save_shared and current_score_sum >= best_score_sum: 247 | best_score_sum = current_score_sum 248 | best_weights = agent.get_shared_weights() 249 | 250 | if args.save: 251 | np.save(folder_name + 'weights-exp-%d-%d.npy' % (idx, n_epoch), 252 | agent.approximator.get_weights()) 253 | 254 | np.save(folder_name + 'scores-exp-%d.npy' % idx, scores) 255 | np.save(folder_name + 'loss-exp-%d.npy' % idx, 256 | agent.approximator.model._loss.get_losses()) 257 | 258 | if args.save_shared: 259 | pickle.dump(best_weights, open(args.save_shared, 'wb')) 260 | 261 | return scores, agent.approximator.model._loss.get_losses() 262 | 263 | 264 | if __name__ == '__main__': 265 | # Argument parser 266 | parser = argparse.ArgumentParser() 267 | 268 | arg_game = parser.add_argument_group('Game') 269 | arg_game.add_argument("--games", 270 | type=list, 271 | nargs='+', 272 | default=['BreakoutNoFrameskip-v4'], 273 | help='Gym ID of the problem.') 274 | arg_game.add_argument("--screen-width", type=int, default=84, 275 | help='Width of the game screen.') 276 | arg_game.add_argument("--screen-height", type=int, default=84, 277 | help='Height of the game screen.') 278 | arg_game.add_argument("--n-exp", type=int) 279 | 280 | arg_mem = parser.add_argument_group('Replay Memory') 281 | arg_mem.add_argument("--initial-replay-size", type=int, default=50000, 282 | help='Initial size of the replay memory.') 283 | arg_mem.add_argument("--max-replay-size", type=int, default=500000, 284 | help='Max size of the replay memory.') 285 | arg_mem.add_argument("--prioritized", action='store_true', 286 | help='Whether to use prioritized memory or not.') 287 | 288 | arg_net = parser.add_argument_group('Deep Q-Network') 289 | arg_net.add_argument("--optimizer", 290 | choices=['adadelta', 291 | 'adam', 292 | 'rmsprop', 293 | 'rmspropcentered'], 294 | default='adam', 295 | help='Name of the optimizer to use to learn.') 296 | arg_net.add_argument("--learning-rate", type=float, default=.00025, 297 | help='Learning rate value of the optimizer. Only used' 298 | 'in rmspropcentered') 299 | arg_net.add_argument("--decay", type=float, default=.95, 300 | help='Discount factor for the history coming from the' 301 | 'gradient momentum in rmspropcentered') 302 | arg_net.add_argument("--epsilon", type=float, default=1e-8, 303 | help='Epsilon term used in rmspropcentered') 304 | 305 | arg_alg = parser.add_argument_group('Algorithm') 306 | arg_alg.add_argument("--algorithm", choices=['dqn', 'ddqn'], 307 | default='dqn', 308 | help='Name of the algorithm. 
dqn is for standard' 309 | 'DQN, ddqn is for Double DQN and adqn is for' 310 | 'Averaged DQN.') 311 | arg_alg.add_argument("--features", choices=['relu', 'sigmoid']) 312 | arg_alg.add_argument("--batch-size", type=int, default=32, 313 | help='Batch size for each fit of the network.') 314 | arg_alg.add_argument("--history-length", type=int, default=4, 315 | help='Number of frames composing a state.') 316 | arg_alg.add_argument("--target-update-frequency", type=int, default=10000, 317 | help='Number of collected samples before each update' 318 | 'of the target network.') 319 | arg_alg.add_argument("--evaluation-frequency", type=int, default=250000, 320 | help='Number of learning step before each evaluation.' 321 | 'This number represents an epoch.') 322 | arg_alg.add_argument("--train-frequency", type=int, default=4, 323 | help='Number of learning steps before each fit of the' 324 | 'neural network.') 325 | arg_alg.add_argument("--max-steps", type=int, default=50000000, 326 | help='Total number of learning steps.') 327 | arg_alg.add_argument("--final-exploration-frame", type=int, default=1000000, 328 | help='Number of steps until the exploration rate stops' 329 | 'decreasing.') 330 | arg_alg.add_argument("--initial-exploration-rate", type=float, default=1., 331 | help='Initial value of the exploration rate.') 332 | arg_alg.add_argument("--final-exploration-rate", type=float, default=.1, 333 | help='Final value of the exploration rate. When it' 334 | 'reaches this values, it stays constant.') 335 | arg_alg.add_argument("--test-exploration-rate", type=float, default=.05, 336 | help='Exploration rate used during evaluation.') 337 | arg_alg.add_argument("--test-samples", type=int, default=125000, 338 | help='Number of steps for each evaluation.') 339 | arg_alg.add_argument("--max-no-op-actions", type=int, default=30, 340 | help='Maximum number of no-op action performed at the' 341 | 'beginning of the episodes. 
The minimum number is' 342 | 'history_length.') 343 | arg_alg.add_argument("--transfer", type=str, default='', 344 | help='Path to the file of the weights of the common ' 345 | 'layers to be loaded') 346 | arg_alg.add_argument("--save-shared", type=str, default='', 347 | help='filename where to save the shared weights') 348 | arg_alg.add_argument("--unfreeze-epoch", type=int, default=0, 349 | help="Number of epoch where to unfreeze shared weights.") 350 | 351 | arg_utils = parser.add_argument_group('Utils') 352 | arg_utils.add_argument('--use-cuda', action='store_true', 353 | help='Flag specifying whether to use the GPU.') 354 | arg_utils.add_argument('--load', type=str, 355 | help='Path of the model to be loaded.') 356 | arg_utils.add_argument('--save', action='store_true', 357 | help='Flag specifying whether to save the model.') 358 | arg_utils.add_argument('--render', action='store_true', 359 | help='Flag specifying whether to render the game.') 360 | arg_utils.add_argument('--quiet', action='store_true', 361 | help='Flag specifying whether to hide the progress' 362 | 'bar.') 363 | arg_utils.add_argument('--debug', action='store_true', 364 | help='Flag specifying whether the script has to be' 365 | 'run in debug mode.') 366 | arg_utils.add_argument('--postfix', type=str, default='', 367 | help='Flag used to add a postfix to the folder name') 368 | 369 | args = parser.parse_args() 370 | 371 | folder_name = './logs/gym_' + datetime.datetime.now().strftime( 372 | '%Y-%m-%d_%H-%M-%S') + args.postfix + '/' 373 | pathlib.Path(folder_name).mkdir(parents=True) 374 | with open(folder_name + 'args.pkl', 'wb') as f: 375 | pickle.dump(args, f) 376 | 377 | out = Parallel(n_jobs=-1)(delayed(experiment)(args, i) 378 | for i in range(args.n_exp)) 379 | 380 | scores = np.array([o[0] for o in out]) 381 | loss = np.array([o[1] for o in out]) 382 | 383 | np.save(folder_name + 'scores.npy', scores) 384 | np.save(folder_name + 'loss.npy', loss) 385 | -------------------------------------------------------------------------------- /dqn/run_gym.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import pathlib 4 | import sys 5 | 6 | from joblib import delayed, Parallel 7 | import numpy as np 8 | import torch.optim as optim 9 | 10 | import pickle 11 | 12 | sys.path.append('..') 13 | 14 | from mushroom_rl.approximators.parametric.torch_approximator import TorchApproximator 15 | from mushroom_rl.core.environment import MDPInfo 16 | from mushroom_rl.environments import * 17 | from mushroom_rl.utils.dataset import compute_J 18 | from mushroom_rl.utils.parameters import LinearParameter, Parameter 19 | 20 | from core import Core 21 | from dqn import DQN, DoubleDQN 22 | from policy import EpsGreedyMultiple 23 | from networks import GymNetwork 24 | from losses import LossFunction 25 | from replay_memory import PrioritizedReplayMemory 26 | 27 | """ 28 | This script runs Atari experiments with DQN as presented in: 29 | "Human-Level Control Through Deep Reinforcement Learning". Mnih V. et al.. 2015. 
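Here the same agent is adapted to the multi-task setting on Gym-style control
problems (Acrobot, CartPole, Car on Hill, ...) instead of Atari.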
30 | 31 | """ 32 | 33 | 34 | def print_epoch(epoch): 35 | print('################################################################') 36 | print('Epoch: ', epoch) 37 | print('----------------------------------------------------------------') 38 | 39 | 40 | def get_stats(dataset, gamma, idx, games): 41 | J = np.mean(compute_J(dataset, gamma[idx])) 42 | print(games[idx] + ': J: %f' % J) 43 | 44 | return J 45 | 46 | 47 | def experiment(args, idx): 48 | np.random.seed() 49 | 50 | args.games = [''.join(g) for g in args.games] 51 | 52 | # MDP 53 | mdp = list() 54 | gamma_eval = list() 55 | for i, g in enumerate(args.games): 56 | if g == 'pendulum': 57 | mdp.append(CartPole(horizon=args.horizon[i], gamma=args.gamma[i])) 58 | elif g == 'caronhill': 59 | mdp.append(CarOnHill(horizon=args.horizon[i], gamma=args.gamma[i])) 60 | else: 61 | mdp.append(Gym(g, args.horizon[i], args.gamma[i])) 62 | 63 | gamma_eval.append(args.gamma[i]) 64 | 65 | n_input_per_mdp = [m.info.observation_space.shape for m in mdp] 66 | n_actions_per_head = [(m.info.action_space.n,) for m in mdp] 67 | 68 | max_obs_dim = 0 69 | max_act_n = 0 70 | for i in range(len(args.games)): 71 | n = mdp[i].info.observation_space.shape[0] 72 | m = mdp[i].info.action_space.n 73 | if n > max_obs_dim: 74 | max_obs_dim = n 75 | max_obs_idx = i 76 | if m > max_act_n: 77 | max_act_n = m 78 | max_act_idx = i 79 | gammas = [m.info.gamma for m in mdp] 80 | horizons = [m.info.horizon for m in mdp] 81 | mdp_info = MDPInfo(mdp[max_obs_idx].info.observation_space, 82 | mdp[max_act_idx].info.action_space, gammas, horizons) 83 | 84 | scores = list() 85 | for _ in range(len(args.games)): 86 | scores.append(list()) 87 | 88 | optimizer = dict() 89 | if args.optimizer == 'adam': 90 | optimizer['class'] = optim.Adam 91 | optimizer['params'] = dict(lr=args.learning_rate, 92 | eps=args.epsilon) 93 | elif args.optimizer == 'adadelta': 94 | optimizer['class'] = optim.Adadelta 95 | optimizer['params'] = dict(lr=args.learning_rate, 96 | eps=args.epsilon) 97 | elif args.optimizer == 'rmsprop': 98 | optimizer['class'] = optim.RMSprop 99 | optimizer['params'] = dict(lr=args.learning_rate, 100 | alpha=args.decay, 101 | eps=args.epsilon) 102 | elif args.optimizer == 'rmspropcentered': 103 | optimizer['class'] = optim.RMSprop 104 | optimizer['params'] = dict(lr=args.learning_rate, 105 | alpha=args.decay, 106 | eps=args.epsilon, 107 | centered=True) 108 | else: 109 | raise ValueError 110 | 111 | # DQN learning run 112 | 113 | # Settings 114 | if args.debug: 115 | initial_replay_size = args.batch_size 116 | max_replay_size = 500 117 | train_frequency = 5 118 | target_update_frequency = 10 119 | test_samples = 20 120 | evaluation_frequency = 50 121 | max_steps = 1000 122 | else: 123 | initial_replay_size = args.initial_replay_size 124 | max_replay_size = args.max_replay_size 125 | train_frequency = args.train_frequency 126 | target_update_frequency = args.target_update_frequency 127 | test_samples = args.test_samples 128 | evaluation_frequency = args.evaluation_frequency 129 | max_steps = args.max_steps 130 | 131 | # Policy 132 | epsilon = LinearParameter(value=args.initial_exploration_rate, 133 | threshold_value=args.final_exploration_rate, 134 | n=args.final_exploration_frame) 135 | epsilon_test = Parameter(value=args.test_exploration_rate) 136 | epsilon_random = Parameter(value=1) 137 | pi = EpsGreedyMultiple(parameter=epsilon, 138 | n_actions_per_head=n_actions_per_head) 139 | 140 | # Approximator 141 | input_shape = [m.info.observation_space.shape for m in mdp] 142 | 
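    # input_shape holds one observation shape per task (e.g. (6,) for
    # Acrobot-v1); GymNetwork creates a separate input layer for each entry
    # and shares its two hidden layers across all tasks.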
n_games = len(args.games) 143 | loss = LossFunction(n_games, args.batch_size, 144 | args.evaluation_frequency) 145 | 146 | approximator_params = dict( 147 | network=GymNetwork, 148 | input_shape=input_shape, 149 | output_shape=(max(n_actions_per_head)[0],), 150 | n_actions=max(n_actions_per_head)[0], 151 | n_actions_per_head=n_actions_per_head, 152 | optimizer=optimizer, 153 | loss=loss, 154 | use_cuda=args.use_cuda, 155 | features=args.features 156 | ) 157 | 158 | approximator = TorchApproximator 159 | 160 | if args.prioritized: 161 | replay_memory = [PrioritizedReplayMemory( 162 | initial_replay_size, max_replay_size, alpha=.6, 163 | beta=LinearParameter(.4, threshold_value=1, 164 | n=max_steps // train_frequency) 165 | ) for _ in range(n_games)] 166 | else: 167 | replay_memory = None 168 | 169 | # Agent 170 | algorithm_params = dict( 171 | batch_size=args.batch_size, 172 | n_games=len(args.games), 173 | initial_replay_size=initial_replay_size, 174 | max_replay_size=max_replay_size, 175 | target_update_frequency=target_update_frequency // train_frequency, 176 | replay_memory=replay_memory, 177 | n_input_per_mdp=n_input_per_mdp, 178 | n_actions_per_head=n_actions_per_head, 179 | clip_reward=False, 180 | history_length=args.history_length, 181 | dtype=np.float32 182 | ) 183 | 184 | if args.algorithm == 'dqn': 185 | agent = DQN(approximator, pi, mdp_info, 186 | approximator_params=approximator_params, 187 | **algorithm_params) 188 | elif args.algorithm == 'ddqn': 189 | agent = DoubleDQN(approximator, pi, mdp_info, 190 | approximator_params=approximator_params, 191 | **algorithm_params) 192 | 193 | # Algorithm 194 | core = Core(agent, mdp) 195 | 196 | # RUN 197 | 198 | # Fill replay memory with random dataset 199 | print_epoch(0) 200 | pi.set_parameter(epsilon_random) 201 | core.learn(n_steps=initial_replay_size, 202 | n_steps_per_fit=initial_replay_size, quiet=args.quiet) 203 | 204 | if args.transfer: 205 | weights = pickle.load(open(args.transfer, 'rb')) 206 | agent.set_shared_weights(weights) 207 | 208 | if args.load: 209 | weights = np.load(args.load) 210 | agent.approximator.set_weights(weights) 211 | 212 | # Evaluate initial policy 213 | pi.set_parameter(epsilon_test) 214 | dataset = core.evaluate(n_steps=test_samples, render=args.render, 215 | quiet=args.quiet) 216 | for i in range(len(mdp)): 217 | d = dataset[i::len(mdp)] 218 | scores[i].append(get_stats(d, gamma_eval, i, args.games)) 219 | 220 | if args.unfreeze_epoch > 0: 221 | agent.freeze_shared_weights() 222 | 223 | best_score_sum = -np.inf 224 | best_weights = None 225 | 226 | np.save(folder_name + 'scores-exp-%d.npy' % idx, scores) 227 | np.save(folder_name + 'loss-exp-%d.npy' % idx, 228 | agent.approximator.model._loss.get_losses()) 229 | 230 | for n_epoch in range(1, max_steps // evaluation_frequency + 1): 231 | if n_epoch >= args.unfreeze_epoch > 0: 232 | agent.unfreeze_shared_weights() 233 | 234 | print_epoch(n_epoch) 235 | print('- Learning:') 236 | # learning step 237 | pi.set_parameter(None) 238 | core.learn(n_steps=evaluation_frequency, 239 | n_steps_per_fit=train_frequency, quiet=args.quiet) 240 | 241 | print('- Evaluation:') 242 | # evaluation step 243 | pi.set_parameter(epsilon_test) 244 | dataset = core.evaluate(n_steps=test_samples, 245 | render=args.render, quiet=args.quiet) 246 | 247 | current_score_sum = 0 248 | for i in range(len(mdp)): 249 | d = dataset[i::len(mdp)] 250 | current_score = get_stats(d, gamma_eval, i, args.games) 251 | scores[i].append(current_score) 252 | current_score_sum += current_score 253 
| 254 | # Save shared weights if best score 255 | if args.save_shared and current_score_sum >= best_score_sum: 256 | best_score_sum = current_score_sum 257 | best_weights = agent.get_shared_weights() 258 | 259 | if args.save: 260 | np.save(folder_name + 'weights-exp-%d-%d.npy' % (idx, n_epoch), 261 | agent.approximator.get_weights()) 262 | 263 | np.save(folder_name + 'scores-exp-%d.npy' % idx, scores) 264 | np.save(folder_name + 'loss-exp-%d.npy' % idx, 265 | agent.approximator.model._loss.get_losses()) 266 | 267 | if args.save_shared: 268 | pickle.dump(best_weights, open(args.save_shared, 'wb')) 269 | 270 | return scores, agent.approximator.model._loss.get_losses() 271 | 272 | 273 | if __name__ == '__main__': 274 | # Argument parser 275 | parser = argparse.ArgumentParser() 276 | 277 | arg_game = parser.add_argument_group('Game') 278 | arg_game.add_argument("--games", 279 | type=list, 280 | nargs='+', 281 | default=['Acrobot-v1'], 282 | help='Gym ID of the problem.') 283 | arg_game.add_argument("--horizon", type=int, nargs='+') 284 | arg_game.add_argument("--gamma", type=float, nargs='+') 285 | arg_game.add_argument("--n-exp", type=int) 286 | 287 | arg_mem = parser.add_argument_group('Replay Memory') 288 | arg_mem.add_argument("--initial-replay-size", type=int, default=100, 289 | help='Initial size of the replay memory.') 290 | arg_mem.add_argument("--max-replay-size", type=int, default=5000, 291 | help='Max size of the replay memory.') 292 | arg_mem.add_argument("--prioritized", action='store_true', 293 | help='Whether to use prioritized memory or not.') 294 | 295 | arg_net = parser.add_argument_group('Deep Q-Network') 296 | arg_net.add_argument("--optimizer", 297 | choices=['adadelta', 298 | 'adam', 299 | 'rmsprop', 300 | 'rmspropcentered'], 301 | default='adam', 302 | help='Name of the optimizer to use to learn.') 303 | arg_net.add_argument("--learning-rate", type=float, default=.001, 304 | help='Learning rate value of the optimizer. Only used' 305 | 'in rmspropcentered') 306 | arg_net.add_argument("--decay", type=float, default=.95, 307 | help='Discount factor for the history coming from the' 308 | 'gradient momentum in rmspropcentered') 309 | arg_net.add_argument("--epsilon", type=float, default=1e-8, 310 | help='Epsilon term used in rmspropcentered') 311 | 312 | arg_alg = parser.add_argument_group('Algorithm') 313 | arg_alg.add_argument("--algorithm", choices=['dqn', 'ddqn'], 314 | default='dqn', 315 | help='Name of the algorithm. dqn is for standard' 316 | 'DQN, ddqn is for Double DQN and adqn is for' 317 | 'Averaged DQN.') 318 | arg_alg.add_argument("--features", choices=['relu', 'sigmoid']) 319 | arg_alg.add_argument("--batch-size", type=int, default=100, 320 | help='Batch size for each fit of the network.') 321 | arg_alg.add_argument("--history-length", type=int, default=1, 322 | help='Number of frames composing a state.') 323 | arg_alg.add_argument("--target-update-frequency", type=int, default=100, 324 | help='Number of collected samples before each update' 325 | 'of the target network.') 326 | arg_alg.add_argument("--evaluation-frequency", type=int, default=1000, 327 | help='Number of learning step before each evaluation.' 
328 | 'This number represents an epoch.') 329 | arg_alg.add_argument("--train-frequency", type=int, default=1, 330 | help='Number of learning steps before each fit of the' 331 | 'neural network.') 332 | arg_alg.add_argument("--max-steps", type=int, default=50000, 333 | help='Total number of learning steps.') 334 | arg_alg.add_argument("--final-exploration-frame", type=int, default=5000, 335 | help='Number of steps until the exploration rate stops' 336 | 'decreasing.') 337 | arg_alg.add_argument("--initial-exploration-rate", type=float, default=1., 338 | help='Initial value of the exploration rate.') 339 | arg_alg.add_argument("--final-exploration-rate", type=float, default=.01, 340 | help='Final value of the exploration rate. When it' 341 | 'reaches this values, it stays constant.') 342 | arg_alg.add_argument("--test-exploration-rate", type=float, default=0., 343 | help='Exploration rate used during evaluation.') 344 | arg_alg.add_argument("--test-samples", type=int, default=2000, 345 | help='Number of steps for each evaluation.') 346 | arg_alg.add_argument("--max-no-op-actions", type=int, default=0, 347 | help='Maximum number of no-op action performed at the' 348 | 'beginning of the episodes. The minimum number is' 349 | 'history_length.') 350 | arg_alg.add_argument("--transfer", type=str, default='', 351 | help='Path to the file of the weights of the common ' 352 | 'layers to be loaded') 353 | arg_alg.add_argument("--save-shared", type=str, default='', 354 | help='filename where to save the shared weights') 355 | arg_alg.add_argument("--unfreeze-epoch", type=int, default=0, 356 | help="Number of epoch where to unfreeze shared weights.") 357 | 358 | arg_utils = parser.add_argument_group('Utils') 359 | arg_utils.add_argument('--use-cuda', action='store_true', 360 | help='Flag specifying whether to use the GPU.') 361 | arg_utils.add_argument('--load', type=str, 362 | help='Path of the model to be loaded.') 363 | arg_utils.add_argument('--save', action='store_true', 364 | help='Flag specifying whether to save the model.') 365 | arg_utils.add_argument('--render', action='store_true', 366 | help='Flag specifying whether to render the game.') 367 | arg_utils.add_argument('--quiet', action='store_true', 368 | help='Flag specifying whether to hide the progress' 369 | 'bar.') 370 | arg_utils.add_argument('--debug', action='store_true', 371 | help='Flag specifying whether the script has to be' 372 | 'run in debug mode.') 373 | arg_utils.add_argument('--postfix', type=str, default='', 374 | help='Flag used to add a postfix to the folder name') 375 | 376 | args = parser.parse_args() 377 | 378 | folder_name = './logs/gym_' + datetime.datetime.now().strftime( 379 | '%Y-%m-%d_%H-%M-%S') + args.postfix + '/' 380 | pathlib.Path(folder_name).mkdir(parents=True) 381 | with open(folder_name + 'args.pkl', 'wb') as f: 382 | pickle.dump(args, f) 383 | 384 | out = Parallel(n_jobs=-1)(delayed(experiment)(args, i) 385 | for i in range(args.n_exp)) 386 | 387 | scores = np.array([o[0] for o in out]) 388 | loss = np.array([o[1] for o in out]) 389 | 390 | np.save(folder_name + 'scores.npy', scores) 391 | np.save(folder_name + 'loss.npy', loss) 392 | -------------------------------------------------------------------------------- /fqi/car_on_hill.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.integrate import odeint 3 | 4 | from mushroom_rl.environments import Environment, MDPInfo 5 | from mushroom_rl.utils import spaces 6 | 7 | 8 | class 
CarOnHill(Environment): 9 | """ 10 | The Car On Hill environment as presented in: 11 | "Tree-Based Batch Mode Reinforcement Learning". Ernst D. et al.. 2005. 12 | 13 | """ 14 | def __init__(self, m, g, a, horizon=100, gamma=.95): 15 | """ 16 | Constructor. 17 | 18 | """ 19 | # MDP parameters 20 | self.max_pos = 1. 21 | self.max_velocity = 3. 22 | high = np.array([self.max_pos, self.max_velocity]) 23 | self._g = g 24 | self._m = m 25 | self._dt = .1 26 | self._discrete_actions = [-a, a] 27 | 28 | # MDP properties 29 | observation_space = spaces.Box(low=-high, high=high) 30 | action_space = spaces.Discrete(2) 31 | mdp_info = MDPInfo(observation_space, action_space, gamma, horizon) 32 | 33 | super().__init__(mdp_info) 34 | 35 | def reset(self, state=None): 36 | if state is None: 37 | self._state = np.array([-0.5, 0]) 38 | else: 39 | self._state = state 40 | 41 | return self._state 42 | 43 | def step(self, action): 44 | action = self._discrete_actions[action[0]] 45 | sa = np.append(self._state, action) 46 | new_state = odeint(self._dpds, sa, [0, self._dt]) 47 | 48 | self._state = new_state[-1, :-1] 49 | 50 | if self._state[0] < -self.max_pos or \ 51 | np.abs(self._state[1]) > self.max_velocity: 52 | reward = -1 53 | absorbing = True 54 | elif self._state[0] > self.max_pos and \ 55 | np.abs(self._state[1]) <= self.max_velocity: 56 | reward = 1 57 | absorbing = True 58 | else: 59 | reward = 0 60 | absorbing = False 61 | 62 | return self._state, reward, absorbing, {} 63 | 64 | def _dpds(self, state_action, t): 65 | position = state_action[0] 66 | velocity = state_action[1] 67 | u = state_action[-1] 68 | 69 | if position < 0.: 70 | diff_hill = 2 * position + 1 71 | diff_2_hill = 2 72 | else: 73 | diff_hill = 1 / ((1 + 5 * position ** 2) ** 1.5) 74 | diff_2_hill = (-15 * position) / ((1 + 5 * position ** 2) ** 2.5) 75 | 76 | dp = velocity 77 | ds = (u - self._g * self._m * diff_hill - velocity ** 2 * self._m * 78 | diff_hill * diff_2_hill) / (self._m * (1 + diff_hill ** 2)) 79 | 80 | return dp, ds, 0. 
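# Minimal usage sketch: run_coh.py builds several variants of this MDP by
# varying the mass m and the action magnitude a (e.g. CarOnHill(1, 9.81, 4));
# the guard below only rolls a few random steps and is not used by the
# experiments.
if __name__ == '__main__':
    mdp = CarOnHill(m=1., g=9.81, a=4.)
    state = mdp.reset()
    for _ in range(5):
        # the action is an index into the discrete action set [-a, +a]
        action = np.array([np.random.randint(2)])
        state, reward, absorbing, _ = mdp.step(action)
        print(state, reward, absorbing)
        if absorbing:
            break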
81 | -------------------------------------------------------------------------------- /fqi/dataset_0.800_4.000.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/dataset_0.800_4.000.pkl -------------------------------------------------------------------------------- /fqi/dataset_0.850_4.000.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/dataset_0.850_4.000.pkl -------------------------------------------------------------------------------- /fqi/dataset_0.900_4.000.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/dataset_0.900_4.000.pkl -------------------------------------------------------------------------------- /fqi/dataset_0.950_4.000.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/dataset_0.950_4.000.pkl -------------------------------------------------------------------------------- /fqi/dataset_1.000_4.000.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/dataset_1.000_4.000.pkl -------------------------------------------------------------------------------- /fqi/dataset_1.000_4.125.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/dataset_1.000_4.125.pkl -------------------------------------------------------------------------------- /fqi/dataset_1.000_4.250.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/dataset_1.000_4.250.pkl -------------------------------------------------------------------------------- /fqi/dataset_1.000_4.375.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/dataset_1.000_4.375.pkl -------------------------------------------------------------------------------- /fqi/dataset_1.000_4.500.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/dataset_1.000_4.500.pkl -------------------------------------------------------------------------------- /fqi/dataset_1.050_4.500.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/dataset_1.050_4.500.pkl -------------------------------------------------------------------------------- /fqi/dataset_1.100_4.500.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/dataset_1.100_4.500.pkl -------------------------------------------------------------------------------- 
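Each dataset_<m>_<a>.pkl file stores the transition dataset collected on the
Car-on-Hill variant with mass m and action magnitude a, following the naming
built in run_coh.py ('%1.3f_%1.3f' % (m._m, m._discrete_actions[-1])). A
minimal loading sketch, assuming the standard MushroomRL transition-list
format that parse_dataset expects:

    import pickle

    with open('dataset_1.000_4.000.pkl', 'rb') as f:
        # list of (state, action, reward, next_state, absorbing, last) tuples
        transitions = pickle.load(f)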
/fqi/dataset_1.150_4.500.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/dataset_1.150_4.500.pkl -------------------------------------------------------------------------------- /fqi/dataset_1.200_4.375.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/dataset_1.200_4.375.pkl -------------------------------------------------------------------------------- /fqi/dataset_1.200_4.500.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/dataset_1.200_4.500.pkl -------------------------------------------------------------------------------- /fqi/dataset_1.200_4.625.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/dataset_1.200_4.625.pkl -------------------------------------------------------------------------------- /fqi/dataset_1.200_4.750.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/dataset_1.200_4.750.pkl -------------------------------------------------------------------------------- /fqi/fqi.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from mushroom.algorithms.value.batch_td import BatchTD 4 | 5 | 6 | 7 | class FQI(BatchTD): 8 | """ 9 | Fitted Q-Iteration algorithm. 10 | "Tree-Based Batch Mode Reinforcement Learning", Ernst D. et al.. 2005. 11 | 12 | """ 13 | def __init__(self, approximator, policy, mdp_info, n_iterations, 14 | n_actions_per_head, fit_params=None, 15 | approximator_params=None, quiet=False): 16 | """ 17 | Constructor. 18 | 19 | Args: 20 | n_iterations (int): number of iterations to perform for training; 21 | quiet (bool, False): whether to show the progress bar or not. 22 | 23 | """ 24 | self._n_iterations = n_iterations 25 | self._n_actions_per_head = n_actions_per_head 26 | self._n_games = len(self._n_actions_per_head) 27 | self._quiet = quiet 28 | 29 | super().__init__(mdp_info, policy, approximator, approximator_params, 30 | fit_params) 31 | 32 | self._target = None 33 | 34 | def _fit(self, state, action, reward, next_state, absorbing, idxs): 35 | """ 36 | Single fit iteration. 37 | 38 | Args: 39 | x (list): the dataset. 
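            In this multi-task variant the dataset is passed already parsed
            (state, action, reward, next_state, absorbing) together with idxs,
            the per-sample task indices used to select the network head.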
40 | 41 | """ 42 | if self._target is None: 43 | self._target = reward.copy() 44 | else: 45 | q = self.approximator.predict(next_state, idx=idxs) 46 | if np.any(absorbing): 47 | q *= 1 - absorbing.reshape(-1, 1) 48 | 49 | max_q = np.max(q, axis=1) 50 | self._target = reward + self.mdp_info.gamma * max_q 51 | 52 | self.approximator.fit(state, action, self._target, idx=idxs, 53 | **self._fit_params) 54 | -------------------------------------------------------------------------------- /fqi/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | class LossFunction(object): 6 | def __init__(self, n_games): 7 | self._n_games = n_games 8 | 9 | self._losses = list() 10 | self._counter = 0 11 | 12 | def get_losses(self): 13 | return self._losses 14 | 15 | def __call__(self, yhat, y): 16 | loss = F.mse_loss(yhat, y, reduce=True) 17 | 18 | return loss 19 | 20 | def _need_log(self): 21 | self._counter += 1 22 | if self._counter >= self._eval_frequency: 23 | self._counter = 0 24 | return True 25 | else: 26 | return False 27 | -------------------------------------------------------------------------------- /fqi/networks.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | import numpy as np 6 | 7 | 8 | class Network(nn.Module): 9 | def __init__(self, input_shape, output_shape, use_cuda, features, 10 | dropout, n_features=5): 11 | super().__init__() 12 | 13 | self._n_input = input_shape 14 | self._n_output = output_shape 15 | self._n_games = len(self._n_input) 16 | self._use_cuda = use_cuda 17 | self._n_shared = 2 18 | self._features = features 19 | 20 | self._h1 = nn.ModuleList( 21 | [nn.Linear(self._n_input[i][0], n_features) for i in range( 22 | len(input_shape))] 23 | ) 24 | self._h2 = nn.Linear(n_features, n_features) 25 | self._q = nn.ModuleList( 26 | [nn.Linear(n_features, self._n_output[i][0]) for i in range( 27 | self._n_games)] 28 | ) 29 | 30 | self.weights_init() 31 | 32 | def forward(self, state, action=None, idx=None): 33 | state = state.float() 34 | 35 | h1 = list() 36 | for i in np.unique(idx): 37 | idxs = np.argwhere(idx == i).ravel() 38 | h1.append(torch.sigmoid(self._h1[i](state[idxs, :self._n_input[i][0]]))) 39 | cat_h1 = torch.cat(h1) 40 | 41 | if self._features == 'relu': 42 | h_f = F.relu(self._h2(cat_h1)) 43 | elif self._features == 'sigmoid': 44 | h_f = torch.sigmoid(self._h2(cat_h1)) 45 | else: 46 | raise ValueError 47 | 48 | q = [torch.tanh(self._q[i](h_f)) for i in range(self._n_games)] 49 | q = torch.stack(q, dim=1) 50 | 51 | if action is not None: 52 | action = action.long() 53 | q_acted = torch.squeeze( 54 | q.gather(2, action.repeat(1, self._n_games).unsqueeze(-1)), -1) 55 | 56 | q = q_acted 57 | 58 | if idx is not None: 59 | idx = torch.from_numpy(idx) 60 | if self._use_cuda: 61 | idx = idx.cuda() 62 | if q.dim() == 2: 63 | q_idx = q.gather(1, idx.unsqueeze(-1)) 64 | else: 65 | q_idx = q.gather(1, idx.view(-1, 1).repeat( 66 | 1, self._n_output[0][0]).unsqueeze(1)) 67 | 68 | q = torch.squeeze(q_idx, 1) 69 | 70 | return q 71 | 72 | def get_shared_weights(self): 73 | p2 = list() 74 | 75 | for p in self._h2.parameters(): 76 | p2.append(p.data.detach().cpu().numpy()) 77 | 78 | return p2 79 | 80 | def weights_init(self): 81 | nn.init.xavier_uniform_(self._h2.weight, 82 | gain=nn.init.calculate_gain('relu')) 83 | for i in range(self._n_games): 84 | 
nn.init.xavier_uniform_(self._h1[i].weight, 85 | gain=nn.init.calculate_gain('relu')) 86 | nn.init.xavier_uniform_(self._q[i].weight, 87 | gain=nn.init.calculate_gain('linear')) 88 | 89 | def set_shared_weights(self, weights): 90 | w2 = weights 91 | 92 | for p, w in zip(self._h2.parameters(), w2): 93 | w_tensor = torch.from_numpy(w).type(p.data.dtype) 94 | if self._use_cuda: 95 | w_tensor = w_tensor.cuda() 96 | p.data = w_tensor 97 | 98 | def freeze_shared_weights(self): 99 | for p in self._h2.parameters(): 100 | p.requires_grad = False 101 | 102 | def unfreeze_shared_weights(self): 103 | for p in self._h2.parameters(): 104 | p.requires_grad = True 105 | -------------------------------------------------------------------------------- /fqi/run_coh.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import pickle 3 | import sys 4 | 5 | import numpy as np 6 | from joblib import Parallel, delayed 7 | import torch.optim as optim 8 | from tqdm import trange 9 | 10 | sys.path.append('..') 11 | 12 | from mushroom.approximators.parametric.torch_approximator import TorchApproximator 13 | from mushroom.utils.dataset import compute_J, parse_dataset 14 | from mushroom.utils.parameters import Parameter 15 | 16 | from car_on_hill import CarOnHill 17 | from core import Core 18 | from fqi import FQI 19 | from losses import LossFunction 20 | from networks import Network 21 | from policy import EpsGreedyMultiple 22 | from solver import solve_car_on_hill 23 | 24 | """ 25 | This script aims to replicate the experiments on the Car on Hill MDP as 26 | presented in: 27 | "Tree-Based Batch Mode Reinforcement Learning", Ernst D. et al.. 2005. 28 | 29 | """ 30 | 31 | 32 | def get_stats(dataset, gamma): 33 | J = np.mean(compute_J(dataset, gamma)) 34 | 35 | return J 36 | 37 | 38 | def experiment(mdp, test_states, test_actions, test_q, names): 39 | np.random.seed() 40 | 41 | n_games = len(mdp) 42 | input_shape = [(m.info.observation_space.shape[0],) for m in mdp] 43 | n_actions_per_head = [(m.info.action_space.n,) for m in mdp] 44 | 45 | test_states = np.array([test_states]).repeat(len(mdp), 0).reshape(-1, 2) 46 | test_actions = np.array([test_actions]).repeat(len(mdp), 0).reshape(-1, 1) 47 | test_idxs = np.ones(len(test_states), dtype=np.int) * np.arange(len(mdp)).repeat( 48 | len(test_states) // len(mdp), 0) 49 | 50 | # Policy 51 | epsilon = Parameter(value=1.) 
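    # epsilon=1 is only a placeholder: FQI is trained from the pre-collected
    # datasets, and before every evaluation the script switches the policy to
    # Parameter(0.), i.e. fully greedy.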
52 | pi = EpsGreedyMultiple(parameter=epsilon, 53 | n_actions_per_head=n_actions_per_head) 54 | 55 | # Approximator 56 | optimizer = {'class': optim.Adam, 'params': dict()} 57 | loss = LossFunction(n_games) 58 | 59 | approximator_params = dict( 60 | network=Network, 61 | input_shape=input_shape, 62 | output_shape=n_actions_per_head, 63 | optimizer=optimizer, 64 | loss=loss, 65 | features='sigmoid', 66 | n_features=30, 67 | use_cuda=True, 68 | quiet=False 69 | ) 70 | 71 | approximator = TorchApproximator 72 | 73 | dataset = list() 74 | len_datasets = list() 75 | for i in range(len(mdp)): 76 | d = pickle.load(open('dataset_%s.pkl' % names[i], 'rb')) 77 | len_datasets.append(len(d)) 78 | dataset += d 79 | 80 | # Agent 81 | algorithm_params = dict(n_iterations=1, 82 | n_actions_per_head=n_actions_per_head, 83 | fit_params=dict(patience=100, epsilon=1e-6)) 84 | agent = FQI(approximator, pi, mdp[0].info, 85 | approximator_params=approximator_params, **algorithm_params) 86 | 87 | qs = list() 88 | scores = list() 89 | 90 | idxs = list() 91 | for i, l in enumerate(len_datasets): 92 | idxs += (np.ones(l, dtype=np.int) * i).tolist() 93 | idxs = np.array(idxs) 94 | 95 | state, action, reward, next_state, absorbing, _ = parse_dataset(dataset) 96 | for _ in trange(50, dynamic_ncols=True, disable=False, leave=False): 97 | agent._fit(state, action, reward, next_state, absorbing, idxs) 98 | # Algorithm 99 | core = Core(agent, mdp) 100 | test_epsilon = Parameter(0.) 101 | pi.set_parameter(test_epsilon) 102 | dataset = core.evaluate(n_steps=100) 103 | 104 | qs.append(agent.approximator.predict(test_states, test_actions, 105 | idx=test_idxs)) 106 | scores.append(np.mean(compute_J(dataset, mdp[0].info.gamma))) 107 | 108 | qs_hat = np.array(qs) 109 | avi_diff = list() 110 | for i in range(len(qs_hat)): 111 | avi_diff.append(np.linalg.norm(qs_hat[i] - test_q, ord=1) / len(test_q)) 112 | 113 | print(avi_diff, scores) 114 | 115 | return avi_diff, scores 116 | 117 | 118 | if __name__ == '__main__': 119 | n_exp = 100 120 | 121 | use_mdp = np.array([0]) # , 4, 8, 13, 1, 2, 3, 5, 6, 7, 9, 10, 11, 12, 14, 15]) 122 | 123 | load_test_q = True 124 | 125 | # MDP 126 | all_mdps = [ 127 | CarOnHill(1, 9.81, 4), CarOnHill(1, 9.81, 4.125), CarOnHill(1, 9.81, 4.25), CarOnHill(1, 9.81, 4.375), 128 | CarOnHill(.8, 9.81, 4), CarOnHill(.85, 9.81, 4), CarOnHill(.9, 9.81, 4), CarOnHill(.95, 9.81, 4), 129 | CarOnHill(1, 9.81, 4.5), CarOnHill(1.05, 9.81, 4.5), CarOnHill(1.1, 9.81, 4.5), CarOnHill(1.15, 9.81, 4.5), 130 | CarOnHill(1.2, 9.81, 4.375), CarOnHill(1.2, 9.81, 4.5), CarOnHill(1.2, 9.81, 4.625), CarOnHill(1.2, 9.81, 4.75) 131 | ] 132 | 133 | mdp = list() 134 | for i in use_mdp: 135 | mdp.append(all_mdps[i]) 136 | 137 | names = ['%1.3f_%1.3f' % (m._m, m._discrete_actions[-1]) for m in mdp] 138 | 139 | test_states_0 = np.linspace(mdp[0].info.observation_space.low[0], 140 | mdp[0].info.observation_space.high[0], 10) 141 | test_states_1 = np.linspace(mdp[0].info.observation_space.low[1], 142 | mdp[0].info.observation_space.high[1], 10) 143 | test_states = list() 144 | for s0 in test_states_0: 145 | for s1 in test_states_1: 146 | test_states += [s0, s1] 147 | test_states = np.array([test_states]).repeat(2, 0).reshape(-1, 2) 148 | test_actions = np.array( 149 | [np.zeros(len(test_states) // 2), 150 | np.ones(len(test_states) // 2)]).reshape(-1, 1).astype(np.int) 151 | 152 | # Test Q 153 | test_q = list() 154 | if not load_test_q: 155 | for i, j in enumerate(use_mdp): 156 | current_test_q = solve_car_on_hill(all_mdps[j], test_states, 
157 | test_actions, 158 | all_mdps[j].info.gamma) 159 | np.save('test_q_%s.npy' % names[i], current_test_q) 160 | 161 | test_q += current_test_q 162 | else: 163 | for i in range(len(mdp)): 164 | test_q += np.load('test_q_%s.npy' % names[i]).tolist() 165 | 166 | test_q = np.array(test_q) 167 | 168 | folder_name = './logs/%s/' % ''.join(names) 169 | pathlib.Path(folder_name).mkdir(parents=True, exist_ok=True) 170 | 171 | out = Parallel(n_jobs=8)(delayed(experiment)( 172 | mdp, test_states, test_actions, test_q, names) for i in range(n_exp)) 173 | 174 | avi_diff = np.array([o[0] for o in out]) 175 | scores = np.array([o[1] for o in out]) 176 | 177 | np.save(folder_name + 'avi_diff.npy', avi_diff) 178 | np.save(folder_name + 'scores.npy', scores) 179 | -------------------------------------------------------------------------------- /fqi/solver.py: -------------------------------------------------------------------------------- 1 | def step(mdp, state, action): 2 | mdp.reset(state) 3 | 4 | return mdp.step(action) 5 | 6 | 7 | def bfs(mdp, frontier, k, max_k): 8 | if len(frontier) == 0 or k == max_k: 9 | return False, k 10 | 11 | new_frontier = list() 12 | for f in frontier: 13 | s, r, _, _ = step(mdp, f, [0]) 14 | if r == 1: 15 | return True, k 16 | elif r == 0: 17 | new_frontier.append(s) 18 | 19 | s, r, _, _ = step(mdp, f, [1]) 20 | if r == 1: 21 | return True, k 22 | elif r == 0: 23 | new_frontier.append(s) 24 | 25 | return bfs(mdp, new_frontier, k + 1, max_k) 26 | 27 | 28 | def solve_car_on_hill(mdp, states, actions, gamma, max_k=50): 29 | q = list() 30 | for s, a in zip(states, actions): 31 | mdp.reset(s) 32 | state, reward, _, _ = mdp.step(a) 33 | 34 | if reward == 1: 35 | k = 1 36 | success = True 37 | elif reward == -1: 38 | k = 1 39 | success = False 40 | else: 41 | success, k = bfs(mdp, [state], 2, max_k) 42 | 43 | if success: 44 | q.append(gamma ** (k - 1)) 45 | else: 46 | q.append(-gamma ** (k - 1)) 47 | 48 | return q 49 | -------------------------------------------------------------------------------- /fqi/test_q_0.800_4.000.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/test_q_0.800_4.000.npy -------------------------------------------------------------------------------- /fqi/test_q_0.850_4.000.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/test_q_0.850_4.000.npy -------------------------------------------------------------------------------- /fqi/test_q_0.900_4.000.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/test_q_0.900_4.000.npy -------------------------------------------------------------------------------- /fqi/test_q_0.950_4.000.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/test_q_0.950_4.000.npy -------------------------------------------------------------------------------- /fqi/test_q_1.000_4.000.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/test_q_1.000_4.000.npy 
-------------------------------------------------------------------------------- /fqi/test_q_1.000_4.125.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/test_q_1.000_4.125.npy -------------------------------------------------------------------------------- /fqi/test_q_1.000_4.250.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/test_q_1.000_4.250.npy -------------------------------------------------------------------------------- /fqi/test_q_1.000_4.375.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/test_q_1.000_4.375.npy -------------------------------------------------------------------------------- /fqi/test_q_1.000_4.500.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/test_q_1.000_4.500.npy -------------------------------------------------------------------------------- /fqi/test_q_1.050_4.500.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/test_q_1.050_4.500.npy -------------------------------------------------------------------------------- /fqi/test_q_1.100_4.500.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/test_q_1.100_4.500.npy -------------------------------------------------------------------------------- /fqi/test_q_1.150_4.500.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/test_q_1.150_4.500.npy -------------------------------------------------------------------------------- /fqi/test_q_1.200_4.375.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/test_q_1.200_4.375.npy -------------------------------------------------------------------------------- /fqi/test_q_1.200_4.500.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/test_q_1.200_4.500.npy -------------------------------------------------------------------------------- /fqi/test_q_1.200_4.625.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/test_q_1.200_4.625.npy -------------------------------------------------------------------------------- /fqi/test_q_1.200_4.750.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/test_q_1.200_4.750.npy -------------------------------------------------------------------------------- /policy.py: 
-------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | import numpy as np 4 | 5 | from mushroom_rl.policy import ParametricPolicy, TDPolicy 6 | from mushroom_rl.utils.parameters import Parameter 7 | 8 | 9 | class Multiple(TDPolicy): 10 | def __init__(self, parameter, n_actions_per_head): 11 | super().__init__() 12 | 13 | assert isinstance(parameter, Parameter) and\ 14 | isinstance(n_actions_per_head, list) or isinstance(n_actions_per_head, 15 | np.ndarray) 16 | self._n_actions_per_head = n_actions_per_head 17 | 18 | n_heads = len(n_actions_per_head) 19 | 20 | if isinstance(parameter, list): 21 | self._explorative_pars = deepcopy(parameter) 22 | else: 23 | self._explorative_pars = [deepcopy(parameter) for _ in range(n_heads)] 24 | self._pars = [None] * n_heads 25 | 26 | def set_parameter(self, parameter): 27 | assert isinstance(parameter, Parameter) or parameter is None 28 | 29 | if parameter is None: 30 | for i in range(len(self._pars)): 31 | self._pars[i] = self._explorative_pars[i] 32 | else: 33 | for i in range(len(self._pars)): 34 | self._pars[i] = parameter 35 | 36 | def update(self, state): 37 | idx = state[0] 38 | self._pars[idx].update(state) 39 | 40 | 41 | class EpsGreedyMultiple(Multiple): 42 | def __call__(self, *args): 43 | idx = args[0] 44 | state = np.array(args[1]) 45 | q = self._approximator.predict( 46 | np.expand_dims(state, axis=0), 47 | idx=idx).ravel()[:self._n_actions_per_head[idx][0]] 48 | max_a = np.argwhere(q == np.max(q)).ravel() 49 | 50 | p = self._epsilon.get_value(state) / self._n_actions_per_head[idx][0] 51 | 52 | if len(args) == 2: 53 | action = args[1] 54 | if action in max_a: 55 | return p + (1. - self._epsilon.get_value(state)) / len(max_a) 56 | else: 57 | return p 58 | else: 59 | probs = np.ones(self._n_actions_per_head[idx][0]) * p 60 | probs[max_a] += (1. 
- self._epsilon.get_value(state)) / len(max_a) 61 | 62 | return probs 63 | 64 | def draw_action(self, state): 65 | idx = state[0] 66 | state = np.array(state[1]) 67 | if not np.random.uniform() < self._pars[idx](state): 68 | q = self._approximator.predict( 69 | state, idx=np.array([idx]))[:self._n_actions_per_head[idx][0]] 70 | max_a = np.argwhere(q == np.max(q)).ravel() 71 | 72 | if len(max_a) > 1: 73 | max_a = np.array([np.random.choice( 74 | max_a[max_a < self._n_actions_per_head[idx][0]] 75 | )]) 76 | 77 | return max_a 78 | 79 | return np.array([np.random.choice(self._n_actions_per_head[idx][0])]) 80 | 81 | 82 | class OrnsteinUhlenbeckPolicy(ParametricPolicy): 83 | def __init__(self, mu, sigma, theta, dt, n_actions_per_head, 84 | max_action_value, x0=None): 85 | 86 | self._approximator = mu 87 | self._sigma = sigma 88 | self._theta = theta 89 | self._dt = dt 90 | self._max_action_value = max_action_value 91 | self._x0 = x0 92 | 93 | self._n_games = len(n_actions_per_head) 94 | 95 | self._n_actions_per_head = n_actions_per_head 96 | 97 | self.eval = None 98 | 99 | def __call__(self, state, action): 100 | raise NotImplementedError 101 | 102 | def draw_action(self, state): 103 | idx = state[0] 104 | state = state[1] 105 | mu = self._approximator.predict(state, idx=np.array([idx])) * self._max_action_value[idx] 106 | 107 | x = self._x_prev[idx] - self._theta * self._x_prev[idx] * self._dt + self._sigma *\ 108 | np.sqrt(self._dt) * np.random.normal(size=self._approximator.output_shape) 109 | self._x_prev[idx] = x 110 | 111 | if not self.eval: 112 | return mu[:self._n_actions_per_head[idx][0]] + x[:self._n_actions_per_head[idx][0]] 113 | else: 114 | return mu[:self._n_actions_per_head[idx][0]] 115 | 116 | def set_weights(self, weights): 117 | self._approximator.set_weights(weights) 118 | 119 | def get_weights(self): 120 | return self._approximator.get_weights() 121 | 122 | @property 123 | def weights_size(self): 124 | return self._approximator.weights_size 125 | 126 | def reset(self): 127 | self._x_prev = list() 128 | for i in range(self._n_games): 129 | self._x_prev.append(self._x0 if self._x0 is not None else np.zeros(self._approximator.output_shape)) 130 | -------------------------------------------------------------------------------- /replay_memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from mushroom_rl.utils.replay_memory import PrioritizedReplayMemory, ReplayMemory, SumTree 4 | 5 | 6 | class ReplayMemory(ReplayMemory): 7 | def add(self, dataset): 8 | for i in range(len(dataset)): 9 | self._states[self._idx] = dataset[i][0][1] 10 | self._actions[self._idx] = dataset[i][1] 11 | self._rewards[self._idx] = dataset[i][2] 12 | self._next_states[self._idx] = dataset[i][3][1] 13 | self._absorbing[self._idx] = dataset[i][4] 14 | self._last[self._idx] = dataset[i][5] 15 | 16 | self._idx += 1 17 | if self._idx == self._max_size: 18 | self._full = True 19 | self._idx = 0 20 | 21 | 22 | class PrioritizedReplayMemory(PrioritizedReplayMemory): 23 | def __init__(self, initial_size, max_size, alpha, beta, 24 | epsilon=.01): 25 | self._initial_size = initial_size 26 | self._max_size = max_size 27 | self._alpha = alpha 28 | self._beta = beta 29 | self._epsilon = epsilon 30 | 31 | self._tree = SumTree(max_size) 32 | 33 | def get(self, n_samples): 34 | states = [None for _ in range(n_samples)] 35 | actions = [None for _ in range(n_samples)] 36 | rewards = [None for _ in range(n_samples)] 37 | next_states = [None for _ in 
range(n_samples)] 38 | absorbing = [None for _ in range(n_samples)] 39 | last = [None for _ in range(n_samples)] 40 | 41 | idxs = np.zeros(n_samples, dtype=int) 42 | priorities = np.zeros(n_samples) 43 | 44 | total_p = self._tree.total_p 45 | segment = total_p / n_samples 46 | 47 | a = np.arange(n_samples) * segment 48 | b = np.arange(1, n_samples + 1) * segment 49 | samples = np.random.uniform(a, b) 50 | for i, s in enumerate(samples): 51 | idx, p, data = self._tree.get(s) 52 | 53 | idxs[i] = idx 54 | priorities[i] = p 55 | states[i], actions[i], rewards[i], next_states[i], absorbing[i],\ 56 | last[i] = data 57 | states[i] = np.array(states[i][1]) 58 | next_states[i] = np.array(next_states[i][1]) 59 | 60 | sampling_probabilities = priorities / self._tree.total_p 61 | is_weight = (self._tree.size * sampling_probabilities) ** -self._beta() 62 | is_weight /= is_weight.max() 63 | 64 | return np.array(states), np.array(actions), np.array(rewards),\ 65 | np.array(next_states), np.array(absorbing), np.array(last),\ 66 | idxs, is_weight 67 | -------------------------------------------------------------------------------- /results/ddpg/multi_pendulum/noreg-sigmoid/scores.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/ddpg/multi_pendulum/noreg-sigmoid/scores.npy -------------------------------------------------------------------------------- /results/ddpg/multi_pendulum/transfer/noreg/unfreeze0-noreg-sigmoid.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/ddpg/multi_pendulum/transfer/noreg/unfreeze0-noreg-sigmoid.npy -------------------------------------------------------------------------------- /results/ddpg/multi_pendulum/transfer/noreg/unfreeze101-noreg-sigmoid.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/ddpg/multi_pendulum/transfer/noreg/unfreeze101-noreg-sigmoid.npy -------------------------------------------------------------------------------- /results/ddpg/multi_pendulum/transfer/noreg/w.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/ddpg/multi_pendulum/transfer/noreg/w.pkl -------------------------------------------------------------------------------- /results/ddpg/multi_walker/noreg-sigmoid/scores.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/ddpg/multi_walker/noreg-sigmoid/scores.npy -------------------------------------------------------------------------------- /results/ddpg/multi_walker/transfer/noreg/unfreeze0-noreg-sigmoid.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/ddpg/multi_walker/transfer/noreg/unfreeze0-noreg-sigmoid.npy -------------------------------------------------------------------------------- /results/ddpg/multi_walker/transfer/noreg/unfreeze101-noreg-sigmoid.npy: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/ddpg/multi_walker/transfer/noreg/unfreeze101-noreg-sigmoid.npy -------------------------------------------------------------------------------- /results/ddpg/multi_walker/transfer/noreg/w.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/ddpg/multi_walker/transfer/noreg/w.pkl -------------------------------------------------------------------------------- /results/ddpg/scores-plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.stats as st 3 | from matplotlib import pyplot as plt 4 | 5 | 6 | def get_mean_and_confidence(data): 7 | mean = np.mean(data, axis=0) 8 | se = st.sem(data, axis=0) 9 | n = len(data) 10 | 11 | interval, _ = st.t.interval(0.95, n-1, scale=se) 12 | 13 | return mean, interval 14 | 15 | show_pendulum = False 16 | leg_idx = 0 if show_pendulum else -1 17 | 18 | if show_pendulum: 19 | alg = 'multi_pendulum' 20 | games = ['InvertedPendulumBulletEnv-v0', 'InvertedDoublePendulumBulletEnv-v0', 21 | 'InvertedPendulumSwingupBulletEnv-v0'] 22 | titles = ['Inverted-Pendulum', 'Inverted-Double-Pendulum', 'Inverted-Pendulum-Swingup'] 23 | else: 24 | alg = 'multi_walker' 25 | games = ['hop_stand', 'walk_walk', 'chee_run'] 26 | titles = ['Hopper', 'Walker', 'Half-Cheetah'] 27 | 28 | reg = ['noreg'] 29 | activation = ['sigmoid'] 30 | 31 | n_games = len(games) 32 | 33 | legend_items = list() 34 | 35 | fig, ax = plt.subplots(1, n_games) 36 | for i, t in enumerate(titles): 37 | ax[i].set_title(t, fontsize=22) 38 | ax[i].grid() 39 | 40 | for r in reg: 41 | for act in activation: 42 | name = r + '-' + act 43 | legend_items.append('single ' + name) 44 | for i, g in enumerate(games): 45 | path = 'single/' + name + '/' + g 46 | a = np.load(path + '.npy') 47 | a_mean, a_err = get_mean_and_confidence(a) 48 | ax[i].plot(a_mean[0], linewidth=3) 49 | ax[i].fill_between(np.arange(len(a_mean[0])), 50 | a_mean[0] - a_err[0], 51 | a_mean[0] + a_err[0], alpha=.5) 52 | 53 | if alg != '': 54 | for r in reg: 55 | for act in activation: 56 | name = r + '-' + act 57 | path = alg + '/' + name + '/' 58 | 59 | legend_items.append(name) 60 | a = np.load(path + 'scores.npy') 61 | a_mean, a_err = get_mean_and_confidence(a) 62 | for i, g in enumerate(games): 63 | ax[i].plot(a_mean[i], linewidth=3) 64 | ax[i].fill_between(np.arange(len(a_mean[i])), a_mean[i] - a_err[i], a_mean[i] + a_err[i], alpha=.5) 65 | ax[i].set_xlabel('#Epochs', fontsize=22) 66 | if i == 0: 67 | ax[i].set_ylabel('Performance', fontsize=22) 68 | for tick in ax[i].xaxis.get_major_ticks(): 69 | tick.label.set_fontsize(22) 70 | tick.label 71 | for tick in ax[i].yaxis.get_major_ticks(): 72 | tick.label.set_fontsize(22) 73 | ax[i].set_xticks([0, 50, 100]) 74 | 75 | ax[leg_idx].legend(['DDPG', 'MULTI'], loc='lower right', fontsize=22) 76 | plt.show() 77 | -------------------------------------------------------------------------------- /results/ddpg/single/noreg-sigmoid/InvertedDoublePendulumBulletEnv-v0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/ddpg/single/noreg-sigmoid/InvertedDoublePendulumBulletEnv-v0.npy 
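Every plotting script under results/ reuses the get_mean_and_confidence helper shown in scores-plot.py above: it averages the score arrays over independent runs (axis 0) and derives a 95% Student-t confidence band from the standard error of the mean. Note that st.t.interval is called with its default loc=0 and only the first (negative) bound is kept, so the returned "error" is the negative half-width; mean - err and mean + err therefore still bracket a symmetric band. A self-contained sketch on synthetic data follows; the array shape and numbers are made up for illustration, although the multi-task scores.npy files do appear to be laid out as (n_runs, n_games, n_epochs).

import numpy as np
import scipy.stats as st

rng = np.random.default_rng(0)
scores = rng.normal(loc=100., scale=10., size=(10, 3, 101))  # 10 runs, 3 games, 101 epochs

mean = scores.mean(axis=0)       # (3, 101) per-game learning curves
se = st.sem(scores, axis=0)      # standard error of the mean over runs
lower, _ = st.t.interval(0.95, len(scores) - 1, scale=se)

half_width = -lower              # positive 95% half-width, same shape as mean
print(mean.shape, half_width.shape)  # (3, 101) (3, 101)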
-------------------------------------------------------------------------------- /results/ddpg/single/noreg-sigmoid/InvertedPendulumBulletEnv-v0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/ddpg/single/noreg-sigmoid/InvertedPendulumBulletEnv-v0.npy -------------------------------------------------------------------------------- /results/ddpg/single/noreg-sigmoid/InvertedPendulumSwingupBulletEnv-v0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/ddpg/single/noreg-sigmoid/InvertedPendulumSwingupBulletEnv-v0.npy -------------------------------------------------------------------------------- /results/ddpg/single/noreg-sigmoid/chee_run.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/ddpg/single/noreg-sigmoid/chee_run.npy -------------------------------------------------------------------------------- /results/ddpg/single/noreg-sigmoid/hop_stand.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/ddpg/single/noreg-sigmoid/hop_stand.npy -------------------------------------------------------------------------------- /results/ddpg/single/noreg-sigmoid/walk_walk.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/ddpg/single/noreg-sigmoid/walk_walk.npy -------------------------------------------------------------------------------- /results/ddpg/transfer-plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.stats as st 3 | from matplotlib import pyplot as plt 4 | 5 | 6 | def get_mean_and_confidence(data): 7 | mean = np.mean(data, axis=0) 8 | se = st.sem(data, axis=0) 9 | n = len(data) 10 | 11 | interval, _ = st.t.interval(0.95, n-1, scale=se) 12 | 13 | return mean, interval 14 | 15 | show_pendulum = False 16 | leg_idx = 0 if show_pendulum else -1 17 | 18 | if show_pendulum: 19 | alg = 'multi_pendulum' 20 | game = 'InvertedDoublePendulumBulletEnv-v0' 21 | title = 'Inverted-Double-Pendulum' 22 | else: 23 | alg = 'multi_walker' 24 | game = 'hop_stand' 25 | title = 'Hopper' 26 | 27 | games = ['noreg'] 28 | game_ids = [0] 29 | reg = ['noreg'] 30 | activation = ['sigmoid'] 31 | n_games = len(games) 32 | unfreezes = [0, 101] 33 | 34 | legend_items = list() 35 | 36 | fig, ax = plt.subplots(n_games, 1) 37 | # for i, g in enumerate(games): 38 | # ax.set_title(g) 39 | # ax.grid() 40 | 41 | for act in activation: 42 | for r in reg: 43 | legend_items.append('No initialization') 44 | path = 'single/' + r + '-' + act + '/' + game 45 | a = np.load(path + '.npy') 46 | a_mean, a_err = get_mean_and_confidence(a) 47 | for i, idx in enumerate(game_ids): 48 | ax.plot(a_mean[idx], linewidth=3) 49 | ax.fill_between(np.arange(101), a_mean[idx] - a_err[idx], a_mean[idx] + a_err[idx], alpha=.5) 50 | 51 | for u in unfreezes: 52 | for i, g in zip(game_ids, games): 53 | if u == 101: 54 | legend_items.append('No unfreeze') 55 | else: 56 | legend_items.append('Unfreeze-' + 
str(u)) 57 | file_path = alg + '/transfer' + '/' + g + '/unfreeze' + str(u) + '-' + r + '-' + act + '.npy' 58 | 59 | a = np.load(file_path) 60 | a_mean, a_err = get_mean_and_confidence(a) 61 | ax.plot(a_mean[0], linewidth=3) 62 | ax.fill_between(np.arange(101), a_mean[0] - a_err[0], a_mean[0] + a_err[0], alpha=.5) 63 | 64 | plt.xlabel('#Epochs', fontsize=35) 65 | plt.ylabel('Performance', fontsize=35) 66 | plt.xticks([0,50,100], fontsize=35) 67 | plt.yticks(fontsize=35) 68 | 69 | plt.grid() 70 | 71 | plt.title(title, fontsize=35) 72 | 73 | plt.legend(legend_items, fontsize=35, loc='best') 74 | 75 | plt.show() 76 | -------------------------------------------------------------------------------- /results/dqn/dqn/noreg-sigmoid/scores.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/dqn/dqn/noreg-sigmoid/scores.npy -------------------------------------------------------------------------------- /results/dqn/multidqn/noreg-sigmoid/scores.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/dqn/multidqn/noreg-sigmoid/scores.npy -------------------------------------------------------------------------------- /results/dqn/scores_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.stats as st 3 | from matplotlib import pyplot as plt 4 | 5 | 6 | def get_mean_and_confidence(data): 7 | mean = np.mean(data, axis=0) 8 | se = st.sem(data, axis=0) 9 | n = len(data) 10 | 11 | interval, _ = st.t.interval(0.95, n-1, scale=se) 12 | 13 | return mean, interval 14 | 15 | folders = ['dqn', 'multidqn'] 16 | games = ['Cart-Pole', 'Acrobot', 'Mountain-Car', 'Car-On-Hill', 'Inverted-Pendulum'] 17 | reg = ['noreg'] 18 | activation = ['sigmoid'] 19 | n_games = len(games) 20 | n_settings = len(reg) * len(activation) 21 | 22 | # plt.suptitle('DQN VS MULTI') 23 | 24 | for i, g in enumerate(games): 25 | j = 1 26 | for act in activation: 27 | for r in reg: 28 | s = r + '-' + act 29 | plt.subplot(n_settings, n_games, i * n_settings + j) 30 | plt.title(g, fontsize=20) 31 | 32 | single = np.load('dqn/' + s + '/scores.npy')[:, i] 33 | single_mean, single_err = get_mean_and_confidence(single) 34 | 35 | multi = np.load('multidqn/' + s + '/scores.npy')[:, i] 36 | multi_mean, multi_err = get_mean_and_confidence(multi) 37 | 38 | plt.plot(single_mean, linewidth=3) 39 | plt.fill_between(np.arange(51), single_mean - single_err, single_mean + single_err, alpha=.5) 40 | 41 | plt.plot(multi_mean, linewidth=3) 42 | plt.fill_between(np.arange(51), multi_mean - multi_err, multi_mean + multi_err, alpha=.5) 43 | 44 | plt.xlabel('#Epochs', fontsize=20) 45 | 46 | plt.xticks([0, 25, 50], fontsize=20) 47 | plt.yticks(fontsize=20) 48 | 49 | if i == 0: 50 | plt.ylabel('Performance', fontsize=20) 51 | 52 | plt.grid() 53 | 54 | # plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False) 55 | 56 | j += 1 57 | 58 | plt.legend(['DQN', 'MULTI'], fontsize=20, loc='lower right') 59 | 60 | plt.show() 61 | 62 | -------------------------------------------------------------------------------- /results/dqn/transfer-plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.stats as st 3 | from matplotlib import pyplot as 
plt 4 | 5 | 6 | def get_mean_and_confidence(data): 7 | mean = np.mean(data, axis=0) 8 | se = st.sem(data, axis=0) 9 | n = len(data) 10 | 11 | interval, _ = st.t.interval(0.95, n-1, scale=se) 12 | 13 | return mean, interval 14 | 15 | games = ['acro-noreg'] 16 | game_ids = [1] 17 | reg = ['noreg'] 18 | activation = ['sigmoid'] 19 | n_games = len(games) 20 | unfreezes = [0, 10, 51] 21 | 22 | legend_items = list() 23 | 24 | fig, ax = plt.subplots(n_games, 1) 25 | # for i, g in enumerate(games): 26 | # ax.set_title(g) 27 | # ax.grid() 28 | 29 | for act in activation: 30 | for r in reg: 31 | legend_items.append('No initialization') 32 | path = 'dqn/' + r + '-' + act + '/' 33 | a = np.load(path + 'scores.npy') 34 | a_mean, a_err = get_mean_and_confidence(a) 35 | for i, idx in enumerate(game_ids): 36 | ax.plot(a_mean[idx], linewidth=3) 37 | ax.fill_between(np.arange(51), a_mean[idx] - a_err[idx], a_mean[idx] + a_err[idx], alpha=.5) 38 | 39 | for u in unfreezes: 40 | for i, g in zip(game_ids, games): 41 | if u == 51: 42 | legend_items.append('No unfreeze') 43 | else: 44 | legend_items.append('Unfreeze-' + str(u)) 45 | file_path = 'transfer' + '/' + g + '/unfreeze' + str(u) + '-' + r + '-' + act + '.npy' 46 | 47 | a = np.load(file_path) 48 | a_mean, a_err = get_mean_and_confidence(a) 49 | ax.plot(a_mean[0], linewidth=3) 50 | ax.fill_between(np.arange(51), a_mean[0] - a_err[0], a_mean[0] + a_err[0], alpha=.5) 51 | 52 | plt.xlabel('#Epochs', fontsize=35) 53 | plt.ylabel('Performance', fontsize=35) 54 | plt.xticks([0, 25, 50], fontsize=35) 55 | plt.yticks(fontsize=35) 56 | plt.title('Acrobot', fontsize=35) 57 | 58 | plt.grid() 59 | 60 | plt.legend(legend_items, fontsize=25, loc='lower right') 61 | 62 | plt.show() 63 | 64 | -------------------------------------------------------------------------------- /results/dqn/transfer/acro-noreg/noreg-cart_mc_coh_pend.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/dqn/transfer/acro-noreg/noreg-cart_mc_coh_pend.pkl -------------------------------------------------------------------------------- /results/dqn/transfer/acro-noreg/unfreeze0-noreg-sigmoid.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/dqn/transfer/acro-noreg/unfreeze0-noreg-sigmoid.npy -------------------------------------------------------------------------------- /results/dqn/transfer/acro-noreg/unfreeze10-noreg-sigmoid.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/dqn/transfer/acro-noreg/unfreeze10-noreg-sigmoid.npy -------------------------------------------------------------------------------- /results/dqn/transfer/acro-noreg/unfreeze51-noreg-sigmoid.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/dqn/transfer/acro-noreg/unfreeze51-noreg-sigmoid.npy -------------------------------------------------------------------------------- /results/fqi/0.800_4.000/avi_diff.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/fqi/0.800_4.000/avi_diff.npy -------------------------------------------------------------------------------- /results/fqi/0.800_4.000/scores.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/fqi/0.800_4.000/scores.npy -------------------------------------------------------------------------------- /results/fqi/1.000_4.000/avi_diff.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/fqi/1.000_4.000/avi_diff.npy -------------------------------------------------------------------------------- /results/fqi/1.000_4.000/scores.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/fqi/1.000_4.000/scores.npy -------------------------------------------------------------------------------- /results/fqi/1.000_4.0000.800_4.000/avi_diff.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/fqi/1.000_4.0000.800_4.000/avi_diff.npy -------------------------------------------------------------------------------- /results/fqi/1.000_4.0000.800_4.000/scores.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/fqi/1.000_4.0000.800_4.000/scores.npy -------------------------------------------------------------------------------- /results/fqi/1.000_4.0000.800_4.0001.000_4.5001.200_4.500/avi_diff.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/fqi/1.000_4.0000.800_4.0001.000_4.5001.200_4.500/avi_diff.npy -------------------------------------------------------------------------------- /results/fqi/1.000_4.0000.800_4.0001.000_4.5001.200_4.500/scores.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/fqi/1.000_4.0000.800_4.0001.000_4.5001.200_4.500/scores.npy -------------------------------------------------------------------------------- /results/fqi/1.000_4.0000.800_4.0001.000_4.5001.200_4.5001.000_4.1251.000_4.2501.000_4.3750.850_4.000/avi_diff.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/fqi/1.000_4.0000.800_4.0001.000_4.5001.200_4.5001.000_4.1251.000_4.2501.000_4.3750.850_4.000/avi_diff.npy -------------------------------------------------------------------------------- /results/fqi/1.000_4.0000.800_4.0001.000_4.5001.200_4.5001.000_4.1251.000_4.2501.000_4.3750.850_4.000/scores.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/fqi/1.000_4.0000.800_4.0001.000_4.5001.200_4.5001.000_4.1251.000_4.2501.000_4.3750.850_4.000/scores.npy -------------------------------------------------------------------------------- /results/fqi/1.000_4.500/avi_diff.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/fqi/1.000_4.500/avi_diff.npy -------------------------------------------------------------------------------- /results/fqi/1.000_4.500/scores.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/fqi/1.000_4.500/scores.npy -------------------------------------------------------------------------------- /results/fqi/1.200_4.500/avi_diff.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/fqi/1.200_4.500/avi_diff.npy -------------------------------------------------------------------------------- /results/fqi/1.200_4.500/scores.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/fqi/1.200_4.500/scores.npy -------------------------------------------------------------------------------- /results/fqi/avi_scores_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.stats as st 3 | from matplotlib import pyplot as plt 4 | 5 | 6 | def get_mean_and_confidence(data): 7 | mean = np.mean(data, axis=0) 8 | se = st.sem(data, axis=0) 9 | n = len(data) 10 | 11 | interval, _ = st.t.interval(0.95, n - 1, scale=se) 12 | 13 | return mean, interval 14 | 15 | games = ['1.000_4.000', '0.800_4.000', '1.000_4.500', '1.200_4.500'] 16 | 17 | plt.subplot(1, 2, 1) 18 | a = list() 19 | for g in games: 20 | a.append(np.load(g + '/avi_diff.npy')) 21 | a = np.array(a) 22 | 23 | fs = 25 24 | 25 | a_mean, a_err = get_mean_and_confidence(a.mean(0)) 26 | plt.ylabel(r'$\Vert Q^* - Q^{\pi_K}\Vert$', fontsize=fs) 27 | plt.xlabel('# Iterations', fontsize=fs) 28 | plt.xticks([0, 25, 50], fontsize=fs) 29 | plt.yticks(fontsize=fs) 30 | plt.plot(a_mean, linewidth=3) 31 | plt.fill_between(np.arange(a_mean.shape[-1]), a_mean - a_err, a_mean + a_err, alpha=.5) 32 | 33 | a = np.load(''.join(games) + '/avi_diff.npy') 34 | 35 | a_mean, a_err = get_mean_and_confidence(a) 36 | plt.plot(a_mean, linewidth=3) 37 | plt.fill_between(np.arange(a_mean.shape[-1]), a_mean - a_err, a_mean + a_err, alpha=.5) 38 | plt.grid() 39 | plt.legend(['FQI', 'MULTI'], fontsize=fs) 40 | 41 | plt.subplot(1, 2, 2) 42 | a = list() 43 | for g in games: 44 | a.append(np.load(g + '/scores.npy')) 45 | a = np.array(a) 46 | 47 | a_mean, a_err = get_mean_and_confidence(a.mean(0)) 48 | plt.ylabel('Performance', fontsize=fs) 49 | plt.xlabel('# Iterations', fontsize=fs) 50 | plt.xticks([0, 25, 50], fontsize=fs) 51 | plt.yticks(fontsize=fs) 52 | plt.plot(a_mean, linewidth=3) 53 | plt.fill_between(np.arange(a_mean.shape[-1]), a_mean - a_err, a_mean + a_err, alpha=.5) 54 | 55 | a = np.load(''.join(games) + '/scores.npy') 56 | 57 | a_mean, a_err = get_mean_and_confidence(a) 58 | 
plt.fill_between(np.arange(a_mean.shape[-1]), a_mean - a_err, a_mean + a_err, alpha=.5) 59 | plt.plot(a_mean, linewidth=3) 60 | plt.grid() 61 | 62 | plt.show() 63 | 64 | -------------------------------------------------------------------------------- /results/fqi/multi_avi_scores_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.stats as st 3 | from matplotlib import pyplot as plt 4 | from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes 5 | from mpl_toolkits.axes_grid1.inset_locator import mark_inset 6 | 7 | 8 | def get_mean_and_confidence(data): 9 | mean = np.mean(data, axis=0) 10 | se = st.sem(data, axis=0) 11 | n = len(data) 12 | 13 | interval, _ = st.t.interval(0.95, n - 1, scale=se) 14 | 15 | return mean, interval 16 | 17 | games = ['1.000_4.000', '0.800_4.000', '1.000_4.500', '1.200_4.500', '1.000_4.125', '1.000_4.250', '1.000_4.375', '0.850_4.000'] 18 | n_tasks = [1, 2, 4, 8] 19 | style = ['-', '-.', '--', ':'] 20 | 21 | fig, ax = plt.subplots() 22 | for j, i in enumerate(n_tasks): 23 | a = np.load(''.join(games[:i]) + '/avi_diff.npy') 24 | 25 | a_mean, a_err = get_mean_and_confidence(a) 26 | ax.plot(a_mean, linewidth=3, linestyle=style[j]) 27 | ax.fill_between(np.arange(a_mean.shape[-1]), a_mean - a_err, a_mean + a_err, alpha=.5) 28 | 29 | fs = 25 30 | 31 | plt.xticks([0, 25, 50], fontsize=fs) 32 | plt.yticks(fontsize=fs) 33 | plt.ylabel(r'$\Vert Q^* - Q^{\pi_K}\Vert$', fontsize=fs) 34 | plt.xlabel('# Iterations', fontsize=fs) 35 | plt.grid() 36 | plt.legend(n_tasks, fontsize=fs) 37 | 38 | axins = zoomed_inset_axes(ax, 2, loc=9) # zoom-factor: 2.5, location: upper-left 39 | mark_inset(ax, axins, loc1=4, loc2=3, fc="none", ec="0.5") 40 | for j, i in enumerate(n_tasks): 41 | a = np.load(''.join(games[:i]) + '/avi_diff.npy') 42 | 43 | a_mean, a_err = get_mean_and_confidence(a) 44 | axins.plot(a_mean, linewidth=3, linestyle=style[j]) 45 | axins.fill_between(np.arange(a_mean.shape[-1]), a_mean - a_err, a_mean + a_err, alpha=.5) 46 | x1, x2, y1, y2 = 40, 49, .155, .225 # specify the limits 47 | axins.set_xlim(x1, x2) # apply the x-limits 48 | axins.set_ylim(y1, y2) # apply the y-limits 49 | axins.grid() 50 | axins.set_xticks([]) 51 | axins.set_yticks([]) 52 | 53 | plt.show() 54 | 55 | --------------------------------------------------------------------------------
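The concatenated directory names under results/fqi/ (e.g. 1.000_4.0000.800_4.000) are the per-task identifiers joined together: run_coh.py writes its logs to './logs/%s/' % ''.join(names), and multi_avi_scores_plot.py above rebuilds the same names with ''.join(games[:i]). A tiny sketch of that mapping, using only the task list already defined in the script:

games = ['1.000_4.000', '0.800_4.000', '1.000_4.500', '1.200_4.500',
         '1.000_4.125', '1.000_4.250', '1.000_4.375', '0.850_4.000']

# The n-task experiment reads its curves from the folder named by the first
# n identifiers concatenated together.
for n in (1, 2, 4, 8):
    print(n, '->', ''.join(games[:n]) + '/avi_diff.npy')

# 1 -> 1.000_4.000/avi_diff.npy
# 2 -> 1.000_4.0000.800_4.000/avi_diff.npy
# 4 -> 1.000_4.0000.800_4.0001.000_4.5001.200_4.500/avi_diff.npy
# 8 -> 1.000_4.0000.800_4.0001.000_4.5001.200_4.5001.000_4.1251.000_4.2501.000_4.3750.850_4.000/avi_diff.npy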