├── .gitignore ├── README.md ├── core.py ├── ddpg ├── ddpg.py ├── losses.py ├── networks.py ├── run_bullet.py └── run_mujoco.py ├── dqn ├── dqn.py ├── losses.py ├── networks.py ├── run_atari.py └── run_gym.py ├── fqi ├── car_on_hill.py ├── dataset_0.800_4.000.pkl ├── dataset_0.850_4.000.pkl ├── dataset_0.900_4.000.pkl ├── dataset_0.950_4.000.pkl ├── dataset_1.000_4.000.pkl ├── dataset_1.000_4.125.pkl ├── dataset_1.000_4.250.pkl ├── dataset_1.000_4.375.pkl ├── dataset_1.000_4.500.pkl ├── dataset_1.050_4.500.pkl ├── dataset_1.100_4.500.pkl ├── dataset_1.150_4.500.pkl ├── dataset_1.200_4.375.pkl ├── dataset_1.200_4.500.pkl ├── dataset_1.200_4.625.pkl ├── dataset_1.200_4.750.pkl ├── fqi.py ├── losses.py ├── networks.py ├── run_coh.py ├── solver.py ├── test_q_0.800_4.000.npy ├── test_q_0.850_4.000.npy ├── test_q_0.900_4.000.npy ├── test_q_0.950_4.000.npy ├── test_q_1.000_4.000.npy ├── test_q_1.000_4.125.npy ├── test_q_1.000_4.250.npy ├── test_q_1.000_4.375.npy ├── test_q_1.000_4.500.npy ├── test_q_1.050_4.500.npy ├── test_q_1.100_4.500.npy ├── test_q_1.150_4.500.npy ├── test_q_1.200_4.375.npy ├── test_q_1.200_4.500.npy ├── test_q_1.200_4.625.npy └── test_q_1.200_4.750.npy ├── policy.py ├── replay_memory.py └── results ├── ddpg ├── multi_pendulum │ ├── noreg-sigmoid │ │ └── scores.npy │ └── transfer │ │ └── noreg │ │ ├── unfreeze0-noreg-sigmoid.npy │ │ ├── unfreeze101-noreg-sigmoid.npy │ │ └── w.pkl ├── multi_walker │ ├── noreg-sigmoid │ │ └── scores.npy │ └── transfer │ │ └── noreg │ │ ├── unfreeze0-noreg-sigmoid.npy │ │ ├── unfreeze101-noreg-sigmoid.npy │ │ └── w.pkl ├── scores-plot.py ├── single │ └── noreg-sigmoid │ │ ├── InvertedDoublePendulumBulletEnv-v0.npy │ │ ├── InvertedPendulumBulletEnv-v0.npy │ │ ├── InvertedPendulumSwingupBulletEnv-v0.npy │ │ ├── chee_run.npy │ │ ├── hop_stand.npy │ │ └── walk_walk.npy └── transfer-plot.py ├── dqn ├── dqn │ └── noreg-sigmoid │ │ └── scores.npy ├── multidqn │ └── noreg-sigmoid │ │ └── scores.npy ├── scores_plot.py ├── transfer-plot.py └── transfer │ └── acro-noreg │ ├── noreg-cart_mc_coh_pend.pkl │ ├── unfreeze0-noreg-sigmoid.npy │ ├── unfreeze10-noreg-sigmoid.npy │ └── unfreeze51-noreg-sigmoid.npy └── fqi ├── 0.800_4.000 ├── avi_diff.npy └── scores.npy ├── 1.000_4.000 ├── avi_diff.npy └── scores.npy ├── 1.000_4.0000.800_4.000 ├── avi_diff.npy └── scores.npy ├── 1.000_4.0000.800_4.0001.000_4.5001.200_4.500 ├── avi_diff.npy └── scores.npy ├── 1.000_4.0000.800_4.0001.000_4.5001.200_4.5001.000_4.1251.000_4.2501.000_4.3750.850_4.000 ├── avi_diff.npy └── scores.npy ├── 1.000_4.500 ├── avi_diff.npy └── scores.npy ├── 1.200_4.500 ├── avi_diff.npy └── scores.npy ├── avi_scores_plot.py └── multi_avi_scores_plot.py /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | dist/ 3 | mushroom.egg-info/ 4 | .idea/ 5 | *.pyc 6 | *.xml 7 | logs/ 8 | *.h5 9 | */logs 10 | *_raw.npy 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Code of the experimental evaluation of ICLR2020 paper: "Sharing Knowledge in Multi-Task Deep Reinforcement Learning" (https://openreview.net/forum?id=rkgpv2VFvr). 
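The experiments are launched through the `run_*.py` scripts in `dqn/`, `ddpg/` and `fqi/`. The DQN and DDPG scripts build one environment per task and a single multi-head agent, and train them with the shared multi-task loop in `core.py`; evaluation returns are saved as `.npy` arrays and collected under `results/`, where the `*plot.py` scripts read them. A quick way to inspect one of the provided score files (the array layout is assumed here to be runs × tasks × epochs, following how the run scripts assemble their `scores` lists):

```python
import numpy as np

# Load the evaluation returns of the multi-task DQN experiment shipped in results/.
scores = np.load('results/dqn/multidqn/noreg-sigmoid/scores.npy')

print(scores.shape)          # assumed layout: (n_runs, n_tasks, n_epochs)
print(scores.mean(axis=0))   # average return per task and epoch, over runs
```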
2 | -------------------------------------------------------------------------------- /core.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | 3 | 4 | class Core(object): 5 | def __init__(self, agent, mdp, callbacks=None): 6 | self.agent = agent 7 | self.mdp = mdp 8 | self._n_mdp = len(self.mdp) 9 | self.callbacks = callbacks if callbacks is not None else list() 10 | 11 | self._state = [None for _ in range(self._n_mdp)] 12 | 13 | self._total_steps_counter = 0 14 | self._current_steps_counter = 0 15 | self._episode_steps = [None for _ in range(self._n_mdp)] 16 | self._n_steps_per_fit = None 17 | 18 | def learn(self, n_steps=None, n_steps_per_fit=None, render=False, 19 | quiet=False): 20 | self._n_steps_per_fit = n_steps_per_fit 21 | 22 | fit_condition = \ 23 | lambda: self._current_steps_counter >= self._n_steps_per_fit 24 | 25 | self._run(n_steps, fit_condition, render, quiet) 26 | 27 | def evaluate(self, n_steps=None, render=False, 28 | quiet=False): 29 | fit_condition = lambda: False 30 | 31 | return self._run(n_steps, fit_condition, render, quiet) 32 | 33 | def _run(self, n_steps, fit_condition, render, quiet): 34 | move_condition = lambda: self._total_steps_counter < n_steps 35 | 36 | steps_progress_bar = tqdm(total=n_steps, 37 | dynamic_ncols=True, disable=quiet, 38 | leave=False) 39 | 40 | return self._run_impl(move_condition, fit_condition, steps_progress_bar, 41 | render) 42 | 43 | def _run_impl(self, move_condition, fit_condition, steps_progress_bar, 44 | render): 45 | self._total_steps_counter = 0 46 | self._current_steps_counter = 0 47 | 48 | dataset = list() 49 | last = [True] * self._n_mdp 50 | while move_condition(): 51 | for i in range(self._n_mdp): 52 | if last[i]: 53 | self.reset(i) 54 | 55 | sample = self._step(i, render) 56 | dataset.append(sample) 57 | 58 | last[i] = sample[-1] 59 | 60 | self._total_steps_counter += 1 61 | self._current_steps_counter += 1 62 | steps_progress_bar.update(1) 63 | 64 | if fit_condition(): 65 | self.agent.fit(dataset) 66 | self._current_steps_counter = 0 67 | 68 | for c in self.callbacks: 69 | callback_pars = dict(dataset=dataset) 70 | c(**callback_pars) 71 | 72 | dataset = list() 73 | 74 | self.agent.stop() 75 | for i in range(self._n_mdp): 76 | self.mdp[i].stop() 77 | 78 | return dataset 79 | 80 | def _step(self, i, render): 81 | action = self.agent.draw_action([i, self._state[i]]) 82 | next_state, reward, absorbing, _ = self.mdp[i].step(action) 83 | 84 | self._episode_steps[i] += 1 85 | 86 | if render: 87 | self.mdp[i].render() 88 | 89 | last = not( 90 | self._episode_steps[i] < self.mdp[i].info.horizon and not absorbing) 91 | 92 | state = self._state[i] 93 | self._state[i] = next_state.copy() 94 | 95 | return [i, state], action, reward, [i, next_state], absorbing, last 96 | 97 | def reset(self, i): 98 | self._state[i] = self.mdp[i].reset().copy() 99 | self.agent.episode_start() 100 | self.agent.next_action = None 101 | self._episode_steps[i] = 0 102 | -------------------------------------------------------------------------------- /ddpg/ddpg.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | import numpy as np 4 | 5 | import torch.nn as nn 6 | from mushroom_rl.algorithms import Agent 7 | from mushroom_rl.approximators import Regressor 8 | 9 | from replay_memory import ReplayMemory 10 | 11 | 12 | class ActorLoss(nn.Module): 13 | def __init__(self, critic): 14 | super().__init__() 15 | 16 | self._critic = 
critic 17 | 18 | def forward(self, arg, state, idxs): 19 | action = arg 20 | 21 | q = self._critic.model.network(state, action, idx=idxs) 22 | 23 | return -q.mean() 24 | 25 | 26 | class DDPG(Agent): 27 | def __init__(self, actor_approximator, critic_approximator, policy_class, 28 | mdp_info, batch_size, initial_replay_size, max_replay_size, 29 | tau, actor_params, critic_params, policy_params, 30 | n_actions_per_head, history_length=1, n_input_per_mdp=None, 31 | n_games=1, dtype=np.uint8): 32 | self._batch_size = batch_size 33 | self._n_games = n_games 34 | if n_input_per_mdp is None: 35 | self._n_input_per_mdp = [mdp_info.observation_space.shape 36 | for _ in range(self._n_games)] 37 | else: 38 | self._n_input_per_mdp = n_input_per_mdp 39 | self._n_actions_per_head = n_actions_per_head 40 | self._max_actions = max(n_actions_per_head)[0] 41 | self._history_length = history_length 42 | self._tau = tau 43 | 44 | self._replay_memory = [ 45 | ReplayMemory(initial_replay_size, 46 | max_replay_size) for _ in range(self._n_games) 47 | ] 48 | 49 | self._n_updates = 0 50 | 51 | target_critic_params = deepcopy(critic_params) 52 | self._critic_approximator = Regressor(critic_approximator, 53 | **critic_params) 54 | self._target_critic_approximator = Regressor(critic_approximator, 55 | **target_critic_params) 56 | 57 | if 'loss' not in actor_params: 58 | actor_params['loss'] = ActorLoss(self._critic_approximator) 59 | 60 | target_actor_params = deepcopy(actor_params) 61 | self._actor_approximator = Regressor(actor_approximator, 62 | n_fit_targets=2, **actor_params) 63 | self._target_actor_approximator = Regressor(actor_approximator, 64 | n_fit_targets=2, 65 | **target_actor_params) 66 | 67 | self._target_actor_approximator.model.set_weights( 68 | self._actor_approximator.model.get_weights()) 69 | self._target_critic_approximator.model.set_weights( 70 | self._critic_approximator.model.get_weights()) 71 | 72 | policy = policy_class(self._actor_approximator, **policy_params) 73 | 74 | super().__init__(mdp_info, policy) 75 | 76 | n_samples = self._batch_size * self._n_games 77 | self._state_idxs = np.zeros(n_samples, dtype=np.int) 78 | self._state = np.zeros( 79 | ((n_samples, 80 | self._history_length) + self.mdp_info.observation_space.shape), 81 | dtype=dtype 82 | ).squeeze() 83 | self._action = np.zeros((n_samples, self._max_actions)) 84 | self._reward = np.zeros(n_samples) 85 | self._next_state_idxs = np.zeros(n_samples, dtype=np.int) 86 | self._next_state = np.zeros( 87 | ((n_samples, 88 | self._history_length) + self.mdp_info.observation_space.shape), 89 | dtype=dtype 90 | ).squeeze() 91 | self._absorbing = np.zeros(n_samples) 92 | 93 | def fit(self, dataset): 94 | s = np.array([d[0][0] for d in dataset]).ravel() 95 | games = np.unique(s) 96 | for g in games: 97 | idxs = np.argwhere(s == g).ravel() 98 | d = list() 99 | for idx in idxs: 100 | d.append(dataset[idx]) 101 | 102 | self._replay_memory[g].add(d) 103 | 104 | fit_condition = np.all([rm.initialized for rm in self._replay_memory]) 105 | 106 | if fit_condition: 107 | for i in range(len(self._replay_memory)): 108 | game_state, game_action, game_reward, game_next_state,\ 109 | game_absorbing, _ = self._replay_memory[i].get( 110 | self._batch_size) 111 | 112 | start = self._batch_size * i 113 | stop = start + self._batch_size 114 | 115 | self._state_idxs[start:stop] = np.ones(self._batch_size) * i 116 | self._state[start:stop, :self._n_input_per_mdp[i][0]] = game_state 117 | self._action[start:stop, :self._n_actions_per_head[i][0]] = game_action 
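                # Each task's minibatch fills a fixed slice [start:stop] of the stacked
                # batch; observations/actions are written only into the leading
                # n_input/n_actions columns, so the remaining columns keep their zero
                # padding and all tasks are fitted in a single call with per-sample
                # head indices.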
118 | self._reward[start:stop] = game_reward 119 | self._next_state_idxs[start:stop] = np.ones(self._batch_size) * i 120 | self._next_state[start:stop, :self._n_input_per_mdp[i][0]] = game_next_state 121 | self._absorbing[start:stop] = game_absorbing 122 | 123 | q_next = self._next_q() 124 | q = self._reward + q_next 125 | 126 | self._critic_approximator.fit(self._state, self._action, q, 127 | idx=self._state_idxs) 128 | self._actor_approximator.fit(self._state, self._state, 129 | self._state_idxs, 130 | idx=self._state_idxs) 131 | 132 | self._n_updates += 1 133 | 134 | self._update_target() 135 | 136 | def get_shared_weights(self): 137 | cw = self._critic_approximator.model.network.get_shared_weights() 138 | aw = self._actor_approximator.model.network.get_shared_weights() 139 | 140 | return [cw, aw] 141 | 142 | def set_shared_weights(self, weights): 143 | self._critic_approximator.model.network.set_shared_weights(weights[0]) 144 | self._actor_approximator.model.network.set_shared_weights(weights[1]) 145 | 146 | def freeze_shared_weights(self): 147 | self._critic_approximator.model.network.freeze_shared_weights() 148 | self._actor_approximator.model.network.freeze_shared_weights() 149 | 150 | def unfreeze_shared_weights(self): 151 | self._critic_approximator.model.network.unfreeze_shared_weights() 152 | self._actor_approximator.model.network.unfreeze_shared_weights() 153 | 154 | def _update_target(self): 155 | """ 156 | Update the target networks. 157 | 158 | """ 159 | critic_weights = self._tau * self._critic_approximator.model.get_weights() 160 | critic_weights += (1 - self._tau) * self._target_critic_approximator.get_weights() 161 | self._target_critic_approximator.set_weights(critic_weights) 162 | 163 | actor_weights = self._tau * self._actor_approximator.model.get_weights() 164 | actor_weights += (1 - self._tau) * self._target_actor_approximator.get_weights() 165 | self._target_actor_approximator.set_weights(actor_weights) 166 | 167 | def _next_q(self): 168 | a = self._target_actor_approximator(self._next_state, 169 | idx=self._next_state_idxs) 170 | q = self._target_critic_approximator(self._next_state, a, 171 | idx=self._next_state_idxs).ravel() 172 | 173 | out_q = np.zeros(self._batch_size * self._n_games) 174 | for i in range(self._n_games): 175 | start = self._batch_size * i 176 | stop = start + self._batch_size 177 | 178 | out_q[start:stop] = q[start:stop] * self.mdp_info.gamma[i] 179 | if np.any(self._absorbing[start:stop]): 180 | out_q[start:stop] = out_q[start:stop] * ( 181 | 1 - self._absorbing[start:stop] 182 | ) 183 | 184 | return out_q 185 | -------------------------------------------------------------------------------- /ddpg/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | class LossFunction(object): 6 | def __init__(self, n_games, batch_size, eval_frequency): 7 | self._n_games = n_games 8 | self._batch_size = batch_size 9 | self._eval_frequency = eval_frequency 10 | 11 | self._losses = list() 12 | self._counter = 0 13 | 14 | def get_losses(self): 15 | return self._losses 16 | 17 | def __call__(self, yhat, y): 18 | loss = F.smooth_l1_loss(yhat, y, reduce=False) 19 | 20 | if self._need_log(): 21 | temp_losses = list() 22 | 23 | for i in range(self._n_games): 24 | start = i * self._batch_size 25 | stop = start + self._batch_size 26 | temp_losses.append(torch.mean(loss[start:stop]).item()) 27 | 28 | self._losses.append(temp_losses) 29 | 30 | loss = torch.mean(loss) 
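        # The per-task means appended to self._losses above are what the run scripts
        # save as critic_loss-exp-*.npy; the scalar mean is the value actually
        # returned for the gradient step.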
31 | 32 | return loss 33 | 34 | def _need_log(self): 35 | self._counter += 1 36 | if self._counter >= self._eval_frequency: 37 | self._counter = 0 38 | return True 39 | else: 40 | return False 41 | -------------------------------------------------------------------------------- /ddpg/networks.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | 7 | class ActorNetwork(nn.Module): 8 | def __init__(self, input_shape, _, n_actions_per_head, n_hidden_1, 9 | n_hidden_2, use_cuda, features, dropout): 10 | super().__init__() 11 | 12 | self._n_input = input_shape 13 | self._n_games = len(n_actions_per_head) 14 | self._max_actions = max(n_actions_per_head)[0] 15 | self._use_cuda = use_cuda 16 | self._features = features 17 | self._n_shared = 2 18 | 19 | self._h1 = nn.ModuleList( 20 | [nn.Linear(self._n_input[i][0], n_hidden_1) for i in range( 21 | len(input_shape))] 22 | ) 23 | self._h2 = nn.Linear(n_hidden_1, n_hidden_2) 24 | self._h3 = nn.ModuleList( 25 | [nn.Linear(n_hidden_2, self._max_actions) for _ in range( 26 | self._n_games)] 27 | ) 28 | 29 | fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self._h2.weight) 30 | nn.init.uniform_(self._h2.weight, a=-1 / np.sqrt(fan_in), 31 | b=1 / np.sqrt(fan_in)) 32 | for i in range(self._n_games): 33 | fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self._h1[i].weight) 34 | nn.init.uniform_(self._h1[i].weight, a=-1 / np.sqrt(fan_in), 35 | b=1 / np.sqrt(fan_in)) 36 | nn.init.uniform_(self._h3[i].weight, a=-3e-3, b=3e-3) 37 | nn.init.uniform_(self._h3[i].bias, a=-3e-3, b=3e-3) 38 | 39 | def forward(self, state, idx=None, get_features=False): 40 | state = state.float() 41 | 42 | h1 = list() 43 | for i in np.unique(idx): 44 | idxs = np.argwhere(idx == i).ravel() 45 | h1.append(F.relu(self._h1[i](state[idxs, :self._n_input[i][0]]))) 46 | cat_h1 = torch.cat(h1) 47 | 48 | h_f = F.relu(self._h2(cat_h1)) 49 | 50 | a = [torch.tanh(self._h3[i](h_f)) for i in range(self._n_games)] 51 | a = torch.stack(a, dim=1) 52 | 53 | if idx is not None: 54 | idx = torch.from_numpy(idx) 55 | if self._use_cuda: 56 | idx = idx.cuda() 57 | a_idx = a.gather(1, idx.view(-1, 1).repeat( 58 | 1, self._max_actions).unsqueeze(1) 59 | ) 60 | 61 | a = torch.squeeze(a_idx, 1) 62 | 63 | if get_features: 64 | return a, h_f 65 | else: 66 | return a 67 | 68 | def get_shared_weights(self): 69 | p2 = list() 70 | 71 | for p in self._h2.parameters(): 72 | p2.append(p.data.detach().cpu().numpy()) 73 | 74 | return p2 75 | 76 | def set_shared_weights(self, weights): 77 | w2 = weights 78 | 79 | for p, w in zip(self._h2.parameters(), w2): 80 | w_tensor = torch.from_numpy(w).type(p.data.dtype) 81 | if self._use_cuda: 82 | w_tensor = w_tensor.cuda() 83 | p.data = w_tensor 84 | 85 | def freeze_shared_weights(self): 86 | for p in self._h2.parameters(): 87 | p.requires_grad = False 88 | 89 | def unfreeze_shared_weights(self): 90 | for p in self._h2.parameters(): 91 | p.requires_grad = True 92 | 93 | 94 | class CriticNetwork(nn.Module): 95 | def __init__(self, input_shape, _, n_actions_per_head, n_hidden_1, 96 | n_hidden_2, use_cuda, features, dropout): 97 | super().__init__() 98 | 99 | self._n_input = input_shape 100 | self._n_games = len(n_actions_per_head) 101 | self._max_actions = max(n_actions_per_head)[0] 102 | self._n_actions_per_head = n_actions_per_head 103 | self._use_cuda = use_cuda 104 | self._features = features 105 | self._n_shared = 2 106 | 107 | self._h1 
= nn.ModuleList( 108 | [nn.Linear(self._n_input[i][0], n_hidden_1) for i in range( 109 | len(input_shape))] 110 | ) 111 | self._h2_s = nn.Linear(n_hidden_1, n_hidden_2) 112 | self._h3 = nn.ModuleList( 113 | [nn.Linear(n_hidden_2, 1) for _ in range( 114 | self._n_games)] 115 | ) 116 | self._h2_a = nn.ModuleList( 117 | [nn.Linear(n_actions_per_head[i][0], n_hidden_2, bias=False) for i in range( 118 | len(n_actions_per_head))] 119 | ) 120 | 121 | fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self._h2_s.weight) 122 | nn.init.uniform_(self._h2_s.weight, a=-1 / np.sqrt(fan_in), 123 | b=1 / np.sqrt(fan_in)) 124 | for i in range(self._n_games): 125 | fan_in, _ = nn.init._calculate_fan_in_and_fan_out( 126 | self._h2_a[i].weight) 127 | nn.init.uniform_(self._h2_a[i].weight, a=-1 / np.sqrt(fan_in), 128 | b=1 / np.sqrt(fan_in)) 129 | fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self._h1[i].weight) 130 | nn.init.uniform_(self._h1[i].weight, a=-1 / np.sqrt(fan_in), 131 | b=1 / np.sqrt(fan_in)) 132 | nn.init.uniform_(self._h3[i].weight, a=-3e-3, b=3e-3) 133 | nn.init.uniform_(self._h3[i].bias, a=-3e-3, b=3e-3) 134 | 135 | def forward(self, state, action, idx=None): 136 | state = state.float() 137 | action = action.float() 138 | if not isinstance(idx, np.ndarray): 139 | idx = idx.cpu().numpy().astype(np.int) 140 | 141 | h2 = list() 142 | for i in np.unique(idx): 143 | idxs = np.argwhere(idx == i).ravel() 144 | h1 = F.relu(self._h1[i](state[idxs, :self._n_input[i][0]])) 145 | a = action[idxs, :self._n_actions_per_head[i][0]] 146 | h2.append(self._h2_s(h1) + self._h2_a[i](a)) 147 | 148 | cat_h2 = torch.cat(h2) 149 | 150 | if self._features == 'relu': 151 | h_f = F.relu(cat_h2) 152 | elif self._features == 'sigmoid': 153 | h_f = torch.sigmoid(cat_h2) 154 | else: 155 | raise ValueError 156 | 157 | q = [self._h3[i](h_f) for i in range(self._n_games)] 158 | q = torch.stack(q, dim=1).squeeze(-1) 159 | 160 | if idx is not None: 161 | idx = torch.from_numpy(idx) 162 | if self._use_cuda: 163 | idx = idx.cuda() 164 | 165 | q_idx = q.gather(1, idx.unsqueeze(-1)) 166 | q = torch.squeeze(q_idx, 1) 167 | 168 | return q 169 | 170 | def get_shared_weights(self): 171 | p2 = list() 172 | 173 | for p in self._h2_s.parameters(): 174 | p2.append(p.data.detach().cpu().numpy()) 175 | 176 | return p2 177 | 178 | def set_shared_weights(self, weights): 179 | w2 = weights 180 | 181 | for p, w in zip(self._h2_s.parameters(), w2): 182 | w_tensor = torch.from_numpy(w).type(p.data.dtype) 183 | if self._use_cuda: 184 | w_tensor = w_tensor.cuda() 185 | p.data = w_tensor 186 | 187 | def freeze_shared_weights(self): 188 | for p in self._h2_s.parameters(): 189 | p.requires_grad = False 190 | 191 | def unfreeze_shared_weights(self): 192 | for p in self._h2_s.parameters(): 193 | p.requires_grad = True 194 | -------------------------------------------------------------------------------- /ddpg/run_bullet.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import pathlib 4 | import sys 5 | 6 | from joblib import delayed, Parallel 7 | import numpy as np 8 | import torch.optim as optim 9 | 10 | import pickle 11 | 12 | sys.path.append('..') 13 | 14 | from mushroom_rl.approximators.parametric import TorchApproximator 15 | from mushroom_rl.environments import * 16 | from mushroom_rl.utils.dataset import compute_J 17 | 18 | from core import Core 19 | from ddpg import DDPG 20 | from policy import OrnsteinUhlenbeckPolicy 21 | 22 | from networks import ActorNetwork, 
CriticNetwork 23 | from losses import LossFunction 24 | 25 | 26 | def print_epoch(epoch): 27 | print('################################################################') 28 | print('Epoch: ', epoch) 29 | print('----------------------------------------------------------------') 30 | 31 | 32 | def get_stats(dataset, gamma, idx, domains): 33 | J = np.mean(compute_J(dataset, gamma[idx])) 34 | print(domains[idx] + ': J: %f' % J) 35 | 36 | return J 37 | 38 | 39 | def experiment(idx, args): 40 | np.random.seed() 41 | 42 | domains = [''.join(g) for g in args.games] 43 | 44 | scores = list() 45 | for _ in range(len(domains)): 46 | scores.append(list()) 47 | 48 | optimizer_actor = dict() 49 | optimizer_actor['class'] = optim.Adam 50 | optimizer_actor['params'] = dict(lr=args.learning_rate_actor) 51 | 52 | optimizer_critic = dict() 53 | optimizer_critic['class'] = optim.Adam 54 | optimizer_critic['params'] = dict(lr=args.learning_rate_critic, 55 | weight_decay=1e-2) 56 | 57 | # MDP 58 | mdp = list() 59 | gamma_eval = list() 60 | for i, g in enumerate(domains): 61 | mdp.append(Gym(g, args.horizon[i], args.gamma[i])) 62 | gamma_eval.append(args.gamma[i]) 63 | if args.render: 64 | mdp[0].render(mode='human') 65 | 66 | n_input_per_mdp = [m.info.observation_space.shape for m in mdp] 67 | n_actions_per_head = [(m.info.action_space.shape[0],) for m in mdp] 68 | 69 | max_obs_dim = 0 70 | max_act_n = 0 71 | for i in range(len(domains)): 72 | n = mdp[i].info.observation_space.shape[0] 73 | m = len(mdp[i].info.action_space.shape) 74 | if n > max_obs_dim: 75 | max_obs_dim = n 76 | max_obs_idx = i 77 | if m > max_act_n: 78 | max_act_n = m 79 | max_act_idx = i 80 | gammas = [m.info.gamma for m in mdp] 81 | horizons = [m.info.horizon for m in mdp] 82 | mdp_info = MDPInfo(mdp[max_obs_idx].info.observation_space, 83 | mdp[max_act_idx].info.action_space, gammas, horizons) 84 | max_action_value = list() 85 | for m in mdp: 86 | assert len(np.unique(m.info.action_space.low)) == 1 87 | assert len(np.unique(m.info.action_space.high)) == 1 88 | assert abs(m.info.action_space.low[0]) == m.info.action_space.high[0] 89 | 90 | max_action_value.append(m.info.action_space.high[0]) 91 | 92 | # DQN learning run 93 | 94 | # Settings 95 | if args.debug: 96 | initial_replay_size = args.batch_size 97 | max_replay_size = 500 98 | test_samples = 20 99 | evaluation_frequency = 50 100 | max_steps = 1000 101 | else: 102 | initial_replay_size = args.initial_replay_size 103 | max_replay_size = args.max_replay_size 104 | test_samples = args.test_samples 105 | evaluation_frequency = args.evaluation_frequency 106 | max_steps = args.max_steps 107 | 108 | # Policy 109 | policy_class = OrnsteinUhlenbeckPolicy 110 | policy_params = dict(sigma=np.ones(1) * .2, theta=.15, dt=1e-2, 111 | n_actions_per_head=n_actions_per_head, 112 | max_action_value=max_action_value) 113 | 114 | # Approximator 115 | n_games = len(args.games) 116 | loss = LossFunction(n_games, args.batch_size, evaluation_frequency) 117 | 118 | actor_approximator = TorchApproximator 119 | actor_input_shape = [m.info.observation_space.shape for m in mdp] 120 | 121 | actor_approximator_params = dict( 122 | network=ActorNetwork, 123 | input_shape=actor_input_shape, 124 | output_shape=(max(n_actions_per_head)[0],), 125 | n_actions_per_head=n_actions_per_head, 126 | n_hidden_1=args.hidden_neurons[0], 127 | n_hidden_2=args.hidden_neurons[1], 128 | optimizer=optimizer_actor, 129 | use_cuda=args.use_cuda, 130 | features=args.features 131 | ) 132 | 133 | critic_approximator = TorchApproximator 134 
| critic_input_shape = [m.info.observation_space.shape for m in mdp] 135 | critic_approximator_params = dict( 136 | network=CriticNetwork, 137 | input_shape=critic_input_shape, 138 | output_shape=(1,), 139 | n_actions_per_head=n_actions_per_head, 140 | n_hidden_1=args.hidden_neurons[0], 141 | n_hidden_2=args.hidden_neurons[1], 142 | optimizer=optimizer_actor, 143 | loss=loss, 144 | use_cuda=args.use_cuda, 145 | features=args.features 146 | ) 147 | 148 | # Agent 149 | algorithm_params = dict( 150 | batch_size=args.batch_size, 151 | initial_replay_size=initial_replay_size, 152 | max_replay_size=max_replay_size, 153 | tau=args.tau, 154 | actor_params=actor_approximator_params, 155 | critic_params=critic_approximator_params, 156 | policy_params=policy_params, 157 | n_games=len(domains), 158 | n_input_per_mdp=n_input_per_mdp, 159 | n_actions_per_head=n_actions_per_head, 160 | dtype=np.float32 161 | ) 162 | 163 | agent = DDPG(actor_approximator, critic_approximator, policy_class, 164 | mdp_info, **algorithm_params) 165 | 166 | # Algorithm 167 | core = Core(agent, mdp) 168 | 169 | # RUN 170 | 171 | # Fill replay memory with random dataset 172 | print_epoch(0) 173 | core.learn(n_steps=initial_replay_size, 174 | n_steps_per_fit=initial_replay_size, quiet=args.quiet) 175 | 176 | if args.transfer: 177 | weights = pickle.load(open(args.transfer, 'rb')) 178 | agent.set_shared_weights(weights) 179 | 180 | if args.load: 181 | weights = np.load(args.load) 182 | agent.policy.set_weights(weights) 183 | 184 | # Evaluate initial policy 185 | agent.policy.eval = True 186 | dataset = core.evaluate(n_steps=test_samples, render=args.render, 187 | quiet=args.quiet) 188 | agent.policy.eval = False 189 | for i in range(len(mdp)): 190 | d = dataset[i::len(mdp)] 191 | scores[i].append(get_stats(d, gamma_eval, i, domains)) 192 | 193 | if args.unfreeze_epoch > 0: 194 | agent.freeze_shared_weights() 195 | 196 | best_score_sum = -np.inf 197 | best_weights = None 198 | 199 | np.save(folder_name + 'scores-exp-%d.npy' % idx, scores) 200 | np.save(folder_name + 'critic_loss-exp-%d.npy' % idx, 201 | agent._critic_approximator.model._loss.get_losses()) 202 | for n_epoch in range(1, max_steps // evaluation_frequency + 1): 203 | if n_epoch >= args.unfreeze_epoch > 0: 204 | agent.unfreeze_shared_weights() 205 | 206 | print_epoch(n_epoch) 207 | print('- Learning:') 208 | # learning step 209 | core.learn(n_steps=evaluation_frequency, 210 | n_steps_per_fit=1, quiet=args.quiet) 211 | 212 | print('- Evaluation:') 213 | # evaluation step 214 | agent.policy.eval = True 215 | dataset = core.evaluate(n_steps=test_samples, 216 | render=args.render, quiet=args.quiet) 217 | agent.policy.eval = False 218 | 219 | current_score_sum = 0 220 | for i in range(len(mdp)): 221 | d = dataset[i::len(mdp)] 222 | current_score = get_stats(d, gamma_eval, i, domains) 223 | scores[i].append(current_score) 224 | current_score_sum += current_score 225 | 226 | # Save shared weights if best score 227 | if args.save_shared and current_score_sum >= best_score_sum: 228 | best_score_sum = current_score_sum 229 | best_weights = agent.get_shared_weights() 230 | 231 | if args.save: 232 | np.save(folder_name + 'best_weights-exp-%d.npy' % idx, 233 | agent.policy.get_weights()) 234 | 235 | np.save(folder_name + 'scores-exp-%d.npy' % idx, scores) 236 | np.save(folder_name + 'critic_loss-exp-%d.npy' % idx, 237 | agent._critic_approximator.model._loss.get_losses()) 238 | 239 | if args.save_shared: 240 | pickle.dump(best_weights, open(args.save_shared, 'wb')) 241 | 242 | 
return scores, agent._critic_approximator.model._loss.get_losses() 243 | 244 | 245 | if __name__ == '__main__': 246 | # Argument parser 247 | parser = argparse.ArgumentParser() 248 | 249 | arg_game = parser.add_argument_group('Game') 250 | arg_game.add_argument("--games", type=list, nargs='+', 251 | default=['AntBulletEnv-v0']) 252 | arg_game.add_argument("--horizon", type=int, nargs='+') 253 | arg_game.add_argument("--gamma", type=float, nargs='+') 254 | arg_game.add_argument("--n-exp", type=int) 255 | 256 | arg_mem = parser.add_argument_group('Replay Memory') 257 | arg_mem.add_argument("--initial-replay-size", type=int, default=64, 258 | help='Initial size of the replay memory.') 259 | arg_mem.add_argument("--max-replay-size", type=int, default=50000, 260 | help='Max size of the replay memory.') 261 | 262 | arg_net = parser.add_argument_group('Deep Q-Network') 263 | arg_net.add_argument("--hidden-neurons", type=int, nargs=2, 264 | default=[600, 500]) 265 | arg_net.add_argument("--learning-rate-actor", type=float, default=1e-4, 266 | help='Learning rate value of the optimizer. Only used' 267 | 'in rmspropcentered') 268 | arg_net.add_argument("--learning-rate-critic", type=float, default=1e-3, 269 | help='Learning rate value of the optimizer. Only used' 270 | 'in rmspropcentered') 271 | 272 | arg_alg = parser.add_argument_group('Algorithm') 273 | arg_alg.add_argument("--features", choices=['relu', 'sigmoid']) 274 | arg_alg.add_argument("--batch-size", type=int, default=64, 275 | help='Batch size for each fit of the network.') 276 | arg_alg.add_argument("--tau", type=float, default=1e-3) 277 | arg_alg.add_argument("--history-length", type=int, default=1, 278 | help='Number of frames composing a state.') 279 | arg_alg.add_argument("--evaluation-frequency", type=int, default=10000, 280 | help='Number of learning step before each evaluation.' 
281 | 'This number represents an epoch.') 282 | arg_alg.add_argument("--max-steps", type=int, default=1000000, 283 | help='Total number of learning steps.') 284 | arg_alg.add_argument("--test-samples", type=int, default=5000, 285 | help='Number of steps for each evaluation.') 286 | arg_alg.add_argument("--transfer", type=str, default='', 287 | help='Path to the file of the weights of the common ' 288 | 'layers to be loaded') 289 | arg_alg.add_argument("--save-shared", type=str, default='', 290 | help='filename where to save the shared weights') 291 | arg_alg.add_argument("--unfreeze-epoch", type=int, default=0, 292 | help="Number of epoch where to unfreeze shared weights.") 293 | 294 | arg_utils = parser.add_argument_group('Utils') 295 | arg_utils.add_argument('--use-cuda', action='store_true', 296 | help='Flag specifying whether to use the GPU.') 297 | arg_utils.add_argument('--load', type=str, 298 | help='Path of the model to be loaded.') 299 | arg_utils.add_argument('--save', action='store_true', 300 | help='Flag specifying whether to save the model.') 301 | arg_utils.add_argument('--render', action='store_true', 302 | help='Flag specifying whether to render the game.') 303 | arg_utils.add_argument('--quiet', action='store_true', 304 | help='Flag specifying whether to hide the progress' 305 | 'bar.') 306 | arg_utils.add_argument('--debug', action='store_true', 307 | help='Flag specifying whether the script has to be' 308 | 'run in debug mode.') 309 | arg_utils.add_argument('--postfix', type=str, default='', 310 | help='Flag used to add a postfix to the folder name') 311 | 312 | args = parser.parse_args() 313 | 314 | folder_name = './logs/bullet_' + datetime.datetime.now().strftime( 315 | '%Y-%m-%d_%H-%M-%S') + args.postfix + '/' 316 | pathlib.Path(folder_name).mkdir(parents=True) 317 | with open(folder_name + 'args.pkl', 'wb') as f: 318 | pickle.dump(args, f) 319 | 320 | out = Parallel(n_jobs=4)(delayed(experiment)(i, args) 321 | for i in range(args.n_exp)) 322 | 323 | scores = np.array([o[0] for o in out]) 324 | critic_loss = np.array([o[1] for o in out]) 325 | 326 | np.save(folder_name + 'scores.npy', scores) 327 | np.save(folder_name + 'critic_loss.npy', critic_loss) 328 | -------------------------------------------------------------------------------- /ddpg/run_mujoco.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import pathlib 4 | import sys 5 | 6 | from joblib import delayed, Parallel 7 | import numpy as np 8 | import torch.optim as optim 9 | 10 | import pickle 11 | 12 | sys.path.append('..') 13 | 14 | from mushroom_rl.approximators.parametric import TorchApproximator 15 | from mushroom_rl.environments import * 16 | from mushroom_rl.utils.dataset import compute_J 17 | 18 | from core import Core 19 | from ddpg import DDPG 20 | from policy import OrnsteinUhlenbeckPolicy 21 | 22 | from networks import ActorNetwork, CriticNetwork 23 | from losses import LossFunction 24 | 25 | 26 | def print_epoch(epoch): 27 | print('################################################################') 28 | print('Epoch: ', epoch) 29 | print('----------------------------------------------------------------') 30 | 31 | 32 | def get_stats(dataset, gamma, idx, domains, tasks): 33 | J = np.mean(compute_J(dataset, gamma[idx])) 34 | print(domains[idx] + '-' + tasks[idx] + ': J: %f' % J) 35 | 36 | return J 37 | 38 | 39 | def experiment(idx, args): 40 | np.random.seed() 41 | 42 | args.games = [''.join(g) for g in args.games] 43 | 
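    # --games is an interleaved list of DeepMind Control (domain, task) pairs,
    # e.g. cartpole swingup walker walk: even entries are domains, odd ones tasks.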
44 | domains = args.games[::2] 45 | tasks = args.games[1::2] 46 | 47 | scores = list() 48 | for _ in range(len(domains)): 49 | scores.append(list()) 50 | 51 | optimizer_actor = dict() 52 | optimizer_actor['class'] = optim.Adam 53 | optimizer_actor['params'] = dict(lr=args.learning_rate_actor) 54 | 55 | optimizer_critic = dict() 56 | optimizer_critic['class'] = optim.Adam 57 | optimizer_critic['params'] = dict(lr=args.learning_rate_critic, 58 | weight_decay=1e-2) 59 | 60 | # MDP 61 | mdp = list() 62 | gamma_eval = list() 63 | for i, g in enumerate(zip(domains, tasks)): 64 | mdp.append(DMControl(g[0], g[1], args.horizon[i], args.gamma[i])) 65 | gamma_eval.append(args.gamma[i]) 66 | if args.render: 67 | mdp[0].render() 68 | 69 | n_input_per_mdp = [m.info.observation_space.shape for m in mdp] 70 | n_actions_per_head = [(m.info.action_space.shape[0],) for m in mdp] 71 | 72 | max_obs_dim = 0 73 | max_act_n = 0 74 | for i in range(len(domains)): 75 | n = mdp[i].info.observation_space.shape[0] 76 | m = len(mdp[i].info.action_space.shape) 77 | if n > max_obs_dim: 78 | max_obs_dim = n 79 | max_obs_idx = i 80 | if m > max_act_n: 81 | max_act_n = m 82 | max_act_idx = i 83 | gammas = [m.info.gamma for m in mdp] 84 | horizons = [m.info.horizon for m in mdp] 85 | mdp_info = MDPInfo(mdp[max_obs_idx].info.observation_space, 86 | mdp[max_act_idx].info.action_space, gammas, horizons) 87 | max_action_value = list() 88 | for m in mdp: 89 | assert len(np.unique(m.info.action_space.low)) == 1 90 | assert len(np.unique(m.info.action_space.high)) == 1 91 | assert abs(m.info.action_space.low[0]) == m.info.action_space.high[0] 92 | 93 | max_action_value.append(m.info.action_space.high[0]) 94 | 95 | # DQN learning run 96 | 97 | # Settings 98 | if args.debug: 99 | initial_replay_size = args.batch_size 100 | max_replay_size = 500 101 | test_samples = 20 102 | evaluation_frequency = 50 103 | max_steps = 1000 104 | else: 105 | initial_replay_size = args.initial_replay_size 106 | max_replay_size = args.max_replay_size 107 | test_samples = args.test_samples 108 | evaluation_frequency = args.evaluation_frequency 109 | max_steps = args.max_steps 110 | 111 | # Policy 112 | policy_class = OrnsteinUhlenbeckPolicy 113 | policy_params = dict(sigma=np.ones(1) * .2, theta=.15, dt=1e-2, 114 | n_actions_per_head=n_actions_per_head, 115 | max_action_value=max_action_value) 116 | 117 | # Approximator 118 | n_games = len(args.games) 119 | loss = LossFunction(n_games, args.batch_size, evaluation_frequency) 120 | 121 | actor_approximator = TorchApproximator 122 | actor_input_shape = [m.info.observation_space.shape for m in mdp] 123 | 124 | actor_approximator_params = dict( 125 | network=ActorNetwork, 126 | input_shape=actor_input_shape, 127 | output_shape=(max(n_actions_per_head)[0],), 128 | n_actions_per_head=n_actions_per_head, 129 | n_hidden_1=args.hidden_neurons[0], 130 | n_hidden_2=args.hidden_neurons[1], 131 | optimizer=optimizer_actor, 132 | use_cuda=args.use_cuda, 133 | features=args.features 134 | ) 135 | 136 | critic_approximator = TorchApproximator 137 | critic_input_shape = [m.info.observation_space.shape for m in mdp] 138 | critic_approximator_params = dict( 139 | network=CriticNetwork, 140 | input_shape=critic_input_shape, 141 | output_shape=(1,), 142 | n_actions_per_head=n_actions_per_head, 143 | n_hidden_1=args.hidden_neurons[0], 144 | n_hidden_2=args.hidden_neurons[1], 145 | optimizer=optimizer_actor, 146 | loss=loss, 147 | use_cuda=args.use_cuda, 148 | features=args.features 149 | ) 150 | 151 | # Agent 152 | 
algorithm_params = dict( 153 | batch_size=args.batch_size, 154 | initial_replay_size=initial_replay_size, 155 | max_replay_size=max_replay_size, 156 | tau=args.tau, 157 | actor_params=actor_approximator_params, 158 | critic_params=critic_approximator_params, 159 | policy_params=policy_params, 160 | n_games=len(domains), 161 | n_input_per_mdp=n_input_per_mdp, 162 | n_actions_per_head=n_actions_per_head, 163 | dtype=np.float32 164 | ) 165 | 166 | agent = DDPG(actor_approximator, critic_approximator, policy_class, 167 | mdp_info, **algorithm_params) 168 | 169 | # Algorithm 170 | core = Core(agent, mdp) 171 | 172 | # RUN 173 | 174 | # Fill replay memory with random dataset 175 | print_epoch(0) 176 | core.learn(n_steps=initial_replay_size, 177 | n_steps_per_fit=initial_replay_size, quiet=args.quiet) 178 | 179 | if args.transfer: 180 | weights = pickle.load(open(args.transfer, 'rb')) 181 | agent.set_shared_weights(weights) 182 | 183 | if args.load: 184 | weights = np.load(args.load) 185 | agent.policy.set_weights(weights) 186 | 187 | # Evaluate initial policy 188 | agent.policy.eval = True 189 | dataset = core.evaluate(n_steps=test_samples, render=args.render, 190 | quiet=args.quiet) 191 | agent.policy.eval = False 192 | for i in range(len(mdp)): 193 | d = dataset[i::len(mdp)] 194 | scores[i].append(get_stats(d, gamma_eval, i, domains, tasks)) 195 | 196 | if args.unfreeze_epoch > 0: 197 | agent.freeze_shared_weights() 198 | 199 | best_score_sum = -np.inf 200 | best_weights = None 201 | 202 | np.save(folder_name + 'scores-exp-%d.npy' % idx, scores) 203 | np.save(folder_name + 'critic_loss-exp-%d.npy' % idx, 204 | agent._critic_approximator.model._loss.get_losses()) 205 | for n_epoch in range(1, max_steps // evaluation_frequency + 1): 206 | if n_epoch >= args.unfreeze_epoch > 0: 207 | agent.unfreeze_shared_weights() 208 | 209 | print_epoch(n_epoch) 210 | print('- Learning:') 211 | # learning step 212 | core.learn(n_steps=evaluation_frequency, 213 | n_steps_per_fit=1, quiet=args.quiet) 214 | 215 | print('- Evaluation:') 216 | # evaluation step 217 | agent.policy.eval = True 218 | dataset = core.evaluate(n_steps=test_samples, 219 | render=args.render, quiet=args.quiet) 220 | agent.policy.eval = False 221 | 222 | current_score_sum = 0 223 | for i in range(len(mdp)): 224 | d = dataset[i::len(mdp)] 225 | current_score = get_stats(d, gamma_eval, i, domains, tasks) 226 | scores[i].append(current_score) 227 | current_score_sum += current_score 228 | 229 | # Save shared weights if best score 230 | if args.save_shared and current_score_sum >= best_score_sum: 231 | best_score_sum = current_score_sum 232 | best_weights = agent.get_shared_weights() 233 | 234 | if args.save: 235 | np.save(folder_name + 'best_weights-exp-%d.npy' % idx, 236 | agent.policy.get_weights()) 237 | 238 | np.save(folder_name + 'scores-exp-%d.npy' % idx, scores) 239 | np.save(folder_name + 'critic_loss-exp-%d.npy' % idx, 240 | agent._critic_approximator.model._loss.get_losses()) 241 | 242 | if args.save_shared: 243 | pickle.dump(best_weights, open(args.save_shared, 'wb')) 244 | 245 | return scores, agent._critic_approximator.model._loss.get_losses() 246 | 247 | 248 | if __name__ == '__main__': 249 | # Argument parser 250 | parser = argparse.ArgumentParser() 251 | 252 | arg_game = parser.add_argument_group('Game') 253 | arg_game.add_argument("--games", type=list, nargs='+', 254 | default=['cartpole', 'swingup']) 255 | arg_game.add_argument("--horizon", type=int, nargs='+') 256 | arg_game.add_argument("--gamma", type=float, nargs='+') 257 
| arg_game.add_argument("--n-exp", type=int) 258 | 259 | arg_mem = parser.add_argument_group('Replay Memory') 260 | arg_mem.add_argument("--initial-replay-size", type=int, default=64, 261 | help='Initial size of the replay memory.') 262 | arg_mem.add_argument("--max-replay-size", type=int, default=50000, 263 | help='Max size of the replay memory.') 264 | 265 | arg_net = parser.add_argument_group('Deep Q-Network') 266 | arg_net.add_argument("--hidden-neurons", type=int, nargs=2, 267 | default=[600, 500]) 268 | arg_net.add_argument("--learning-rate-actor", type=float, default=1e-4, 269 | help='Learning rate value of the optimizer. Only used' 270 | 'in rmspropcentered') 271 | arg_net.add_argument("--learning-rate-critic", type=float, default=1e-3, 272 | help='Learning rate value of the optimizer. Only used' 273 | 'in rmspropcentered') 274 | 275 | arg_alg = parser.add_argument_group('Algorithm') 276 | arg_alg.add_argument("--features", choices=['relu', 'sigmoid']) 277 | arg_alg.add_argument("--batch-size", type=int, default=64, 278 | help='Batch size for each fit of the network.') 279 | arg_alg.add_argument("--tau", type=float, default=1e-3) 280 | arg_alg.add_argument("--history-length", type=int, default=1, 281 | help='Number of frames composing a state.') 282 | arg_alg.add_argument("--evaluation-frequency", type=int, default=10000, 283 | help='Number of learning step before each evaluation.' 284 | 'This number represents an epoch.') 285 | arg_alg.add_argument("--max-steps", type=int, default=1000000, 286 | help='Total number of learning steps.') 287 | arg_alg.add_argument("--test-samples", type=int, default=5000, 288 | help='Number of steps for each evaluation.') 289 | arg_alg.add_argument("--transfer", type=str, default='', 290 | help='Path to the file of the weights of the common ' 291 | 'layers to be loaded') 292 | arg_alg.add_argument("--save-shared", type=str, default='', 293 | help='filename where to save the shared weights') 294 | arg_alg.add_argument("--unfreeze-epoch", type=int, default=0, 295 | help="Number of epoch where to unfreeze shared weights.") 296 | 297 | arg_utils = parser.add_argument_group('Utils') 298 | arg_utils.add_argument('--use-cuda', action='store_true', 299 | help='Flag specifying whether to use the GPU.') 300 | arg_utils.add_argument('--load', type=str, 301 | help='Path of the model to be loaded.') 302 | arg_utils.add_argument('--save', action='store_true', 303 | help='Flag specifying whether to save the model.') 304 | arg_utils.add_argument('--render', action='store_true', 305 | help='Flag specifying whether to render the game.') 306 | arg_utils.add_argument('--quiet', action='store_true', 307 | help='Flag specifying whether to hide the progress' 308 | 'bar.') 309 | arg_utils.add_argument('--debug', action='store_true', 310 | help='Flag specifying whether the script has to be' 311 | 'run in debug mode.') 312 | arg_utils.add_argument('--postfix', type=str, default='', 313 | help='Flag used to add a postfix to the folder name') 314 | 315 | args = parser.parse_args() 316 | 317 | folder_name = './logs/mujoco_' + datetime.datetime.now().strftime( 318 | '%Y-%m-%d_%H-%M-%S') + args.postfix + '/' 319 | pathlib.Path(folder_name).mkdir(parents=True) 320 | 321 | with open(folder_name + 'args.pkl', 'wb') as f: 322 | pickle.dump(args, f) 323 | 324 | out = Parallel(n_jobs=-1)(delayed(experiment)(i, args) 325 | for i in range(args.n_exp)) 326 | 327 | scores = np.array([o[0] for o in out]) 328 | critic_loss = np.array([o[1] for o in out]) 329 | 330 | np.save(folder_name + 
'scores.npy', scores) 331 | np.save(folder_name + 'critic_loss_raw.npy', critic_loss) 332 | -------------------------------------------------------------------------------- /dqn/dqn.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | from mushroom_rl.core.agent import Agent 7 | from mushroom_rl.approximators.regressor import Regressor 8 | from mushroom_rl.approximators.parametric.torch_approximator import * 9 | 10 | from replay_memory import PrioritizedReplayMemory, ReplayMemory 11 | 12 | 13 | class DQN(Agent): 14 | """ 15 | Deep Q-Network algorithm. 16 | "Human-Level Control Through Deep Reinforcement Learning". 17 | Mnih V. et al.. 2015. 18 | 19 | """ 20 | def __init__(self, approximator, policy, mdp_info, batch_size, 21 | initial_replay_size, max_replay_size, n_actions_per_head, 22 | history_length=4, n_input_per_mdp=None, replay_memory=None, 23 | target_update_frequency=2500, fit_params=None, 24 | approximator_params=None, n_games=1, clip_reward=True, 25 | dtype=np.uint8): 26 | self._fit_params = dict() if fit_params is None else fit_params 27 | 28 | self._batch_size = batch_size 29 | self._n_games = n_games 30 | self._clip_reward = clip_reward 31 | if n_input_per_mdp is None: 32 | self._n_input_per_mdp = [mdp_info.observation_space.shape 33 | for _ in range(self._n_games)] 34 | else: 35 | self._n_input_per_mdp = n_input_per_mdp 36 | self._n_action_per_head = n_actions_per_head 37 | self._history_length = history_length 38 | self._max_actions = max(n_actions_per_head)[0] 39 | self._target_update_frequency = target_update_frequency 40 | 41 | if replay_memory is not None: 42 | self._replay_memory = replay_memory 43 | if isinstance(replay_memory[0], PrioritizedReplayMemory): 44 | self._fit = self._fit_prioritized 45 | else: 46 | self._fit = self._fit_standard 47 | else: 48 | self._replay_memory = [ReplayMemory( 49 | initial_replay_size, max_replay_size) for _ in range(self._n_games) 50 | ] 51 | self._fit = self._fit_standard 52 | 53 | self._n_updates = 0 54 | 55 | apprx_params_train = deepcopy(approximator_params) 56 | apprx_params_target = deepcopy(approximator_params) 57 | self.approximator = Regressor(approximator, **apprx_params_train) 58 | self.target_approximator = Regressor(approximator, 59 | **apprx_params_target) 60 | policy.set_q(self.approximator) 61 | 62 | self.target_approximator.model.set_weights( 63 | self.approximator.model.get_weights()) 64 | 65 | super().__init__(mdp_info, policy) 66 | 67 | n_samples = self._batch_size * self._n_games 68 | self._state_idxs = np.zeros(n_samples, dtype=np.int) 69 | self._state = np.zeros( 70 | ((n_samples, 71 | self._history_length) + self.mdp_info.observation_space.shape), 72 | dtype=dtype 73 | ).squeeze() 74 | self._action = np.zeros((n_samples, 1), dtype=np.int) 75 | self._reward = np.zeros(n_samples) 76 | self._next_state_idxs = np.zeros(n_samples, dtype=np.int) 77 | self._next_state = np.zeros( 78 | ((n_samples, 79 | self._history_length) + self.mdp_info.observation_space.shape), 80 | dtype=dtype 81 | ).squeeze() 82 | self._absorbing = np.zeros(n_samples) 83 | self._idxs = np.zeros(n_samples, dtype=np.int) 84 | self._is_weight = np.zeros(n_samples) 85 | 86 | def fit(self, dataset): 87 | self._fit(dataset) 88 | 89 | self._n_updates += 1 90 | if self._n_updates % self._target_update_frequency == 0: 91 | self._update_target() 92 | 93 | def _fit_standard(self, dataset): 94 | s = np.array([d[0][0] for d 
in dataset]).ravel() 95 | games = np.unique(s) 96 | for g in games: 97 | idxs = np.argwhere(s == g).ravel() 98 | d = list() 99 | for idx in idxs: 100 | d.append(dataset[idx]) 101 | 102 | self._replay_memory[g].add(d) 103 | 104 | fit_condition = np.all([rm.initialized for rm in self._replay_memory]) 105 | 106 | if fit_condition: 107 | for i in range(len(self._replay_memory)): 108 | game_state, game_action, game_reward, game_next_state,\ 109 | game_absorbing, _ = self._replay_memory[i].get( 110 | self._batch_size) 111 | 112 | start = self._batch_size * i 113 | stop = start + self._batch_size 114 | 115 | self._state_idxs[start:stop] = np.ones(self._batch_size) * i 116 | self._state[start:stop, :self._n_input_per_mdp[i][0]] = game_state 117 | self._action[start:stop] = game_action 118 | self._reward[start:stop] = game_reward 119 | self._next_state_idxs[start:stop] = np.ones(self._batch_size) * i 120 | self._next_state[start:stop, :self._n_input_per_mdp[i][0]] = game_next_state 121 | self._absorbing[start:stop] = game_absorbing 122 | 123 | if self._clip_reward: 124 | reward = np.clip(self._reward, -1, 1) 125 | else: 126 | reward = self._reward 127 | 128 | q_next = self._next_q() 129 | q = reward + q_next 130 | 131 | self.approximator.fit(self._state, self._action, q, 132 | idx=self._state_idxs, **self._fit_params) 133 | 134 | def _fit_prioritized(self, dataset): 135 | s = np.array([d[0][0] for d in dataset]).ravel() 136 | games = np.unique(s) 137 | for g in games: 138 | idxs = np.argwhere(s == g).ravel() 139 | d = list() 140 | for idx in idxs: 141 | d.append(dataset[idx]) 142 | 143 | self._replay_memory[g].add( 144 | d, np.ones(len(d)) * self._replay_memory[g].max_priority 145 | ) 146 | 147 | fit_condition = np.all([rm.initialized for rm in self._replay_memory]) 148 | 149 | if fit_condition: 150 | for i in range(len(self._replay_memory)): 151 | game_state, game_action, game_reward, game_next_state,\ 152 | game_absorbing, _, game_idxs, game_is_weight =\ 153 | self._replay_memory[i].get(self._batch_size) 154 | 155 | start = self._batch_size * i 156 | stop = start + self._batch_size 157 | 158 | self._state_idxs[start:stop] = np.ones(self._batch_size) * i 159 | self._state[start:stop, :self._n_input_per_mdp[i][0]] = game_state 160 | self._action[start:stop] = game_action 161 | self._reward[start:stop] = game_reward 162 | self._next_state_idxs[start:stop] = np.ones(self._batch_size) * i 163 | self._next_state[start:stop, :self._n_input_per_mdp[i][0]] = game_next_state 164 | self._absorbing[start:stop] = game_absorbing 165 | self._idxs[start:stop] = game_idxs 166 | self._is_weight[start:stop] = game_is_weight 167 | 168 | if self._clip_reward: 169 | reward = np.clip(self._reward, -1, 1) 170 | else: 171 | reward = self._reward 172 | 173 | q_next = self._next_q() 174 | q = reward + q_next 175 | q_current = self.approximator.predict(self._state, self._action, 176 | idx=self._state_idxs) 177 | td_error = q - q_current 178 | 179 | for er in self._replay_memory: 180 | er.update(td_error, self._idxs) 181 | 182 | self.approximator.fit(self._state, self._action, q, 183 | weights=self._is_weight, 184 | idx=self._state_idxs, 185 | **self._fit_params) 186 | 187 | def get_shared_weights(self): 188 | return self.approximator.model.network.get_shared_weights() 189 | 190 | def set_shared_weights(self, weights): 191 | self.approximator.model.network.set_shared_weights(weights) 192 | 193 | def freeze_shared_weights(self): 194 | self.approximator.model.network.freeze_shared_weights() 195 | 196 | def 
unfreeze_shared_weights(self): 197 | self.approximator.model.network.unfreeze_shared_weights() 198 | 199 | def _update_target(self): 200 | """ 201 | Update the target network. 202 | 203 | """ 204 | self.target_approximator.model.set_weights( 205 | self.approximator.model.get_weights()) 206 | 207 | def _next_q(self): 208 | q = self.target_approximator.predict(self._next_state, 209 | idx=self._next_state_idxs) 210 | 211 | out_q = np.zeros(self._batch_size * self._n_games) 212 | 213 | for i in range(self._n_games): 214 | start = self._batch_size * i 215 | stop = start + self._batch_size 216 | if np.any(self._absorbing[start:stop]): 217 | q[start:stop] *= 1 - self._absorbing[start:stop].reshape(-1, 1) 218 | 219 | n_actions = self._n_action_per_head[i][0] 220 | out_q[start:stop] = np.max(q[start:stop, :n_actions], axis=1) 221 | out_q[start:stop] *= self.mdp_info.gamma[i] 222 | 223 | return out_q 224 | 225 | 226 | class DoubleDQN(DQN): 227 | """ 228 | Double DQN algorithm. 229 | "Deep Reinforcement Learning with Double Q-Learning". 230 | Hasselt H. V. et al.. 2016. 231 | 232 | """ 233 | def _next_q(self): 234 | q = self.approximator.predict(self._next_state, 235 | idx=self._next_state_idxs) 236 | out_q = np.zeros(self._batch_size * self._n_games) 237 | 238 | for i in range(self._n_games): 239 | start = self._batch_size * i 240 | stop = start + self._batch_size 241 | n_actions = self._n_action_per_head[i][0] 242 | max_a = np.argmax(q[start:stop, :n_actions], axis=1) 243 | 244 | double_q = self.target_approximator.predict( 245 | self._next_state[start:stop], max_a, 246 | idx=self._next_state_idxs[start:stop] 247 | ) 248 | if np.any(self._absorbing[start:stop]): 249 | double_q *= 1 - self._absorbing[start:stop].reshape(-1, 1) 250 | 251 | out_q[start:stop] = double_q * self.mdp_info.gamma[i] 252 | 253 | return out_q 254 | -------------------------------------------------------------------------------- /dqn/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | class LossFunction(object): 6 | def __init__(self, n_games, batch_size, eval_frequency): 7 | self._n_games = n_games 8 | self._batch_size = batch_size 9 | self._eval_frequency = eval_frequency 10 | 11 | self._losses = list() 12 | self._reg_losses = list() 13 | self._counter = 0 14 | 15 | def get_losses(self): 16 | return self._losses 17 | 18 | def get_reg_losses(self): 19 | return self._reg_losses 20 | 21 | def __call__(self, yhat, y, reduction='mean'): 22 | loss = F.smooth_l1_loss(yhat, y, reduce=False) 23 | 24 | if self._need_log(): 25 | temp_losses = list() 26 | 27 | for i in range(self._n_games): 28 | start = i * self._batch_size 29 | stop = start + self._batch_size 30 | temp_losses.append(torch.mean(loss[start:stop]).item()) 31 | 32 | self._losses.append(temp_losses) 33 | 34 | if reduction is 'none': 35 | return loss 36 | elif reduction is 'mean': 37 | return loss.mean() 38 | else: 39 | raise NotImplementedError 40 | 41 | def _need_log(self): 42 | self._counter += 1 43 | if self._counter >= self._eval_frequency: 44 | self._counter = 0 45 | return True 46 | else: 47 | return False 48 | -------------------------------------------------------------------------------- /dqn/networks.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | import numpy as np 6 | 7 | 8 | class AtariNetwork(nn.Module): 9 | n_features = 512 10 | 11 | 
def __init__(self, input_shape, _, n_actions_per_head, use_cuda, n_games, 12 | features, dropout): 13 | super().__init__() 14 | 15 | self._n_input = input_shape 16 | self._n_games = n_games 17 | self._max_actions = max(n_actions_per_head)[0] 18 | self._features = features 19 | self._use_cuda = use_cuda 20 | self._n_shared = 2 21 | 22 | self._h1 = nn.ModuleList( 23 | [nn.Conv2d(self._n_input[0], 32, kernel_size=8, stride=4) for _ in range( 24 | self._n_games)] 25 | ) 26 | self._h2 = nn.ModuleList( 27 | [nn.Conv2d(32, 64, kernel_size=4, stride=2) for _ in range( 28 | self._n_games)] 29 | ) 30 | self._h3 = nn.ModuleList( 31 | [nn.Conv2d(64, 64, kernel_size=3, stride=1) for _ in range( 32 | self._n_games)] 33 | ) 34 | self._h4 = nn.Linear(3136, self.n_features) 35 | self._h5 = nn.ModuleList( 36 | [nn.Linear(self.n_features, self._max_actions) for _ in range( 37 | self._n_games)] 38 | ) 39 | 40 | nn.init.xavier_uniform_(self._h4.weight, 41 | gain=nn.init.calculate_gain('relu')) 42 | for i in range(self._n_games): 43 | nn.init.xavier_uniform_(self._h1[i].weight, 44 | gain=nn.init.calculate_gain('relu')) 45 | nn.init.xavier_uniform_(self._h2[i].weight, 46 | gain=nn.init.calculate_gain('relu')) 47 | nn.init.xavier_uniform_(self._h3[i].weight, 48 | gain=nn.init.calculate_gain('relu')) 49 | nn.init.xavier_uniform_(self._h5[i].weight, 50 | gain=nn.init.calculate_gain('linear')) 51 | 52 | def forward(self, state, action=None, idx=None): 53 | state = state.float() / 255. 54 | 55 | h = list() 56 | for i in np.unique(idx): 57 | idxs = np.argwhere(idx == i).ravel() 58 | h_f = F.relu( 59 | self._h1[i](state[idxs, :self._n_input[0]]) 60 | ) 61 | h_f = F.relu(self._h2[i](h_f)) 62 | h.append(F.relu(self._h3[i](h_f))) 63 | cat_h3 = torch.cat(h) 64 | 65 | if self._features == 'relu': 66 | h_f = F.relu(self._h4(cat_h3.view(-1, 3136))) 67 | elif self._features == 'sigmoid': 68 | h_f = torch.sigmoid(self._h4(cat_h3.view(-1, 3136))) 69 | else: 70 | raise ValueError 71 | 72 | q = [self._h5[i](h_f) for i in range(self._n_games)] 73 | q = torch.stack(q, dim=1) 74 | 75 | if action is not None: 76 | action = action.long() 77 | q_acted = torch.squeeze( 78 | q.gather(2, action.repeat(1, self._n_games).unsqueeze(-1)), -1) 79 | 80 | q = q_acted 81 | 82 | if idx is not None: 83 | idx = torch.from_numpy(idx) 84 | if self._use_cuda: 85 | idx = idx.cuda() 86 | if q.dim() == 2: 87 | q_idx = q.gather(1, idx.unsqueeze(-1)) 88 | else: 89 | q_idx = q.gather(1, idx.view(-1, 1).repeat( 90 | 1, self._max_actions).unsqueeze(1)) 91 | 92 | q = torch.squeeze(q_idx, 1) 93 | 94 | return q 95 | 96 | def get_shared_weights(self): 97 | p1 = list() 98 | 99 | for p in self._h4.parameters(): 100 | p1.append(p.data.detach().cpu().numpy()) 101 | 102 | return p1 103 | 104 | def set_shared_weights(self, weights): 105 | w1 = weights 106 | 107 | for p, w in zip(self._h4.parameters(), w1): 108 | w_tensor = torch.from_numpy(w).type(p.data.dtype) 109 | if self._use_cuda: 110 | w_tensor = w_tensor.cuda() 111 | p.data = w_tensor 112 | 113 | def freeze_shared_weights(self): 114 | for p in self._h4.parameters(): 115 | p.requires_grad = False 116 | 117 | def unfreeze_shared_weights(self): 118 | for p in self._h4.parameters(): 119 | p.requires_grad = True 120 | 121 | 122 | class GymNetwork(nn.Module): 123 | def __init__(self, input_shape, _, n_actions_per_head, use_cuda, features, 124 | dropout, n_features=80): 125 | super().__init__() 126 | 127 | self._n_input = input_shape 128 | self._n_games = len(n_actions_per_head) 129 | self._max_actions = 
max(n_actions_per_head)[0] 130 | self._use_cuda = use_cuda 131 | self._n_shared = 4 132 | self._features = features 133 | 134 | self._h1 = nn.ModuleList( 135 | [nn.Linear(self._n_input[i][0], n_features) for i in range( 136 | len(input_shape))] 137 | ) 138 | self._h2 = nn.Linear(n_features, n_features) 139 | self._h3 = nn.Linear(n_features, n_features) 140 | self._h4 = nn.ModuleList( 141 | [nn.Linear(n_features, self._max_actions) for _ in range( 142 | self._n_games)] 143 | ) 144 | 145 | nn.init.xavier_uniform_(self._h2.weight, 146 | gain=nn.init.calculate_gain('relu')) 147 | nn.init.xavier_uniform_(self._h3.weight, 148 | gain=nn.init.calculate_gain('relu')) 149 | for i in range(self._n_games): 150 | nn.init.xavier_uniform_(self._h1[i].weight, 151 | gain=nn.init.calculate_gain('relu')) 152 | nn.init.xavier_uniform_(self._h4[i].weight, 153 | gain=nn.init.calculate_gain('linear')) 154 | 155 | def forward(self, state, action=None, idx=None): 156 | state = state.float() 157 | 158 | h1 = list() 159 | for i in np.unique(idx): 160 | idxs = np.argwhere(idx == i).ravel() 161 | h1.append(F.relu(self._h1[i](state[idxs, :self._n_input[i][0]]))) 162 | cat_h1 = torch.cat(h1) 163 | 164 | h_f = F.relu(self._h2(cat_h1)) 165 | 166 | if self._features == 'relu': 167 | h_f = F.relu(self._h3(h_f)) 168 | elif self._features == 'sigmoid': 169 | h_f = torch.sigmoid(self._h3(h_f)) 170 | else: 171 | raise ValueError 172 | 173 | q = [self._h4[i](h_f) for i in range(self._n_games)] 174 | q = torch.stack(q, dim=1) 175 | 176 | if action is not None: 177 | action = action.long() 178 | q_acted = torch.squeeze( 179 | q.gather(2, action.repeat(1, self._n_games).unsqueeze(-1)), -1) 180 | 181 | q = q_acted 182 | 183 | if idx is not None: 184 | idx = torch.from_numpy(idx) 185 | if self._use_cuda: 186 | idx = idx.cuda() 187 | if q.dim() == 2: 188 | q_idx = q.gather(1, idx.unsqueeze(-1)) 189 | else: 190 | q_idx = q.gather(1, idx.view(-1, 1).repeat( 191 | 1, self._max_actions).unsqueeze(1)) 192 | 193 | q = torch.squeeze(q_idx, 1) 194 | 195 | return q 196 | 197 | def get_shared_weights(self): 198 | p2 = list() 199 | p3 = list() 200 | 201 | for p in self._h2.parameters(): 202 | p2.append(p.data.detach().cpu().numpy()) 203 | 204 | for p in self._h3.parameters(): 205 | p3.append(p.data.detach().cpu().numpy()) 206 | 207 | return p2, p3 208 | 209 | def set_shared_weights(self, weights): 210 | w2, w3 = weights 211 | 212 | for p, w in zip(self._h2.parameters(), w2): 213 | w_tensor = torch.from_numpy(w).type(p.data.dtype) 214 | if self._use_cuda: 215 | w_tensor = w_tensor.cuda() 216 | p.data = w_tensor 217 | 218 | for p, w in zip(self._h3.parameters(), w3): 219 | w_tensor = torch.from_numpy(w).type(p.data.dtype) 220 | if self._use_cuda: 221 | w_tensor = w_tensor.cuda() 222 | p.data = w_tensor 223 | 224 | def freeze_shared_weights(self): 225 | for p in self._h2.parameters(): 226 | p.requires_grad = False 227 | for p in self._h3.parameters(): 228 | p.requires_grad = False 229 | 230 | def unfreeze_shared_weights(self): 231 | for p in self._h2.parameters(): 232 | p.requires_grad = True 233 | for p in self._h3.parameters(): 234 | p.requires_grad = True 235 | -------------------------------------------------------------------------------- /dqn/run_atari.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import pathlib 4 | import sys 5 | 6 | from joblib import delayed, Parallel 7 | import numpy as np 8 | import torch.optim as optim 9 | 10 | import pickle 11 | 12 | 
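# The parent directory is put on the path so that the modules shared by all
# experiments at the repository root (core.py, policy.py, replay_memory.py)
# can be imported below.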
sys.path.append('..') 13 | 14 | from mushroom_rl.approximators.parametric.torch_approximator import TorchApproximator 15 | from mushroom_rl.environments import * 16 | from mushroom_rl.utils.dataset import compute_metrics 17 | from mushroom_rl.utils.parameters import LinearParameter, Parameter 18 | 19 | from core import Core 20 | from dqn import DQN, DoubleDQN 21 | from policy import EpsGreedyMultiple 22 | from networks import AtariNetwork 23 | from losses import LossFunction 24 | from replay_memory import PrioritizedReplayMemory 25 | 26 | """ 27 | This script runs Atari experiments with DQN as presented in: 28 | "Human-Level Control Through Deep Reinforcement Learning". Mnih V. et al.. 2015. 29 | 30 | """ 31 | 32 | 33 | def print_epoch(epoch): 34 | print('################################################################') 35 | print('Epoch: ', epoch) 36 | print('----------------------------------------------------------------') 37 | 38 | 39 | def get_stats(dataset, idx, games): 40 | score = compute_metrics(dataset) 41 | print(games[idx] + ': min_reward: %f, max_reward: %f, mean_reward: %f,' 42 | ' games_completed: %d' % score) 43 | 44 | return score 45 | 46 | 47 | def experiment(args, idx): 48 | np.random.seed() 49 | 50 | args.games = [''.join(g) for g in args.games] 51 | 52 | # MDP 53 | mdp = list() 54 | for i, g in enumerate(args.games): 55 | mdp.append(Atari(g)) 56 | 57 | n_actions_per_head = [(m.info.action_space.n,) for m in mdp] 58 | 59 | max_obs_dim = 0 60 | max_act_n = 0 61 | for i in range(len(args.games)): 62 | n = mdp[i].info.observation_space.shape[0] 63 | m = mdp[i].info.action_space.n 64 | if n > max_obs_dim: 65 | max_obs_dim = n 66 | max_obs_idx = i 67 | if m > max_act_n: 68 | max_act_n = m 69 | max_act_idx = i 70 | gammas = [m.info.gamma for m in mdp] 71 | horizons = [m.info.horizon for m in mdp] 72 | mdp_info = MDPInfo(mdp[max_obs_idx].info.observation_space, 73 | mdp[max_act_idx].info.action_space, gammas, horizons) 74 | 75 | scores = list() 76 | for _ in range(len(args.games)): 77 | scores.append(list()) 78 | 79 | optimizer = dict() 80 | if args.optimizer == 'adam': 81 | optimizer['class'] = optim.Adam 82 | optimizer['params'] = dict(lr=args.learning_rate, 83 | eps=args.epsilon) 84 | elif args.optimizer == 'adadelta': 85 | optimizer['class'] = optim.Adadelta 86 | optimizer['params'] = dict(lr=args.learning_rate, 87 | eps=args.epsilon) 88 | elif args.optimizer == 'rmsprop': 89 | optimizer['class'] = optim.RMSprop 90 | optimizer['params'] = dict(lr=args.learning_rate, 91 | alpha=args.decay, 92 | eps=args.epsilon) 93 | elif args.optimizer == 'rmspropcentered': 94 | optimizer['class'] = optim.RMSprop 95 | optimizer['params'] = dict(lr=args.learning_rate, 96 | alpha=args.decay, 97 | eps=args.epsilon, 98 | centered=True) 99 | else: 100 | raise ValueError 101 | 102 | # DQN learning run 103 | 104 | # Settings 105 | if args.debug: 106 | initial_replay_size = args.batch_size 107 | max_replay_size = 500 108 | train_frequency = 5 109 | target_update_frequency = 10 110 | test_samples = 20 111 | evaluation_frequency = 50 112 | max_steps = 1000 113 | else: 114 | initial_replay_size = args.initial_replay_size 115 | max_replay_size = args.max_replay_size 116 | train_frequency = args.train_frequency 117 | target_update_frequency = args.target_update_frequency 118 | test_samples = args.test_samples 119 | evaluation_frequency = args.evaluation_frequency 120 | max_steps = args.max_steps 121 | 122 | # Policy 123 | epsilon = LinearParameter(value=args.initial_exploration_rate, 124 | 
threshold_value=args.final_exploration_rate, 125 | n=args.final_exploration_frame) 126 | epsilon_test = Parameter(value=args.test_exploration_rate) 127 | epsilon_random = Parameter(value=1) 128 | pi = EpsGreedyMultiple(parameter=epsilon, 129 | n_actions_per_head=n_actions_per_head) 130 | 131 | # Approximator 132 | n_games = len(args.games) 133 | loss = LossFunction(n_games, args.batch_size, 134 | args.evaluation_frequency) 135 | 136 | input_shape = (args.history_length, args.screen_height, 137 | args.screen_width) 138 | approximator_params = dict( 139 | network=AtariNetwork, 140 | input_shape=input_shape, 141 | output_shape=(max(n_actions_per_head)[0],), 142 | n_actions=max(n_actions_per_head)[0], 143 | n_actions_per_head=n_actions_per_head, 144 | n_games=len(args.games), 145 | optimizer=optimizer, 146 | loss=loss, 147 | use_cuda=args.use_cuda, 148 | features=args.features 149 | ) 150 | 151 | approximator = TorchApproximator 152 | 153 | if args.prioritized: 154 | replay_memory = [PrioritizedReplayMemory( 155 | initial_replay_size, max_replay_size, alpha=.6, 156 | beta=LinearParameter(.4, threshold_value=1, 157 | n=max_steps // train_frequency) 158 | ) for _ in range(n_games)] 159 | else: 160 | replay_memory = None 161 | 162 | # Agent 163 | algorithm_params = dict( 164 | batch_size=args.batch_size, 165 | n_games=len(args.games), 166 | initial_replay_size=initial_replay_size, 167 | max_replay_size=max_replay_size, 168 | target_update_frequency=target_update_frequency // train_frequency, 169 | replay_memory=replay_memory, 170 | n_actions_per_head=n_actions_per_head, 171 | clip_reward=True, 172 | history_length=args.history_length 173 | ) 174 | 175 | if args.algorithm == 'dqn': 176 | agent = DQN(approximator, pi, mdp_info, 177 | approximator_params=approximator_params, 178 | **algorithm_params) 179 | elif args.algorithm == 'ddqn': 180 | agent = DoubleDQN(approximator, pi, mdp_info, 181 | approximator_params=approximator_params, 182 | **algorithm_params) 183 | 184 | # Algorithm 185 | core = Core(agent, mdp) 186 | 187 | # RUN 188 | 189 | # Fill replay memory with random dataset 190 | print_epoch(0) 191 | pi.set_parameter(epsilon_random) 192 | core.learn(n_steps=initial_replay_size, 193 | n_steps_per_fit=initial_replay_size, quiet=args.quiet) 194 | 195 | if args.transfer: 196 | weights = pickle.load(open(args.transfer, 'rb')) 197 | agent.set_shared_weights(weights) 198 | 199 | if args.load: 200 | weights = np.load(args.load) 201 | agent.approximator.set_weights(weights) 202 | 203 | # Evaluate initial policy 204 | pi.set_parameter(epsilon_test) 205 | dataset = core.evaluate(n_steps=test_samples, render=args.render, 206 | quiet=args.quiet) 207 | for i in range(len(mdp)): 208 | d = dataset[i::len(mdp)] 209 | scores[i].append(get_stats(d, i, args.games)[2]) 210 | 211 | if args.unfreeze_epoch > 0: 212 | agent.freeze_shared_weights() 213 | 214 | best_score_sum = -np.inf 215 | best_weights = None 216 | 217 | np.save(folder_name + 'scores-exp-%d.npy' % idx, scores) 218 | np.save(folder_name + 'loss-exp-%d.npy' % idx, 219 | agent.approximator.model._loss.get_losses()) 220 | 221 | for n_epoch in range(1, max_steps // evaluation_frequency + 1): 222 | if n_epoch >= args.unfreeze_epoch > 0: 223 | agent.unfreeze_shared_weights() 224 | 225 | print_epoch(n_epoch) 226 | print('- Learning:') 227 | # learning step 228 | pi.set_parameter(None) 229 | core.learn(n_steps=evaluation_frequency, 230 | n_steps_per_fit=train_frequency, quiet=args.quiet) 231 | 232 | print('- Evaluation:') 233 | # evaluation step 234 | 
pi.set_parameter(epsilon_test) 235 | dataset = core.evaluate(n_steps=test_samples, 236 | render=args.render, quiet=args.quiet) 237 | 238 | current_score_sum = 0 239 | for i in range(len(mdp)): 240 | d = dataset[i::len(mdp)] 241 | current_score = get_stats(d, i, args.games)[2] 242 | scores[i].append(current_score) 243 | current_score_sum += current_score 244 | 245 | # Save shared weights if best score 246 | if args.save_shared and current_score_sum >= best_score_sum: 247 | best_score_sum = current_score_sum 248 | best_weights = agent.get_shared_weights() 249 | 250 | if args.save: 251 | np.save(folder_name + 'weights-exp-%d-%d.npy' % (idx, n_epoch), 252 | agent.approximator.get_weights()) 253 | 254 | np.save(folder_name + 'scores-exp-%d.npy' % idx, scores) 255 | np.save(folder_name + 'loss-exp-%d.npy' % idx, 256 | agent.approximator.model._loss.get_losses()) 257 | 258 | if args.save_shared: 259 | pickle.dump(best_weights, open(args.save_shared, 'wb')) 260 | 261 | return scores, agent.approximator.model._loss.get_losses() 262 | 263 | 264 | if __name__ == '__main__': 265 | # Argument parser 266 | parser = argparse.ArgumentParser() 267 | 268 | arg_game = parser.add_argument_group('Game') 269 | arg_game.add_argument("--games", 270 | type=list, 271 | nargs='+', 272 | default=['BreakoutNoFrameskip-v4'], 273 | help='Gym ID of the problem.') 274 | arg_game.add_argument("--screen-width", type=int, default=84, 275 | help='Width of the game screen.') 276 | arg_game.add_argument("--screen-height", type=int, default=84, 277 | help='Height of the game screen.') 278 | arg_game.add_argument("--n-exp", type=int) 279 | 280 | arg_mem = parser.add_argument_group('Replay Memory') 281 | arg_mem.add_argument("--initial-replay-size", type=int, default=50000, 282 | help='Initial size of the replay memory.') 283 | arg_mem.add_argument("--max-replay-size", type=int, default=500000, 284 | help='Max size of the replay memory.') 285 | arg_mem.add_argument("--prioritized", action='store_true', 286 | help='Whether to use prioritized memory or not.') 287 | 288 | arg_net = parser.add_argument_group('Deep Q-Network') 289 | arg_net.add_argument("--optimizer", 290 | choices=['adadelta', 291 | 'adam', 292 | 'rmsprop', 293 | 'rmspropcentered'], 294 | default='adam', 295 | help='Name of the optimizer to use to learn.') 296 | arg_net.add_argument("--learning-rate", type=float, default=.00025, 297 | help='Learning rate value of the optimizer. Only used' 298 | 'in rmspropcentered') 299 | arg_net.add_argument("--decay", type=float, default=.95, 300 | help='Discount factor for the history coming from the' 301 | 'gradient momentum in rmspropcentered') 302 | arg_net.add_argument("--epsilon", type=float, default=1e-8, 303 | help='Epsilon term used in rmspropcentered') 304 | 305 | arg_alg = parser.add_argument_group('Algorithm') 306 | arg_alg.add_argument("--algorithm", choices=['dqn', 'ddqn'], 307 | default='dqn', 308 | help='Name of the algorithm. 
dqn is for standard' 309 | 'DQN, ddqn is for Double DQN and adqn is for' 310 | 'Averaged DQN.') 311 | arg_alg.add_argument("--features", choices=['relu', 'sigmoid']) 312 | arg_alg.add_argument("--batch-size", type=int, default=32, 313 | help='Batch size for each fit of the network.') 314 | arg_alg.add_argument("--history-length", type=int, default=4, 315 | help='Number of frames composing a state.') 316 | arg_alg.add_argument("--target-update-frequency", type=int, default=10000, 317 | help='Number of collected samples before each update' 318 | 'of the target network.') 319 | arg_alg.add_argument("--evaluation-frequency", type=int, default=250000, 320 | help='Number of learning step before each evaluation.' 321 | 'This number represents an epoch.') 322 | arg_alg.add_argument("--train-frequency", type=int, default=4, 323 | help='Number of learning steps before each fit of the' 324 | 'neural network.') 325 | arg_alg.add_argument("--max-steps", type=int, default=50000000, 326 | help='Total number of learning steps.') 327 | arg_alg.add_argument("--final-exploration-frame", type=int, default=1000000, 328 | help='Number of steps until the exploration rate stops' 329 | 'decreasing.') 330 | arg_alg.add_argument("--initial-exploration-rate", type=float, default=1., 331 | help='Initial value of the exploration rate.') 332 | arg_alg.add_argument("--final-exploration-rate", type=float, default=.1, 333 | help='Final value of the exploration rate. When it' 334 | 'reaches this values, it stays constant.') 335 | arg_alg.add_argument("--test-exploration-rate", type=float, default=.05, 336 | help='Exploration rate used during evaluation.') 337 | arg_alg.add_argument("--test-samples", type=int, default=125000, 338 | help='Number of steps for each evaluation.') 339 | arg_alg.add_argument("--max-no-op-actions", type=int, default=30, 340 | help='Maximum number of no-op action performed at the' 341 | 'beginning of the episodes. 
The minimum number is' 342 | 'history_length.') 343 | arg_alg.add_argument("--transfer", type=str, default='', 344 | help='Path to the file of the weights of the common ' 345 | 'layers to be loaded') 346 | arg_alg.add_argument("--save-shared", type=str, default='', 347 | help='filename where to save the shared weights') 348 | arg_alg.add_argument("--unfreeze-epoch", type=int, default=0, 349 | help="Number of epoch where to unfreeze shared weights.") 350 | 351 | arg_utils = parser.add_argument_group('Utils') 352 | arg_utils.add_argument('--use-cuda', action='store_true', 353 | help='Flag specifying whether to use the GPU.') 354 | arg_utils.add_argument('--load', type=str, 355 | help='Path of the model to be loaded.') 356 | arg_utils.add_argument('--save', action='store_true', 357 | help='Flag specifying whether to save the model.') 358 | arg_utils.add_argument('--render', action='store_true', 359 | help='Flag specifying whether to render the game.') 360 | arg_utils.add_argument('--quiet', action='store_true', 361 | help='Flag specifying whether to hide the progress' 362 | 'bar.') 363 | arg_utils.add_argument('--debug', action='store_true', 364 | help='Flag specifying whether the script has to be' 365 | 'run in debug mode.') 366 | arg_utils.add_argument('--postfix', type=str, default='', 367 | help='Flag used to add a postfix to the folder name') 368 | 369 | args = parser.parse_args() 370 | 371 | folder_name = './logs/gym_' + datetime.datetime.now().strftime( 372 | '%Y-%m-%d_%H-%M-%S') + args.postfix + '/' 373 | pathlib.Path(folder_name).mkdir(parents=True) 374 | with open(folder_name + 'args.pkl', 'wb') as f: 375 | pickle.dump(args, f) 376 | 377 | out = Parallel(n_jobs=-1)(delayed(experiment)(args, i) 378 | for i in range(args.n_exp)) 379 | 380 | scores = np.array([o[0] for o in out]) 381 | loss = np.array([o[1] for o in out]) 382 | 383 | np.save(folder_name + 'scores.npy', scores) 384 | np.save(folder_name + 'loss.npy', loss) 385 | -------------------------------------------------------------------------------- /dqn/run_gym.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import pathlib 4 | import sys 5 | 6 | from joblib import delayed, Parallel 7 | import numpy as np 8 | import torch.optim as optim 9 | 10 | import pickle 11 | 12 | sys.path.append('..') 13 | 14 | from mushroom_rl.approximators.parametric.torch_approximator import TorchApproximator 15 | from mushroom_rl.core.environment import MDPInfo 16 | from mushroom_rl.environments import * 17 | from mushroom_rl.utils.dataset import compute_J 18 | from mushroom_rl.utils.parameters import LinearParameter, Parameter 19 | 20 | from core import Core 21 | from dqn import DQN, DoubleDQN 22 | from policy import EpsGreedyMultiple 23 | from networks import GymNetwork 24 | from losses import LossFunction 25 | from replay_memory import PrioritizedReplayMemory 26 | 27 | """ 28 | This script runs Atari experiments with DQN as presented in: 29 | "Human-Level Control Through Deep Reinforcement Learning". Mnih V. et al.. 2015. 
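Here the same agent is adapted to the multi-task setting on Gym-style control
problems (Acrobot, CartPole, Car on Hill, ...) instead of Atari.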
30 | 31 | """ 32 | 33 | 34 | def print_epoch(epoch): 35 | print('################################################################') 36 | print('Epoch: ', epoch) 37 | print('----------------------------------------------------------------') 38 | 39 | 40 | def get_stats(dataset, gamma, idx, games): 41 | J = np.mean(compute_J(dataset, gamma[idx])) 42 | print(games[idx] + ': J: %f' % J) 43 | 44 | return J 45 | 46 | 47 | def experiment(args, idx): 48 | np.random.seed() 49 | 50 | args.games = [''.join(g) for g in args.games] 51 | 52 | # MDP 53 | mdp = list() 54 | gamma_eval = list() 55 | for i, g in enumerate(args.games): 56 | if g == 'pendulum': 57 | mdp.append(CartPole(horizon=args.horizon[i], gamma=args.gamma[i])) 58 | elif g == 'caronhill': 59 | mdp.append(CarOnHill(horizon=args.horizon[i], gamma=args.gamma[i])) 60 | else: 61 | mdp.append(Gym(g, args.horizon[i], args.gamma[i])) 62 | 63 | gamma_eval.append(args.gamma[i]) 64 | 65 | n_input_per_mdp = [m.info.observation_space.shape for m in mdp] 66 | n_actions_per_head = [(m.info.action_space.n,) for m in mdp] 67 | 68 | max_obs_dim = 0 69 | max_act_n = 0 70 | for i in range(len(args.games)): 71 | n = mdp[i].info.observation_space.shape[0] 72 | m = mdp[i].info.action_space.n 73 | if n > max_obs_dim: 74 | max_obs_dim = n 75 | max_obs_idx = i 76 | if m > max_act_n: 77 | max_act_n = m 78 | max_act_idx = i 79 | gammas = [m.info.gamma for m in mdp] 80 | horizons = [m.info.horizon for m in mdp] 81 | mdp_info = MDPInfo(mdp[max_obs_idx].info.observation_space, 82 | mdp[max_act_idx].info.action_space, gammas, horizons) 83 | 84 | scores = list() 85 | for _ in range(len(args.games)): 86 | scores.append(list()) 87 | 88 | optimizer = dict() 89 | if args.optimizer == 'adam': 90 | optimizer['class'] = optim.Adam 91 | optimizer['params'] = dict(lr=args.learning_rate, 92 | eps=args.epsilon) 93 | elif args.optimizer == 'adadelta': 94 | optimizer['class'] = optim.Adadelta 95 | optimizer['params'] = dict(lr=args.learning_rate, 96 | eps=args.epsilon) 97 | elif args.optimizer == 'rmsprop': 98 | optimizer['class'] = optim.RMSprop 99 | optimizer['params'] = dict(lr=args.learning_rate, 100 | alpha=args.decay, 101 | eps=args.epsilon) 102 | elif args.optimizer == 'rmspropcentered': 103 | optimizer['class'] = optim.RMSprop 104 | optimizer['params'] = dict(lr=args.learning_rate, 105 | alpha=args.decay, 106 | eps=args.epsilon, 107 | centered=True) 108 | else: 109 | raise ValueError 110 | 111 | # DQN learning run 112 | 113 | # Settings 114 | if args.debug: 115 | initial_replay_size = args.batch_size 116 | max_replay_size = 500 117 | train_frequency = 5 118 | target_update_frequency = 10 119 | test_samples = 20 120 | evaluation_frequency = 50 121 | max_steps = 1000 122 | else: 123 | initial_replay_size = args.initial_replay_size 124 | max_replay_size = args.max_replay_size 125 | train_frequency = args.train_frequency 126 | target_update_frequency = args.target_update_frequency 127 | test_samples = args.test_samples 128 | evaluation_frequency = args.evaluation_frequency 129 | max_steps = args.max_steps 130 | 131 | # Policy 132 | epsilon = LinearParameter(value=args.initial_exploration_rate, 133 | threshold_value=args.final_exploration_rate, 134 | n=args.final_exploration_frame) 135 | epsilon_test = Parameter(value=args.test_exploration_rate) 136 | epsilon_random = Parameter(value=1) 137 | pi = EpsGreedyMultiple(parameter=epsilon, 138 | n_actions_per_head=n_actions_per_head) 139 | 140 | # Approximator 141 | input_shape = [m.info.observation_space.shape for m in mdp] 142 | 
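    # input_shape holds one observation shape per task (e.g. (6,) for
    # Acrobot-v1); GymNetwork creates a separate input layer for each entry
    # and shares its two hidden layers across all tasks.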
n_games = len(args.games) 143 | loss = LossFunction(n_games, args.batch_size, 144 | args.evaluation_frequency) 145 | 146 | approximator_params = dict( 147 | network=GymNetwork, 148 | input_shape=input_shape, 149 | output_shape=(max(n_actions_per_head)[0],), 150 | n_actions=max(n_actions_per_head)[0], 151 | n_actions_per_head=n_actions_per_head, 152 | optimizer=optimizer, 153 | loss=loss, 154 | use_cuda=args.use_cuda, 155 | features=args.features 156 | ) 157 | 158 | approximator = TorchApproximator 159 | 160 | if args.prioritized: 161 | replay_memory = [PrioritizedReplayMemory( 162 | initial_replay_size, max_replay_size, alpha=.6, 163 | beta=LinearParameter(.4, threshold_value=1, 164 | n=max_steps // train_frequency) 165 | ) for _ in range(n_games)] 166 | else: 167 | replay_memory = None 168 | 169 | # Agent 170 | algorithm_params = dict( 171 | batch_size=args.batch_size, 172 | n_games=len(args.games), 173 | initial_replay_size=initial_replay_size, 174 | max_replay_size=max_replay_size, 175 | target_update_frequency=target_update_frequency // train_frequency, 176 | replay_memory=replay_memory, 177 | n_input_per_mdp=n_input_per_mdp, 178 | n_actions_per_head=n_actions_per_head, 179 | clip_reward=False, 180 | history_length=args.history_length, 181 | dtype=np.float32 182 | ) 183 | 184 | if args.algorithm == 'dqn': 185 | agent = DQN(approximator, pi, mdp_info, 186 | approximator_params=approximator_params, 187 | **algorithm_params) 188 | elif args.algorithm == 'ddqn': 189 | agent = DoubleDQN(approximator, pi, mdp_info, 190 | approximator_params=approximator_params, 191 | **algorithm_params) 192 | 193 | # Algorithm 194 | core = Core(agent, mdp) 195 | 196 | # RUN 197 | 198 | # Fill replay memory with random dataset 199 | print_epoch(0) 200 | pi.set_parameter(epsilon_random) 201 | core.learn(n_steps=initial_replay_size, 202 | n_steps_per_fit=initial_replay_size, quiet=args.quiet) 203 | 204 | if args.transfer: 205 | weights = pickle.load(open(args.transfer, 'rb')) 206 | agent.set_shared_weights(weights) 207 | 208 | if args.load: 209 | weights = np.load(args.load) 210 | agent.approximator.set_weights(weights) 211 | 212 | # Evaluate initial policy 213 | pi.set_parameter(epsilon_test) 214 | dataset = core.evaluate(n_steps=test_samples, render=args.render, 215 | quiet=args.quiet) 216 | for i in range(len(mdp)): 217 | d = dataset[i::len(mdp)] 218 | scores[i].append(get_stats(d, gamma_eval, i, args.games)) 219 | 220 | if args.unfreeze_epoch > 0: 221 | agent.freeze_shared_weights() 222 | 223 | best_score_sum = -np.inf 224 | best_weights = None 225 | 226 | np.save(folder_name + 'scores-exp-%d.npy' % idx, scores) 227 | np.save(folder_name + 'loss-exp-%d.npy' % idx, 228 | agent.approximator.model._loss.get_losses()) 229 | 230 | for n_epoch in range(1, max_steps // evaluation_frequency + 1): 231 | if n_epoch >= args.unfreeze_epoch > 0: 232 | agent.unfreeze_shared_weights() 233 | 234 | print_epoch(n_epoch) 235 | print('- Learning:') 236 | # learning step 237 | pi.set_parameter(None) 238 | core.learn(n_steps=evaluation_frequency, 239 | n_steps_per_fit=train_frequency, quiet=args.quiet) 240 | 241 | print('- Evaluation:') 242 | # evaluation step 243 | pi.set_parameter(epsilon_test) 244 | dataset = core.evaluate(n_steps=test_samples, 245 | render=args.render, quiet=args.quiet) 246 | 247 | current_score_sum = 0 248 | for i in range(len(mdp)): 249 | d = dataset[i::len(mdp)] 250 | current_score = get_stats(d, gamma_eval, i, args.games) 251 | scores[i].append(current_score) 252 | current_score_sum += current_score 253 
| 254 | # Save shared weights if best score 255 | if args.save_shared and current_score_sum >= best_score_sum: 256 | best_score_sum = current_score_sum 257 | best_weights = agent.get_shared_weights() 258 | 259 | if args.save: 260 | np.save(folder_name + 'weights-exp-%d-%d.npy' % (idx, n_epoch), 261 | agent.approximator.get_weights()) 262 | 263 | np.save(folder_name + 'scores-exp-%d.npy' % idx, scores) 264 | np.save(folder_name + 'loss-exp-%d.npy' % idx, 265 | agent.approximator.model._loss.get_losses()) 266 | 267 | if args.save_shared: 268 | pickle.dump(best_weights, open(args.save_shared, 'wb')) 269 | 270 | return scores, agent.approximator.model._loss.get_losses() 271 | 272 | 273 | if __name__ == '__main__': 274 | # Argument parser 275 | parser = argparse.ArgumentParser() 276 | 277 | arg_game = parser.add_argument_group('Game') 278 | arg_game.add_argument("--games", 279 | type=list, 280 | nargs='+', 281 | default=['Acrobot-v1'], 282 | help='Gym ID of the problem.') 283 | arg_game.add_argument("--horizon", type=int, nargs='+') 284 | arg_game.add_argument("--gamma", type=float, nargs='+') 285 | arg_game.add_argument("--n-exp", type=int) 286 | 287 | arg_mem = parser.add_argument_group('Replay Memory') 288 | arg_mem.add_argument("--initial-replay-size", type=int, default=100, 289 | help='Initial size of the replay memory.') 290 | arg_mem.add_argument("--max-replay-size", type=int, default=5000, 291 | help='Max size of the replay memory.') 292 | arg_mem.add_argument("--prioritized", action='store_true', 293 | help='Whether to use prioritized memory or not.') 294 | 295 | arg_net = parser.add_argument_group('Deep Q-Network') 296 | arg_net.add_argument("--optimizer", 297 | choices=['adadelta', 298 | 'adam', 299 | 'rmsprop', 300 | 'rmspropcentered'], 301 | default='adam', 302 | help='Name of the optimizer to use to learn.') 303 | arg_net.add_argument("--learning-rate", type=float, default=.001, 304 | help='Learning rate value of the optimizer. Only used' 305 | 'in rmspropcentered') 306 | arg_net.add_argument("--decay", type=float, default=.95, 307 | help='Discount factor for the history coming from the' 308 | 'gradient momentum in rmspropcentered') 309 | arg_net.add_argument("--epsilon", type=float, default=1e-8, 310 | help='Epsilon term used in rmspropcentered') 311 | 312 | arg_alg = parser.add_argument_group('Algorithm') 313 | arg_alg.add_argument("--algorithm", choices=['dqn', 'ddqn'], 314 | default='dqn', 315 | help='Name of the algorithm. dqn is for standard' 316 | 'DQN, ddqn is for Double DQN and adqn is for' 317 | 'Averaged DQN.') 318 | arg_alg.add_argument("--features", choices=['relu', 'sigmoid']) 319 | arg_alg.add_argument("--batch-size", type=int, default=100, 320 | help='Batch size for each fit of the network.') 321 | arg_alg.add_argument("--history-length", type=int, default=1, 322 | help='Number of frames composing a state.') 323 | arg_alg.add_argument("--target-update-frequency", type=int, default=100, 324 | help='Number of collected samples before each update' 325 | 'of the target network.') 326 | arg_alg.add_argument("--evaluation-frequency", type=int, default=1000, 327 | help='Number of learning step before each evaluation.' 
328 | 'This number represents an epoch.') 329 | arg_alg.add_argument("--train-frequency", type=int, default=1, 330 | help='Number of learning steps before each fit of the' 331 | 'neural network.') 332 | arg_alg.add_argument("--max-steps", type=int, default=50000, 333 | help='Total number of learning steps.') 334 | arg_alg.add_argument("--final-exploration-frame", type=int, default=5000, 335 | help='Number of steps until the exploration rate stops' 336 | 'decreasing.') 337 | arg_alg.add_argument("--initial-exploration-rate", type=float, default=1., 338 | help='Initial value of the exploration rate.') 339 | arg_alg.add_argument("--final-exploration-rate", type=float, default=.01, 340 | help='Final value of the exploration rate. When it' 341 | 'reaches this values, it stays constant.') 342 | arg_alg.add_argument("--test-exploration-rate", type=float, default=0., 343 | help='Exploration rate used during evaluation.') 344 | arg_alg.add_argument("--test-samples", type=int, default=2000, 345 | help='Number of steps for each evaluation.') 346 | arg_alg.add_argument("--max-no-op-actions", type=int, default=0, 347 | help='Maximum number of no-op action performed at the' 348 | 'beginning of the episodes. The minimum number is' 349 | 'history_length.') 350 | arg_alg.add_argument("--transfer", type=str, default='', 351 | help='Path to the file of the weights of the common ' 352 | 'layers to be loaded') 353 | arg_alg.add_argument("--save-shared", type=str, default='', 354 | help='filename where to save the shared weights') 355 | arg_alg.add_argument("--unfreeze-epoch", type=int, default=0, 356 | help="Number of epoch where to unfreeze shared weights.") 357 | 358 | arg_utils = parser.add_argument_group('Utils') 359 | arg_utils.add_argument('--use-cuda', action='store_true', 360 | help='Flag specifying whether to use the GPU.') 361 | arg_utils.add_argument('--load', type=str, 362 | help='Path of the model to be loaded.') 363 | arg_utils.add_argument('--save', action='store_true', 364 | help='Flag specifying whether to save the model.') 365 | arg_utils.add_argument('--render', action='store_true', 366 | help='Flag specifying whether to render the game.') 367 | arg_utils.add_argument('--quiet', action='store_true', 368 | help='Flag specifying whether to hide the progress' 369 | 'bar.') 370 | arg_utils.add_argument('--debug', action='store_true', 371 | help='Flag specifying whether the script has to be' 372 | 'run in debug mode.') 373 | arg_utils.add_argument('--postfix', type=str, default='', 374 | help='Flag used to add a postfix to the folder name') 375 | 376 | args = parser.parse_args() 377 | 378 | folder_name = './logs/gym_' + datetime.datetime.now().strftime( 379 | '%Y-%m-%d_%H-%M-%S') + args.postfix + '/' 380 | pathlib.Path(folder_name).mkdir(parents=True) 381 | with open(folder_name + 'args.pkl', 'wb') as f: 382 | pickle.dump(args, f) 383 | 384 | out = Parallel(n_jobs=-1)(delayed(experiment)(args, i) 385 | for i in range(args.n_exp)) 386 | 387 | scores = np.array([o[0] for o in out]) 388 | loss = np.array([o[1] for o in out]) 389 | 390 | np.save(folder_name + 'scores.npy', scores) 391 | np.save(folder_name + 'loss.npy', loss) 392 | -------------------------------------------------------------------------------- /fqi/car_on_hill.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.integrate import odeint 3 | 4 | from mushroom_rl.environments import Environment, MDPInfo 5 | from mushroom_rl.utils import spaces 6 | 7 | 8 | class 
CarOnHill(Environment): 9 | """ 10 | The Car On Hill environment as presented in: 11 | "Tree-Based Batch Mode Reinforcement Learning". Ernst D. et al.. 2005. 12 | 13 | """ 14 | def __init__(self, m, g, a, horizon=100, gamma=.95): 15 | """ 16 | Constructor. 17 | 18 | """ 19 | # MDP parameters 20 | self.max_pos = 1. 21 | self.max_velocity = 3. 22 | high = np.array([self.max_pos, self.max_velocity]) 23 | self._g = g 24 | self._m = m 25 | self._dt = .1 26 | self._discrete_actions = [-a, a] 27 | 28 | # MDP properties 29 | observation_space = spaces.Box(low=-high, high=high) 30 | action_space = spaces.Discrete(2) 31 | mdp_info = MDPInfo(observation_space, action_space, gamma, horizon) 32 | 33 | super().__init__(mdp_info) 34 | 35 | def reset(self, state=None): 36 | if state is None: 37 | self._state = np.array([-0.5, 0]) 38 | else: 39 | self._state = state 40 | 41 | return self._state 42 | 43 | def step(self, action): 44 | action = self._discrete_actions[action[0]] 45 | sa = np.append(self._state, action) 46 | new_state = odeint(self._dpds, sa, [0, self._dt]) 47 | 48 | self._state = new_state[-1, :-1] 49 | 50 | if self._state[0] < -self.max_pos or \ 51 | np.abs(self._state[1]) > self.max_velocity: 52 | reward = -1 53 | absorbing = True 54 | elif self._state[0] > self.max_pos and \ 55 | np.abs(self._state[1]) <= self.max_velocity: 56 | reward = 1 57 | absorbing = True 58 | else: 59 | reward = 0 60 | absorbing = False 61 | 62 | return self._state, reward, absorbing, {} 63 | 64 | def _dpds(self, state_action, t): 65 | position = state_action[0] 66 | velocity = state_action[1] 67 | u = state_action[-1] 68 | 69 | if position < 0.: 70 | diff_hill = 2 * position + 1 71 | diff_2_hill = 2 72 | else: 73 | diff_hill = 1 / ((1 + 5 * position ** 2) ** 1.5) 74 | diff_2_hill = (-15 * position) / ((1 + 5 * position ** 2) ** 2.5) 75 | 76 | dp = velocity 77 | ds = (u - self._g * self._m * diff_hill - velocity ** 2 * self._m * 78 | diff_hill * diff_2_hill) / (self._m * (1 + diff_hill ** 2)) 79 | 80 | return dp, ds, 0. 
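# Minimal usage sketch: run_coh.py builds several variants of this MDP by
# varying the mass m and the action magnitude a (e.g. CarOnHill(1, 9.81, 4));
# the guard below only rolls a few random steps and is not used by the
# experiments.
if __name__ == '__main__':
    mdp = CarOnHill(m=1., g=9.81, a=4.)
    state = mdp.reset()
    for _ in range(5):
        # the action is an index into the discrete action set [-a, +a]
        action = np.array([np.random.randint(2)])
        state, reward, absorbing, _ = mdp.step(action)
        print(state, reward, absorbing)
        if absorbing:
            break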
81 | -------------------------------------------------------------------------------- /fqi/dataset_0.800_4.000.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/dataset_0.800_4.000.pkl -------------------------------------------------------------------------------- /fqi/dataset_0.850_4.000.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/dataset_0.850_4.000.pkl -------------------------------------------------------------------------------- /fqi/dataset_0.900_4.000.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/dataset_0.900_4.000.pkl -------------------------------------------------------------------------------- /fqi/dataset_0.950_4.000.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/dataset_0.950_4.000.pkl -------------------------------------------------------------------------------- /fqi/dataset_1.000_4.000.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/dataset_1.000_4.000.pkl -------------------------------------------------------------------------------- /fqi/dataset_1.000_4.125.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/dataset_1.000_4.125.pkl -------------------------------------------------------------------------------- /fqi/dataset_1.000_4.250.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/dataset_1.000_4.250.pkl -------------------------------------------------------------------------------- /fqi/dataset_1.000_4.375.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/dataset_1.000_4.375.pkl -------------------------------------------------------------------------------- /fqi/dataset_1.000_4.500.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/dataset_1.000_4.500.pkl -------------------------------------------------------------------------------- /fqi/dataset_1.050_4.500.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/dataset_1.050_4.500.pkl -------------------------------------------------------------------------------- /fqi/dataset_1.100_4.500.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/dataset_1.100_4.500.pkl -------------------------------------------------------------------------------- 
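Each dataset_<m>_<a>.pkl file stores the transition dataset collected on the
Car-on-Hill variant with mass m and action magnitude a, following the naming
built in run_coh.py ('%1.3f_%1.3f' % (m._m, m._discrete_actions[-1])). A
minimal loading sketch, assuming the standard MushroomRL transition-list
format that parse_dataset expects:

    import pickle

    with open('dataset_1.000_4.000.pkl', 'rb') as f:
        # list of (state, action, reward, next_state, absorbing, last) tuples
        transitions = pickle.load(f)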
/fqi/dataset_1.150_4.500.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/dataset_1.150_4.500.pkl -------------------------------------------------------------------------------- /fqi/dataset_1.200_4.375.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/dataset_1.200_4.375.pkl -------------------------------------------------------------------------------- /fqi/dataset_1.200_4.500.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/dataset_1.200_4.500.pkl -------------------------------------------------------------------------------- /fqi/dataset_1.200_4.625.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/dataset_1.200_4.625.pkl -------------------------------------------------------------------------------- /fqi/dataset_1.200_4.750.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/dataset_1.200_4.750.pkl -------------------------------------------------------------------------------- /fqi/fqi.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from mushroom.algorithms.value.batch_td import BatchTD 4 | 5 | 6 | 7 | class FQI(BatchTD): 8 | """ 9 | Fitted Q-Iteration algorithm. 10 | "Tree-Based Batch Mode Reinforcement Learning", Ernst D. et al.. 2005. 11 | 12 | """ 13 | def __init__(self, approximator, policy, mdp_info, n_iterations, 14 | n_actions_per_head, fit_params=None, 15 | approximator_params=None, quiet=False): 16 | """ 17 | Constructor. 18 | 19 | Args: 20 | n_iterations (int): number of iterations to perform for training; 21 | quiet (bool, False): whether to show the progress bar or not. 22 | 23 | """ 24 | self._n_iterations = n_iterations 25 | self._n_actions_per_head = n_actions_per_head 26 | self._n_games = len(self._n_actions_per_head) 27 | self._quiet = quiet 28 | 29 | super().__init__(mdp_info, policy, approximator, approximator_params, 30 | fit_params) 31 | 32 | self._target = None 33 | 34 | def _fit(self, state, action, reward, next_state, absorbing, idxs): 35 | """ 36 | Single fit iteration. 37 | 38 | Args: 39 | x (list): the dataset. 
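            In this multi-task variant the dataset is passed already parsed
            (state, action, reward, next_state, absorbing) together with idxs,
            the per-sample task indices used to select the network head.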
40 | 41 | """ 42 | if self._target is None: 43 | self._target = reward.copy() 44 | else: 45 | q = self.approximator.predict(next_state, idx=idxs) 46 | if np.any(absorbing): 47 | q *= 1 - absorbing.reshape(-1, 1) 48 | 49 | max_q = np.max(q, axis=1) 50 | self._target = reward + self.mdp_info.gamma * max_q 51 | 52 | self.approximator.fit(state, action, self._target, idx=idxs, 53 | **self._fit_params) 54 | -------------------------------------------------------------------------------- /fqi/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | class LossFunction(object): 6 | def __init__(self, n_games): 7 | self._n_games = n_games 8 | 9 | self._losses = list() 10 | self._counter = 0 11 | 12 | def get_losses(self): 13 | return self._losses 14 | 15 | def __call__(self, yhat, y): 16 | loss = F.mse_loss(yhat, y, reduce=True) 17 | 18 | return loss 19 | 20 | def _need_log(self): 21 | self._counter += 1 22 | if self._counter >= self._eval_frequency: 23 | self._counter = 0 24 | return True 25 | else: 26 | return False 27 | -------------------------------------------------------------------------------- /fqi/networks.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | import numpy as np 6 | 7 | 8 | class Network(nn.Module): 9 | def __init__(self, input_shape, output_shape, use_cuda, features, 10 | dropout, n_features=5): 11 | super().__init__() 12 | 13 | self._n_input = input_shape 14 | self._n_output = output_shape 15 | self._n_games = len(self._n_input) 16 | self._use_cuda = use_cuda 17 | self._n_shared = 2 18 | self._features = features 19 | 20 | self._h1 = nn.ModuleList( 21 | [nn.Linear(self._n_input[i][0], n_features) for i in range( 22 | len(input_shape))] 23 | ) 24 | self._h2 = nn.Linear(n_features, n_features) 25 | self._q = nn.ModuleList( 26 | [nn.Linear(n_features, self._n_output[i][0]) for i in range( 27 | self._n_games)] 28 | ) 29 | 30 | self.weights_init() 31 | 32 | def forward(self, state, action=None, idx=None): 33 | state = state.float() 34 | 35 | h1 = list() 36 | for i in np.unique(idx): 37 | idxs = np.argwhere(idx == i).ravel() 38 | h1.append(torch.sigmoid(self._h1[i](state[idxs, :self._n_input[i][0]]))) 39 | cat_h1 = torch.cat(h1) 40 | 41 | if self._features == 'relu': 42 | h_f = F.relu(self._h2(cat_h1)) 43 | elif self._features == 'sigmoid': 44 | h_f = torch.sigmoid(self._h2(cat_h1)) 45 | else: 46 | raise ValueError 47 | 48 | q = [torch.tanh(self._q[i](h_f)) for i in range(self._n_games)] 49 | q = torch.stack(q, dim=1) 50 | 51 | if action is not None: 52 | action = action.long() 53 | q_acted = torch.squeeze( 54 | q.gather(2, action.repeat(1, self._n_games).unsqueeze(-1)), -1) 55 | 56 | q = q_acted 57 | 58 | if idx is not None: 59 | idx = torch.from_numpy(idx) 60 | if self._use_cuda: 61 | idx = idx.cuda() 62 | if q.dim() == 2: 63 | q_idx = q.gather(1, idx.unsqueeze(-1)) 64 | else: 65 | q_idx = q.gather(1, idx.view(-1, 1).repeat( 66 | 1, self._n_output[0][0]).unsqueeze(1)) 67 | 68 | q = torch.squeeze(q_idx, 1) 69 | 70 | return q 71 | 72 | def get_shared_weights(self): 73 | p2 = list() 74 | 75 | for p in self._h2.parameters(): 76 | p2.append(p.data.detach().cpu().numpy()) 77 | 78 | return p2 79 | 80 | def weights_init(self): 81 | nn.init.xavier_uniform_(self._h2.weight, 82 | gain=nn.init.calculate_gain('relu')) 83 | for i in range(self._n_games): 84 | 
nn.init.xavier_uniform_(self._h1[i].weight, 85 | gain=nn.init.calculate_gain('relu')) 86 | nn.init.xavier_uniform_(self._q[i].weight, 87 | gain=nn.init.calculate_gain('linear')) 88 | 89 | def set_shared_weights(self, weights): 90 | w2 = weights 91 | 92 | for p, w in zip(self._h2.parameters(), w2): 93 | w_tensor = torch.from_numpy(w).type(p.data.dtype) 94 | if self._use_cuda: 95 | w_tensor = w_tensor.cuda() 96 | p.data = w_tensor 97 | 98 | def freeze_shared_weights(self): 99 | for p in self._h2.parameters(): 100 | p.requires_grad = False 101 | 102 | def unfreeze_shared_weights(self): 103 | for p in self._h2.parameters(): 104 | p.requires_grad = True 105 | -------------------------------------------------------------------------------- /fqi/run_coh.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import pickle 3 | import sys 4 | 5 | import numpy as np 6 | from joblib import Parallel, delayed 7 | import torch.optim as optim 8 | from tqdm import trange 9 | 10 | sys.path.append('..') 11 | 12 | from mushroom.approximators.parametric.torch_approximator import TorchApproximator 13 | from mushroom.utils.dataset import compute_J, parse_dataset 14 | from mushroom.utils.parameters import Parameter 15 | 16 | from car_on_hill import CarOnHill 17 | from core import Core 18 | from fqi import FQI 19 | from losses import LossFunction 20 | from networks import Network 21 | from policy import EpsGreedyMultiple 22 | from solver import solve_car_on_hill 23 | 24 | """ 25 | This script aims to replicate the experiments on the Car on Hill MDP as 26 | presented in: 27 | "Tree-Based Batch Mode Reinforcement Learning", Ernst D. et al.. 2005. 28 | 29 | """ 30 | 31 | 32 | def get_stats(dataset, gamma): 33 | J = np.mean(compute_J(dataset, gamma)) 34 | 35 | return J 36 | 37 | 38 | def experiment(mdp, test_states, test_actions, test_q, names): 39 | np.random.seed() 40 | 41 | n_games = len(mdp) 42 | input_shape = [(m.info.observation_space.shape[0],) for m in mdp] 43 | n_actions_per_head = [(m.info.action_space.n,) for m in mdp] 44 | 45 | test_states = np.array([test_states]).repeat(len(mdp), 0).reshape(-1, 2) 46 | test_actions = np.array([test_actions]).repeat(len(mdp), 0).reshape(-1, 1) 47 | test_idxs = np.ones(len(test_states), dtype=np.int) * np.arange(len(mdp)).repeat( 48 | len(test_states) // len(mdp), 0) 49 | 50 | # Policy 51 | epsilon = Parameter(value=1.) 
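    # epsilon=1 is only a placeholder: FQI is trained from the pre-collected
    # datasets, and before every evaluation the script switches the policy to
    # Parameter(0.), i.e. fully greedy.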
52 | pi = EpsGreedyMultiple(parameter=epsilon, 53 | n_actions_per_head=n_actions_per_head) 54 | 55 | # Approximator 56 | optimizer = {'class': optim.Adam, 'params': dict()} 57 | loss = LossFunction(n_games) 58 | 59 | approximator_params = dict( 60 | network=Network, 61 | input_shape=input_shape, 62 | output_shape=n_actions_per_head, 63 | optimizer=optimizer, 64 | loss=loss, 65 | features='sigmoid', 66 | n_features=30, 67 | use_cuda=True, 68 | quiet=False 69 | ) 70 | 71 | approximator = TorchApproximator 72 | 73 | dataset = list() 74 | len_datasets = list() 75 | for i in range(len(mdp)): 76 | d = pickle.load(open('dataset_%s.pkl' % names[i], 'rb')) 77 | len_datasets.append(len(d)) 78 | dataset += d 79 | 80 | # Agent 81 | algorithm_params = dict(n_iterations=1, 82 | n_actions_per_head=n_actions_per_head, 83 | fit_params=dict(patience=100, epsilon=1e-6)) 84 | agent = FQI(approximator, pi, mdp[0].info, 85 | approximator_params=approximator_params, **algorithm_params) 86 | 87 | qs = list() 88 | scores = list() 89 | 90 | idxs = list() 91 | for i, l in enumerate(len_datasets): 92 | idxs += (np.ones(l, dtype=np.int) * i).tolist() 93 | idxs = np.array(idxs) 94 | 95 | state, action, reward, next_state, absorbing, _ = parse_dataset(dataset) 96 | for _ in trange(50, dynamic_ncols=True, disable=False, leave=False): 97 | agent._fit(state, action, reward, next_state, absorbing, idxs) 98 | # Algorithm 99 | core = Core(agent, mdp) 100 | test_epsilon = Parameter(0.) 101 | pi.set_parameter(test_epsilon) 102 | dataset = core.evaluate(n_steps=100) 103 | 104 | qs.append(agent.approximator.predict(test_states, test_actions, 105 | idx=test_idxs)) 106 | scores.append(np.mean(compute_J(dataset, mdp[0].info.gamma))) 107 | 108 | qs_hat = np.array(qs) 109 | avi_diff = list() 110 | for i in range(len(qs_hat)): 111 | avi_diff.append(np.linalg.norm(qs_hat[i] - test_q, ord=1) / len(test_q)) 112 | 113 | print(avi_diff, scores) 114 | 115 | return avi_diff, scores 116 | 117 | 118 | if __name__ == '__main__': 119 | n_exp = 100 120 | 121 | use_mdp = np.array([0]) # , 4, 8, 13, 1, 2, 3, 5, 6, 7, 9, 10, 11, 12, 14, 15]) 122 | 123 | load_test_q = True 124 | 125 | # MDP 126 | all_mdps = [ 127 | CarOnHill(1, 9.81, 4), CarOnHill(1, 9.81, 4.125), CarOnHill(1, 9.81, 4.25), CarOnHill(1, 9.81, 4.375), 128 | CarOnHill(.8, 9.81, 4), CarOnHill(.85, 9.81, 4), CarOnHill(.9, 9.81, 4), CarOnHill(.95, 9.81, 4), 129 | CarOnHill(1, 9.81, 4.5), CarOnHill(1.05, 9.81, 4.5), CarOnHill(1.1, 9.81, 4.5), CarOnHill(1.15, 9.81, 4.5), 130 | CarOnHill(1.2, 9.81, 4.375), CarOnHill(1.2, 9.81, 4.5), CarOnHill(1.2, 9.81, 4.625), CarOnHill(1.2, 9.81, 4.75) 131 | ] 132 | 133 | mdp = list() 134 | for i in use_mdp: 135 | mdp.append(all_mdps[i]) 136 | 137 | names = ['%1.3f_%1.3f' % (m._m, m._discrete_actions[-1]) for m in mdp] 138 | 139 | test_states_0 = np.linspace(mdp[0].info.observation_space.low[0], 140 | mdp[0].info.observation_space.high[0], 10) 141 | test_states_1 = np.linspace(mdp[0].info.observation_space.low[1], 142 | mdp[0].info.observation_space.high[1], 10) 143 | test_states = list() 144 | for s0 in test_states_0: 145 | for s1 in test_states_1: 146 | test_states += [s0, s1] 147 | test_states = np.array([test_states]).repeat(2, 0).reshape(-1, 2) 148 | test_actions = np.array( 149 | [np.zeros(len(test_states) // 2), 150 | np.ones(len(test_states) // 2)]).reshape(-1, 1).astype(np.int) 151 | 152 | # Test Q 153 | test_q = list() 154 | if not load_test_q: 155 | for i, j in enumerate(use_mdp): 156 | current_test_q = solve_car_on_hill(all_mdps[j], test_states, 
157 | test_actions, 158 | all_mdps[j].info.gamma) 159 | np.save('test_q_%s.npy' % names[i], current_test_q) 160 | 161 | test_q += current_test_q 162 | else: 163 | for i in range(len(mdp)): 164 | test_q += np.load('test_q_%s.npy' % names[i]).tolist() 165 | 166 | test_q = np.array(test_q) 167 | 168 | folder_name = './logs/%s/' % ''.join(names) 169 | pathlib.Path(folder_name).mkdir(parents=True, exist_ok=True) 170 | 171 | out = Parallel(n_jobs=8)(delayed(experiment)( 172 | mdp, test_states, test_actions, test_q, names) for i in range(n_exp)) 173 | 174 | avi_diff = np.array([o[0] for o in out]) 175 | scores = np.array([o[1] for o in out]) 176 | 177 | np.save(folder_name + 'avi_diff.npy', avi_diff) 178 | np.save(folder_name + 'scores.npy', scores) 179 | -------------------------------------------------------------------------------- /fqi/solver.py: -------------------------------------------------------------------------------- 1 | def step(mdp, state, action): 2 | mdp.reset(state) 3 | 4 | return mdp.step(action) 5 | 6 | 7 | def bfs(mdp, frontier, k, max_k): 8 | if len(frontier) == 0 or k == max_k: 9 | return False, k 10 | 11 | new_frontier = list() 12 | for f in frontier: 13 | s, r, _, _ = step(mdp, f, [0]) 14 | if r == 1: 15 | return True, k 16 | elif r == 0: 17 | new_frontier.append(s) 18 | 19 | s, r, _, _ = step(mdp, f, [1]) 20 | if r == 1: 21 | return True, k 22 | elif r == 0: 23 | new_frontier.append(s) 24 | 25 | return bfs(mdp, new_frontier, k + 1, max_k) 26 | 27 | 28 | def solve_car_on_hill(mdp, states, actions, gamma, max_k=50): 29 | q = list() 30 | for s, a in zip(states, actions): 31 | mdp.reset(s) 32 | state, reward, _, _ = mdp.step(a) 33 | 34 | if reward == 1: 35 | k = 1 36 | success = True 37 | elif reward == -1: 38 | k = 1 39 | success = False 40 | else: 41 | success, k = bfs(mdp, [state], 2, max_k) 42 | 43 | if success: 44 | q.append(gamma ** (k - 1)) 45 | else: 46 | q.append(-gamma ** (k - 1)) 47 | 48 | return q 49 | -------------------------------------------------------------------------------- /fqi/test_q_0.800_4.000.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/test_q_0.800_4.000.npy -------------------------------------------------------------------------------- /fqi/test_q_0.850_4.000.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/test_q_0.850_4.000.npy -------------------------------------------------------------------------------- /fqi/test_q_0.900_4.000.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/test_q_0.900_4.000.npy -------------------------------------------------------------------------------- /fqi/test_q_0.950_4.000.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/test_q_0.950_4.000.npy -------------------------------------------------------------------------------- /fqi/test_q_1.000_4.000.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/test_q_1.000_4.000.npy 
-------------------------------------------------------------------------------- /fqi/test_q_1.000_4.125.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/test_q_1.000_4.125.npy -------------------------------------------------------------------------------- /fqi/test_q_1.000_4.250.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/test_q_1.000_4.250.npy -------------------------------------------------------------------------------- /fqi/test_q_1.000_4.375.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/test_q_1.000_4.375.npy -------------------------------------------------------------------------------- /fqi/test_q_1.000_4.500.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/test_q_1.000_4.500.npy -------------------------------------------------------------------------------- /fqi/test_q_1.050_4.500.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/test_q_1.050_4.500.npy -------------------------------------------------------------------------------- /fqi/test_q_1.100_4.500.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/test_q_1.100_4.500.npy -------------------------------------------------------------------------------- /fqi/test_q_1.150_4.500.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/test_q_1.150_4.500.npy -------------------------------------------------------------------------------- /fqi/test_q_1.200_4.375.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/test_q_1.200_4.375.npy -------------------------------------------------------------------------------- /fqi/test_q_1.200_4.500.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/test_q_1.200_4.500.npy -------------------------------------------------------------------------------- /fqi/test_q_1.200_4.625.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/test_q_1.200_4.625.npy -------------------------------------------------------------------------------- /fqi/test_q_1.200_4.750.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/fqi/test_q_1.200_4.750.npy -------------------------------------------------------------------------------- /policy.py: 
-------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | import numpy as np 4 | 5 | from mushroom_rl.policy import ParametricPolicy, TDPolicy 6 | from mushroom_rl.utils.parameters import Parameter 7 | 8 | 9 | class Multiple(TDPolicy): 10 | def __init__(self, parameter, n_actions_per_head): 11 | super().__init__() 12 | 13 | assert isinstance(parameter, Parameter) and\ 14 | isinstance(n_actions_per_head, list) or isinstance(n_actions_per_head, 15 | np.ndarray) 16 | self._n_actions_per_head = n_actions_per_head 17 | 18 | n_heads = len(n_actions_per_head) 19 | 20 | if isinstance(parameter, list): 21 | self._explorative_pars = deepcopy(parameter) 22 | else: 23 | self._explorative_pars = [deepcopy(parameter) for _ in range(n_heads)] 24 | self._pars = [None] * n_heads 25 | 26 | def set_parameter(self, parameter): 27 | assert isinstance(parameter, Parameter) or parameter is None 28 | 29 | if parameter is None: 30 | for i in range(len(self._pars)): 31 | self._pars[i] = self._explorative_pars[i] 32 | else: 33 | for i in range(len(self._pars)): 34 | self._pars[i] = parameter 35 | 36 | def update(self, state): 37 | idx = state[0] 38 | self._pars[idx].update(state) 39 | 40 | 41 | class EpsGreedyMultiple(Multiple): 42 | def __call__(self, *args): 43 | idx = args[0] 44 | state = np.array(args[1]) 45 | q = self._approximator.predict( 46 | np.expand_dims(state, axis=0), 47 | idx=idx).ravel()[:self._n_actions_per_head[idx][0]] 48 | max_a = np.argwhere(q == np.max(q)).ravel() 49 | 50 | p = self._epsilon.get_value(state) / self._n_actions_per_head[idx][0] 51 | 52 | if len(args) == 2: 53 | action = args[1] 54 | if action in max_a: 55 | return p + (1. - self._epsilon.get_value(state)) / len(max_a) 56 | else: 57 | return p 58 | else: 59 | probs = np.ones(self._n_actions_per_head[idx][0]) * p 60 | probs[max_a] += (1. 
- self._epsilon.get_value(state)) / len(max_a) 61 | 62 | return probs 63 | 64 | def draw_action(self, state): 65 | idx = state[0] 66 | state = np.array(state[1]) 67 | if not np.random.uniform() < self._pars[idx](state): 68 | q = self._approximator.predict( 69 | state, idx=np.array([idx]))[:self._n_actions_per_head[idx][0]] 70 | max_a = np.argwhere(q == np.max(q)).ravel() 71 | 72 | if len(max_a) > 1: 73 | max_a = np.array([np.random.choice( 74 | max_a[max_a < self._n_actions_per_head[idx][0]] 75 | )]) 76 | 77 | return max_a 78 | 79 | return np.array([np.random.choice(self._n_actions_per_head[idx][0])]) 80 | 81 | 82 | class OrnsteinUhlenbeckPolicy(ParametricPolicy): 83 | def __init__(self, mu, sigma, theta, dt, n_actions_per_head, 84 | max_action_value, x0=None): 85 | 86 | self._approximator = mu 87 | self._sigma = sigma 88 | self._theta = theta 89 | self._dt = dt 90 | self._max_action_value = max_action_value 91 | self._x0 = x0 92 | 93 | self._n_games = len(n_actions_per_head) 94 | 95 | self._n_actions_per_head = n_actions_per_head 96 | 97 | self.eval = None 98 | 99 | def __call__(self, state, action): 100 | raise NotImplementedError 101 | 102 | def draw_action(self, state): 103 | idx = state[0] 104 | state = state[1] 105 | mu = self._approximator.predict(state, idx=np.array([idx])) * self._max_action_value[idx] 106 | 107 | x = self._x_prev[idx] - self._theta * self._x_prev[idx] * self._dt + self._sigma *\ 108 | np.sqrt(self._dt) * np.random.normal(size=self._approximator.output_shape) 109 | self._x_prev[idx] = x 110 | 111 | if not self.eval: 112 | return mu[:self._n_actions_per_head[idx][0]] + x[:self._n_actions_per_head[idx][0]] 113 | else: 114 | return mu[:self._n_actions_per_head[idx][0]] 115 | 116 | def set_weights(self, weights): 117 | self._approximator.set_weights(weights) 118 | 119 | def get_weights(self): 120 | return self._approximator.get_weights() 121 | 122 | @property 123 | def weights_size(self): 124 | return self._approximator.weights_size 125 | 126 | def reset(self): 127 | self._x_prev = list() 128 | for i in range(self._n_games): 129 | self._x_prev.append(self._x0 if self._x0 is not None else np.zeros(self._approximator.output_shape)) 130 | -------------------------------------------------------------------------------- /replay_memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from mushroom_rl.utils.replay_memory import PrioritizedReplayMemory, ReplayMemory, SumTree 4 | 5 | 6 | class ReplayMemory(ReplayMemory): 7 | def add(self, dataset): 8 | for i in range(len(dataset)): 9 | self._states[self._idx] = dataset[i][0][1] 10 | self._actions[self._idx] = dataset[i][1] 11 | self._rewards[self._idx] = dataset[i][2] 12 | self._next_states[self._idx] = dataset[i][3][1] 13 | self._absorbing[self._idx] = dataset[i][4] 14 | self._last[self._idx] = dataset[i][5] 15 | 16 | self._idx += 1 17 | if self._idx == self._max_size: 18 | self._full = True 19 | self._idx = 0 20 | 21 | 22 | class PrioritizedReplayMemory(PrioritizedReplayMemory): 23 | def __init__(self, initial_size, max_size, alpha, beta, 24 | epsilon=.01): 25 | self._initial_size = initial_size 26 | self._max_size = max_size 27 | self._alpha = alpha 28 | self._beta = beta 29 | self._epsilon = epsilon 30 | 31 | self._tree = SumTree(max_size) 32 | 33 | def get(self, n_samples): 34 | states = [None for _ in range(n_samples)] 35 | actions = [None for _ in range(n_samples)] 36 | rewards = [None for _ in range(n_samples)] 37 | next_states = [None for _ in 
range(n_samples)] 38 | absorbing = [None for _ in range(n_samples)] 39 | last = [None for _ in range(n_samples)] 40 | 41 | idxs = np.zeros(n_samples, dtype=int) 42 | priorities = np.zeros(n_samples) 43 | 44 | total_p = self._tree.total_p 45 | segment = total_p / n_samples 46 | 47 | a = np.arange(n_samples) * segment 48 | b = np.arange(1, n_samples + 1) * segment 49 | samples = np.random.uniform(a, b) 50 | for i, s in enumerate(samples): 51 | idx, p, data = self._tree.get(s) 52 | 53 | idxs[i] = idx 54 | priorities[i] = p 55 | states[i], actions[i], rewards[i], next_states[i], absorbing[i],\ 56 | last[i] = data 57 | states[i] = np.array(states[i][1]) 58 | next_states[i] = np.array(next_states[i][1]) 59 | 60 | sampling_probabilities = priorities / self._tree.total_p 61 | is_weight = (self._tree.size * sampling_probabilities) ** -self._beta() 62 | is_weight /= is_weight.max() 63 | 64 | return np.array(states), np.array(actions), np.array(rewards),\ 65 | np.array(next_states), np.array(absorbing), np.array(last),\ 66 | idxs, is_weight 67 | -------------------------------------------------------------------------------- /results/ddpg/multi_pendulum/noreg-sigmoid/scores.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/ddpg/multi_pendulum/noreg-sigmoid/scores.npy -------------------------------------------------------------------------------- /results/ddpg/multi_pendulum/transfer/noreg/unfreeze0-noreg-sigmoid.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/ddpg/multi_pendulum/transfer/noreg/unfreeze0-noreg-sigmoid.npy -------------------------------------------------------------------------------- /results/ddpg/multi_pendulum/transfer/noreg/unfreeze101-noreg-sigmoid.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/ddpg/multi_pendulum/transfer/noreg/unfreeze101-noreg-sigmoid.npy -------------------------------------------------------------------------------- /results/ddpg/multi_pendulum/transfer/noreg/w.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/ddpg/multi_pendulum/transfer/noreg/w.pkl -------------------------------------------------------------------------------- /results/ddpg/multi_walker/noreg-sigmoid/scores.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/ddpg/multi_walker/noreg-sigmoid/scores.npy -------------------------------------------------------------------------------- /results/ddpg/multi_walker/transfer/noreg/unfreeze0-noreg-sigmoid.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/ddpg/multi_walker/transfer/noreg/unfreeze0-noreg-sigmoid.npy -------------------------------------------------------------------------------- /results/ddpg/multi_walker/transfer/noreg/unfreeze101-noreg-sigmoid.npy: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/ddpg/multi_walker/transfer/noreg/unfreeze101-noreg-sigmoid.npy -------------------------------------------------------------------------------- /results/ddpg/multi_walker/transfer/noreg/w.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/ddpg/multi_walker/transfer/noreg/w.pkl -------------------------------------------------------------------------------- /results/ddpg/scores-plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.stats as st 3 | from matplotlib import pyplot as plt 4 | 5 | 6 | def get_mean_and_confidence(data): 7 | mean = np.mean(data, axis=0) 8 | se = st.sem(data, axis=0) 9 | n = len(data) 10 | 11 | interval, _ = st.t.interval(0.95, n-1, scale=se) 12 | 13 | return mean, interval 14 | 15 | show_pendulum = False 16 | leg_idx = 0 if show_pendulum else -1 17 | 18 | if show_pendulum: 19 | alg = 'multi_pendulum' 20 | games = ['InvertedPendulumBulletEnv-v0', 'InvertedDoublePendulumBulletEnv-v0', 21 | 'InvertedPendulumSwingupBulletEnv-v0'] 22 | titles = ['Inverted-Pendulum', 'Inverted-Double-Pendulum', 'Inverted-Pendulum-Swingup'] 23 | else: 24 | alg = 'multi_walker' 25 | games = ['hop_stand', 'walk_walk', 'chee_run'] 26 | titles = ['Hopper', 'Walker', 'Half-Cheetah'] 27 | 28 | reg = ['noreg'] 29 | activation = ['sigmoid'] 30 | 31 | n_games = len(games) 32 | 33 | legend_items = list() 34 | 35 | fig, ax = plt.subplots(1, n_games) 36 | for i, t in enumerate(titles): 37 | ax[i].set_title(t, fontsize=22) 38 | ax[i].grid() 39 | 40 | for r in reg: 41 | for act in activation: 42 | name = r + '-' + act 43 | legend_items.append('single ' + name) 44 | for i, g in enumerate(games): 45 | path = 'single/' + name + '/' + g 46 | a = np.load(path + '.npy') 47 | a_mean, a_err = get_mean_and_confidence(a) 48 | ax[i].plot(a_mean[0], linewidth=3) 49 | ax[i].fill_between(np.arange(len(a_mean[0])), 50 | a_mean[0] - a_err[0], 51 | a_mean[0] + a_err[0], alpha=.5) 52 | 53 | if alg != '': 54 | for r in reg: 55 | for act in activation: 56 | name = r + '-' + act 57 | path = alg + '/' + name + '/' 58 | 59 | legend_items.append(name) 60 | a = np.load(path + 'scores.npy') 61 | a_mean, a_err = get_mean_and_confidence(a) 62 | for i, g in enumerate(games): 63 | ax[i].plot(a_mean[i], linewidth=3) 64 | ax[i].fill_between(np.arange(len(a_mean[i])), a_mean[i] - a_err[i], a_mean[i] + a_err[i], alpha=.5) 65 | ax[i].set_xlabel('#Epochs', fontsize=22) 66 | if i == 0: 67 | ax[i].set_ylabel('Performance', fontsize=22) 68 | for tick in ax[i].xaxis.get_major_ticks(): 69 | tick.label.set_fontsize(22) 70 | tick.label 71 | for tick in ax[i].yaxis.get_major_ticks(): 72 | tick.label.set_fontsize(22) 73 | ax[i].set_xticks([0, 50, 100]) 74 | 75 | ax[leg_idx].legend(['DDPG', 'MULTI'], loc='lower right', fontsize=22) 76 | plt.show() 77 | -------------------------------------------------------------------------------- /results/ddpg/single/noreg-sigmoid/InvertedDoublePendulumBulletEnv-v0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/ddpg/single/noreg-sigmoid/InvertedDoublePendulumBulletEnv-v0.npy 
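Every plotting script under results/ reuses the get_mean_and_confidence helper shown in scores-plot.py above: it averages the score arrays over independent runs (axis 0) and derives a 95% Student-t confidence band from the standard error of the mean. Note that st.t.interval is called with its default loc=0 and only the first (negative) bound is kept, so the returned "error" is the negative half-width; mean - err and mean + err therefore still bracket a symmetric band. A self-contained sketch on synthetic data follows; the array shape and numbers are made up for illustration, although the multi-task scores.npy files do appear to be laid out as (n_runs, n_games, n_epochs).

import numpy as np
import scipy.stats as st

rng = np.random.default_rng(0)
scores = rng.normal(loc=100., scale=10., size=(10, 3, 101))  # 10 runs, 3 games, 101 epochs

mean = scores.mean(axis=0)       # (3, 101) per-game learning curves
se = st.sem(scores, axis=0)      # standard error of the mean over runs
lower, _ = st.t.interval(0.95, len(scores) - 1, scale=se)

half_width = -lower              # positive 95% half-width, same shape as mean
print(mean.shape, half_width.shape)  # (3, 101) (3, 101)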
-------------------------------------------------------------------------------- /results/ddpg/single/noreg-sigmoid/InvertedPendulumBulletEnv-v0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/ddpg/single/noreg-sigmoid/InvertedPendulumBulletEnv-v0.npy -------------------------------------------------------------------------------- /results/ddpg/single/noreg-sigmoid/InvertedPendulumSwingupBulletEnv-v0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/ddpg/single/noreg-sigmoid/InvertedPendulumSwingupBulletEnv-v0.npy -------------------------------------------------------------------------------- /results/ddpg/single/noreg-sigmoid/chee_run.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/ddpg/single/noreg-sigmoid/chee_run.npy -------------------------------------------------------------------------------- /results/ddpg/single/noreg-sigmoid/hop_stand.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/ddpg/single/noreg-sigmoid/hop_stand.npy -------------------------------------------------------------------------------- /results/ddpg/single/noreg-sigmoid/walk_walk.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/ddpg/single/noreg-sigmoid/walk_walk.npy -------------------------------------------------------------------------------- /results/ddpg/transfer-plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.stats as st 3 | from matplotlib import pyplot as plt 4 | 5 | 6 | def get_mean_and_confidence(data): 7 | mean = np.mean(data, axis=0) 8 | se = st.sem(data, axis=0) 9 | n = len(data) 10 | 11 | interval, _ = st.t.interval(0.95, n-1, scale=se) 12 | 13 | return mean, interval 14 | 15 | show_pendulum = False 16 | leg_idx = 0 if show_pendulum else -1 17 | 18 | if show_pendulum: 19 | alg = 'multi_pendulum' 20 | game = 'InvertedDoublePendulumBulletEnv-v0' 21 | title = 'Inverted-Double-Pendulum' 22 | else: 23 | alg = 'multi_walker' 24 | game = 'hop_stand' 25 | title = 'Hopper' 26 | 27 | games = ['noreg'] 28 | game_ids = [0] 29 | reg = ['noreg'] 30 | activation = ['sigmoid'] 31 | n_games = len(games) 32 | unfreezes = [0, 101] 33 | 34 | legend_items = list() 35 | 36 | fig, ax = plt.subplots(n_games, 1) 37 | # for i, g in enumerate(games): 38 | # ax.set_title(g) 39 | # ax.grid() 40 | 41 | for act in activation: 42 | for r in reg: 43 | legend_items.append('No initialization') 44 | path = 'single/' + r + '-' + act + '/' + game 45 | a = np.load(path + '.npy') 46 | a_mean, a_err = get_mean_and_confidence(a) 47 | for i, idx in enumerate(game_ids): 48 | ax.plot(a_mean[idx], linewidth=3) 49 | ax.fill_between(np.arange(101), a_mean[idx] - a_err[idx], a_mean[idx] + a_err[idx], alpha=.5) 50 | 51 | for u in unfreezes: 52 | for i, g in zip(game_ids, games): 53 | if u == 101: 54 | legend_items.append('No unfreeze') 55 | else: 56 | legend_items.append('Unfreeze-' + 
str(u)) 57 | file_path = alg + '/transfer' + '/' + g + '/unfreeze' + str(u) + '-' + r + '-' + act + '.npy' 58 | 59 | a = np.load(file_path) 60 | a_mean, a_err = get_mean_and_confidence(a) 61 | ax.plot(a_mean[0], linewidth=3) 62 | ax.fill_between(np.arange(101), a_mean[0] - a_err[0], a_mean[0] + a_err[0], alpha=.5) 63 | 64 | plt.xlabel('#Epochs', fontsize=35) 65 | plt.ylabel('Performance', fontsize=35) 66 | plt.xticks([0,50,100], fontsize=35) 67 | plt.yticks(fontsize=35) 68 | 69 | plt.grid() 70 | 71 | plt.title(title, fontsize=35) 72 | 73 | plt.legend(legend_items, fontsize=35, loc='best') 74 | 75 | plt.show() 76 | -------------------------------------------------------------------------------- /results/dqn/dqn/noreg-sigmoid/scores.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/dqn/dqn/noreg-sigmoid/scores.npy -------------------------------------------------------------------------------- /results/dqn/multidqn/noreg-sigmoid/scores.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/dqn/multidqn/noreg-sigmoid/scores.npy -------------------------------------------------------------------------------- /results/dqn/scores_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.stats as st 3 | from matplotlib import pyplot as plt 4 | 5 | 6 | def get_mean_and_confidence(data): 7 | mean = np.mean(data, axis=0) 8 | se = st.sem(data, axis=0) 9 | n = len(data) 10 | 11 | interval, _ = st.t.interval(0.95, n-1, scale=se) 12 | 13 | return mean, interval 14 | 15 | folders = ['dqn', 'multidqn'] 16 | games = ['Cart-Pole', 'Acrobot', 'Mountain-Car', 'Car-On-Hill', 'Inverted-Pendulum'] 17 | reg = ['noreg'] 18 | activation = ['sigmoid'] 19 | n_games = len(games) 20 | n_settings = len(reg) * len(activation) 21 | 22 | # plt.suptitle('DQN VS MULTI') 23 | 24 | for i, g in enumerate(games): 25 | j = 1 26 | for act in activation: 27 | for r in reg: 28 | s = r + '-' + act 29 | plt.subplot(n_settings, n_games, i * n_settings + j) 30 | plt.title(g, fontsize=20) 31 | 32 | single = np.load('dqn/' + s + '/scores.npy')[:, i] 33 | single_mean, single_err = get_mean_and_confidence(single) 34 | 35 | multi = np.load('multidqn/' + s + '/scores.npy')[:, i] 36 | multi_mean, multi_err = get_mean_and_confidence(multi) 37 | 38 | plt.plot(single_mean, linewidth=3) 39 | plt.fill_between(np.arange(51), single_mean - single_err, single_mean + single_err, alpha=.5) 40 | 41 | plt.plot(multi_mean, linewidth=3) 42 | plt.fill_between(np.arange(51), multi_mean - multi_err, multi_mean + multi_err, alpha=.5) 43 | 44 | plt.xlabel('#Epochs', fontsize=20) 45 | 46 | plt.xticks([0, 25, 50], fontsize=20) 47 | plt.yticks(fontsize=20) 48 | 49 | if i == 0: 50 | plt.ylabel('Performance', fontsize=20) 51 | 52 | plt.grid() 53 | 54 | # plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False) 55 | 56 | j += 1 57 | 58 | plt.legend(['DQN', 'MULTI'], fontsize=20, loc='lower right') 59 | 60 | plt.show() 61 | 62 | -------------------------------------------------------------------------------- /results/dqn/transfer-plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.stats as st 3 | from matplotlib import pyplot as 
plt 4 | 5 | 6 | def get_mean_and_confidence(data): 7 | mean = np.mean(data, axis=0) 8 | se = st.sem(data, axis=0) 9 | n = len(data) 10 | 11 | interval, _ = st.t.interval(0.95, n-1, scale=se) 12 | 13 | return mean, interval 14 | 15 | games = ['acro-noreg'] 16 | game_ids = [1] 17 | reg = ['noreg'] 18 | activation = ['sigmoid'] 19 | n_games = len(games) 20 | unfreezes = [0, 10, 51] 21 | 22 | legend_items = list() 23 | 24 | fig, ax = plt.subplots(n_games, 1) 25 | # for i, g in enumerate(games): 26 | # ax.set_title(g) 27 | # ax.grid() 28 | 29 | for act in activation: 30 | for r in reg: 31 | legend_items.append('No initialization') 32 | path = 'dqn/' + r + '-' + act + '/' 33 | a = np.load(path + 'scores.npy') 34 | a_mean, a_err = get_mean_and_confidence(a) 35 | for i, idx in enumerate(game_ids): 36 | ax.plot(a_mean[idx], linewidth=3) 37 | ax.fill_between(np.arange(51), a_mean[idx] - a_err[idx], a_mean[idx] + a_err[idx], alpha=.5) 38 | 39 | for u in unfreezes: 40 | for i, g in zip(game_ids, games): 41 | if u == 51: 42 | legend_items.append('No unfreeze') 43 | else: 44 | legend_items.append('Unfreeze-' + str(u)) 45 | file_path = 'transfer' + '/' + g + '/unfreeze' + str(u) + '-' + r + '-' + act + '.npy' 46 | 47 | a = np.load(file_path) 48 | a_mean, a_err = get_mean_and_confidence(a) 49 | ax.plot(a_mean[0], linewidth=3) 50 | ax.fill_between(np.arange(51), a_mean[0] - a_err[0], a_mean[0] + a_err[0], alpha=.5) 51 | 52 | plt.xlabel('#Epochs', fontsize=35) 53 | plt.ylabel('Performance', fontsize=35) 54 | plt.xticks([0, 25, 50], fontsize=35) 55 | plt.yticks(fontsize=35) 56 | plt.title('Acrobot', fontsize=35) 57 | 58 | plt.grid() 59 | 60 | plt.legend(legend_items, fontsize=25, loc='lower right') 61 | 62 | plt.show() 63 | 64 | -------------------------------------------------------------------------------- /results/dqn/transfer/acro-noreg/noreg-cart_mc_coh_pend.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/dqn/transfer/acro-noreg/noreg-cart_mc_coh_pend.pkl -------------------------------------------------------------------------------- /results/dqn/transfer/acro-noreg/unfreeze0-noreg-sigmoid.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/dqn/transfer/acro-noreg/unfreeze0-noreg-sigmoid.npy -------------------------------------------------------------------------------- /results/dqn/transfer/acro-noreg/unfreeze10-noreg-sigmoid.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/dqn/transfer/acro-noreg/unfreeze10-noreg-sigmoid.npy -------------------------------------------------------------------------------- /results/dqn/transfer/acro-noreg/unfreeze51-noreg-sigmoid.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/dqn/transfer/acro-noreg/unfreeze51-noreg-sigmoid.npy -------------------------------------------------------------------------------- /results/fqi/0.800_4.000/avi_diff.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/fqi/0.800_4.000/avi_diff.npy -------------------------------------------------------------------------------- /results/fqi/0.800_4.000/scores.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/fqi/0.800_4.000/scores.npy -------------------------------------------------------------------------------- /results/fqi/1.000_4.000/avi_diff.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/fqi/1.000_4.000/avi_diff.npy -------------------------------------------------------------------------------- /results/fqi/1.000_4.000/scores.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/fqi/1.000_4.000/scores.npy -------------------------------------------------------------------------------- /results/fqi/1.000_4.0000.800_4.000/avi_diff.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/fqi/1.000_4.0000.800_4.000/avi_diff.npy -------------------------------------------------------------------------------- /results/fqi/1.000_4.0000.800_4.000/scores.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/fqi/1.000_4.0000.800_4.000/scores.npy -------------------------------------------------------------------------------- /results/fqi/1.000_4.0000.800_4.0001.000_4.5001.200_4.500/avi_diff.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/fqi/1.000_4.0000.800_4.0001.000_4.5001.200_4.500/avi_diff.npy -------------------------------------------------------------------------------- /results/fqi/1.000_4.0000.800_4.0001.000_4.5001.200_4.500/scores.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/fqi/1.000_4.0000.800_4.0001.000_4.5001.200_4.500/scores.npy -------------------------------------------------------------------------------- /results/fqi/1.000_4.0000.800_4.0001.000_4.5001.200_4.5001.000_4.1251.000_4.2501.000_4.3750.850_4.000/avi_diff.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/fqi/1.000_4.0000.800_4.0001.000_4.5001.200_4.5001.000_4.1251.000_4.2501.000_4.3750.850_4.000/avi_diff.npy -------------------------------------------------------------------------------- /results/fqi/1.000_4.0000.800_4.0001.000_4.5001.200_4.5001.000_4.1251.000_4.2501.000_4.3750.850_4.000/scores.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/fqi/1.000_4.0000.800_4.0001.000_4.5001.200_4.5001.000_4.1251.000_4.2501.000_4.3750.850_4.000/scores.npy -------------------------------------------------------------------------------- /results/fqi/1.000_4.500/avi_diff.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/fqi/1.000_4.500/avi_diff.npy -------------------------------------------------------------------------------- /results/fqi/1.000_4.500/scores.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/fqi/1.000_4.500/scores.npy -------------------------------------------------------------------------------- /results/fqi/1.200_4.500/avi_diff.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/fqi/1.200_4.500/avi_diff.npy -------------------------------------------------------------------------------- /results/fqi/1.200_4.500/scores.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carloderamo/shared/ffb8f45bb4d17c46882a95d09a012173a1cb1649/results/fqi/1.200_4.500/scores.npy -------------------------------------------------------------------------------- /results/fqi/avi_scores_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.stats as st 3 | from matplotlib import pyplot as plt 4 | 5 | 6 | def get_mean_and_confidence(data): 7 | mean = np.mean(data, axis=0) 8 | se = st.sem(data, axis=0) 9 | n = len(data) 10 | 11 | interval, _ = st.t.interval(0.95, n - 1, scale=se) 12 | 13 | return mean, interval 14 | 15 | games = ['1.000_4.000', '0.800_4.000', '1.000_4.500', '1.200_4.500'] 16 | 17 | plt.subplot(1, 2, 1) 18 | a = list() 19 | for g in games: 20 | a.append(np.load(g + '/avi_diff.npy')) 21 | a = np.array(a) 22 | 23 | fs = 25 24 | 25 | a_mean, a_err = get_mean_and_confidence(a.mean(0)) 26 | plt.ylabel(r'$\Vert Q^* - Q^{\pi_K}\Vert$', fontsize=fs) 27 | plt.xlabel('# Iterations', fontsize=fs) 28 | plt.xticks([0, 25, 50], fontsize=fs) 29 | plt.yticks(fontsize=fs) 30 | plt.plot(a_mean, linewidth=3) 31 | plt.fill_between(np.arange(a_mean.shape[-1]), a_mean - a_err, a_mean + a_err, alpha=.5) 32 | 33 | a = np.load(''.join(games) + '/avi_diff.npy') 34 | 35 | a_mean, a_err = get_mean_and_confidence(a) 36 | plt.plot(a_mean, linewidth=3) 37 | plt.fill_between(np.arange(a_mean.shape[-1]), a_mean - a_err, a_mean + a_err, alpha=.5) 38 | plt.grid() 39 | plt.legend(['FQI', 'MULTI'], fontsize=fs) 40 | 41 | plt.subplot(1, 2, 2) 42 | a = list() 43 | for g in games: 44 | a.append(np.load(g + '/scores.npy')) 45 | a = np.array(a) 46 | 47 | a_mean, a_err = get_mean_and_confidence(a.mean(0)) 48 | plt.ylabel('Performance', fontsize=fs) 49 | plt.xlabel('# Iterations', fontsize=fs) 50 | plt.xticks([0, 25, 50], fontsize=fs) 51 | plt.yticks(fontsize=fs) 52 | plt.plot(a_mean, linewidth=3) 53 | plt.fill_between(np.arange(a_mean.shape[-1]), a_mean - a_err, a_mean + a_err, alpha=.5) 54 | 55 | a = np.load(''.join(games) + '/scores.npy') 56 | 57 | a_mean, a_err = get_mean_and_confidence(a) 58 | 
plt.fill_between(np.arange(a_mean.shape[-1]), a_mean - a_err, a_mean + a_err, alpha=.5) 59 | plt.plot(a_mean, linewidth=3) 60 | plt.grid() 61 | 62 | plt.show() 63 | 64 | -------------------------------------------------------------------------------- /results/fqi/multi_avi_scores_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.stats as st 3 | from matplotlib import pyplot as plt 4 | from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes 5 | from mpl_toolkits.axes_grid1.inset_locator import mark_inset 6 | 7 | 8 | def get_mean_and_confidence(data): 9 | mean = np.mean(data, axis=0) 10 | se = st.sem(data, axis=0) 11 | n = len(data) 12 | 13 | interval, _ = st.t.interval(0.95, n - 1, scale=se) 14 | 15 | return mean, interval 16 | 17 | games = ['1.000_4.000', '0.800_4.000', '1.000_4.500', '1.200_4.500', '1.000_4.125', '1.000_4.250', '1.000_4.375', '0.850_4.000'] 18 | n_tasks = [1, 2, 4, 8] 19 | style = ['-', '-.', '--', ':'] 20 | 21 | fig, ax = plt.subplots() 22 | for j, i in enumerate(n_tasks): 23 | a = np.load(''.join(games[:i]) + '/avi_diff.npy') 24 | 25 | a_mean, a_err = get_mean_and_confidence(a) 26 | ax.plot(a_mean, linewidth=3, linestyle=style[j]) 27 | ax.fill_between(np.arange(a_mean.shape[-1]), a_mean - a_err, a_mean + a_err, alpha=.5) 28 | 29 | fs = 25 30 | 31 | plt.xticks([0, 25, 50], fontsize=fs) 32 | plt.yticks(fontsize=fs) 33 | plt.ylabel(r'$\Vert Q^* - Q^{\pi_K}\Vert$', fontsize=fs) 34 | plt.xlabel('# Iterations', fontsize=fs) 35 | plt.grid() 36 | plt.legend(n_tasks, fontsize=fs) 37 | 38 | axins = zoomed_inset_axes(ax, 2, loc=9) # zoom-factor: 2.5, location: upper-left 39 | mark_inset(ax, axins, loc1=4, loc2=3, fc="none", ec="0.5") 40 | for j, i in enumerate(n_tasks): 41 | a = np.load(''.join(games[:i]) + '/avi_diff.npy') 42 | 43 | a_mean, a_err = get_mean_and_confidence(a) 44 | axins.plot(a_mean, linewidth=3, linestyle=style[j]) 45 | axins.fill_between(np.arange(a_mean.shape[-1]), a_mean - a_err, a_mean + a_err, alpha=.5) 46 | x1, x2, y1, y2 = 40, 49, .155, .225 # specify the limits 47 | axins.set_xlim(x1, x2) # apply the x-limits 48 | axins.set_ylim(y1, y2) # apply the y-limits 49 | axins.grid() 50 | axins.set_xticks([]) 51 | axins.set_yticks([]) 52 | 53 | plt.show() 54 | 55 | --------------------------------------------------------------------------------
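The concatenated directory names under results/fqi/ (e.g. 1.000_4.0000.800_4.000) are the per-task identifiers joined together: run_coh.py writes its logs to './logs/%s/' % ''.join(names), and multi_avi_scores_plot.py above rebuilds the same names with ''.join(games[:i]). A tiny sketch of that mapping, using only the task list already defined in the script:

games = ['1.000_4.000', '0.800_4.000', '1.000_4.500', '1.200_4.500',
         '1.000_4.125', '1.000_4.250', '1.000_4.375', '0.850_4.000']

# The n-task experiment reads its curves from the folder named by the first
# n identifiers concatenated together.
for n in (1, 2, 4, 8):
    print(n, '->', ''.join(games[:n]) + '/avi_diff.npy')

# 1 -> 1.000_4.000/avi_diff.npy
# 2 -> 1.000_4.0000.800_4.000/avi_diff.npy
# 4 -> 1.000_4.0000.800_4.0001.000_4.5001.200_4.500/avi_diff.npy
# 8 -> 1.000_4.0000.800_4.0001.000_4.5001.200_4.5001.000_4.1251.000_4.2501.000_4.3750.850_4.000/avi_diff.npy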