├── source ├── agents │ ├── __init__.py │ ├── ql.py │ ├── sfql.py │ ├── buffer.py │ ├── sfdqn.py │ ├── dqn.py │ └── agent.py ├── features │ ├── __init__.py │ ├── tabular.py │ ├── deep.py │ └── successor.py ├── tasks │ ├── __init__.py │ ├── task.py │ ├── reacher.py │ └── gridworld.py ├── utils │ ├── __init__.py │ ├── config.py │ └── stats.py ├── figures │ ├── sfql_return.png │ └── sfdqn_return.png ├── configs │ ├── reacher.cfg │ └── gridworld.cfg ├── main_sfql.py └── main_sfdqn.py ├── .settings └── org.eclipse.core.resources.prefs ├── LICENSE ├── README.md └── .gitignore /source/agents/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /source/features/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /source/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /source/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /source/figures/sfql_return.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mike-gimelfarb/deep-successor-features-for-transfer/HEAD/source/figures/sfql_return.png -------------------------------------------------------------------------------- /source/figures/sfdqn_return.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mike-gimelfarb/deep-successor-features-for-transfer/HEAD/source/figures/sfdqn_return.png -------------------------------------------------------------------------------- /source/utils/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from ast import literal_eval 3 | from collections import defaultdict 4 | import configparser 5 | import os 6 | from pathlib import Path 7 | 8 | 9 | def parse_config_file(name): 10 | raw_path = Path(__file__).parent.parent 11 | config_path = os.path.join(raw_path, 'configs', name) 12 | config = configparser.RawConfigParser() 13 | config.optionxform = str 14 | config.read(config_path) 15 | section_dict = defaultdict() 16 | for section in config.sections(): 17 | section_dict[section] = {k: literal_eval(v) for k, v in config.items(section)} 18 | return section_dict 19 | -------------------------------------------------------------------------------- /source/utils/stats.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import numpy as np 3 | 4 | 5 | class OnlineMeanVariance: 6 | 7 | def __init__(self): 8 | self.count = 0 9 | self.mean = 0. 10 | self.M2 = 0. 11 | 12 | def update(self, x): 13 | x = np.array(x) 14 | self.count += 1 15 | delta = x - self.mean 16 | self.mean = self.mean + delta / float(self.count) 17 | delta2 = x - self.mean 18 | self.M2 = self.M2 + delta * delta2 19 | 20 | def calculate_variance(self): 21 | return self.M2 / (self.count - 1.) 
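    # Note (added comment): update() implements Welford's online algorithm: `mean` holds the
    # running sample mean and `M2` the running sum of squared deviations, so
    # calculate_variance() returns the unbiased sample variance M2 / (count - 1).
    # main_sfql.py feeds one reward-history array per trial into update() and uses
    # calculate_standard_error() to draw the shaded error bands around the learning curves.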
22 | 23 | def calculate_standard_error(self): 24 | return np.sqrt(self.calculate_variance() / float(self.count)) 25 | 26 | -------------------------------------------------------------------------------- /.settings/org.eclipse.core.resources.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | encoding//source/agents/agent.py=UTF-8 3 | encoding//source/agents/buffer.py=UTF-8 4 | encoding//source/agents/dqn.py=UTF-8 5 | encoding//source/agents/ql.py=UTF-8 6 | encoding//source/agents/sfdqn.py=UTF-8 7 | encoding//source/agents/sfql.py=UTF-8 8 | encoding//source/features/deep.py=UTF-8 9 | encoding//source/features/successor.py=UTF-8 10 | encoding//source/features/tabular.py=UTF-8 11 | encoding//source/main_sfdqn.py=UTF-8 12 | encoding//source/main_sfql.py=UTF-8 13 | encoding//source/tasks/gridworld.py=UTF-8 14 | encoding//source/tasks/reacher.py=UTF-8 15 | encoding//source/tasks/task.py=UTF-8 16 | encoding//source/utils/config.py=UTF-8 17 | encoding//source/utils/stats.py=UTF-8 18 | -------------------------------------------------------------------------------- /source/configs/reacher.cfg: -------------------------------------------------------------------------------- 1 | [GENERAL] 2 | n_samples=100000 3 | 4 | [TASK] 5 | train_targets=[(0.14, 0.0), (-0.14, 0.0), (0.0, 0.14), (0.0, -0.14)] 6 | test_targets=[(0.22, 0.0), (-0.22, 0.0), (0.0, 0.22), (0.0, -0.22), (0.1, 0.1), (0.1, -0.1), (-0.1, 0.1), (-0.1, -0.1)] 7 | 8 | [AGENT] 9 | gamma=0.9 10 | epsilon=0.1 11 | test_epsilon=0.03 12 | T=500 13 | print_ev=1000 14 | save_ev=200 15 | n_test_ev=1000 16 | encoding="task" 17 | 18 | [DQN] 19 | target_update_ev=1000 20 | keras_params={ 21 | "n_neurons" : [256, 256], 22 | "activations" : ["relu", "relu"], 23 | "learning_rate" : 0.001} 24 | buffer_params={ 25 | "n_samples" : 1000000, 26 | "n_batch" : 32} 27 | 28 | [SFDQN] 29 | learning_rate_w=0.5 30 | use_true_reward=True 31 | use_gpi=True 32 | target_update_ev=1000 33 | keras_params={ 34 | "n_neurons" : [256, 256], 35 | "activations" : ["relu", "relu"], 36 | "learning_rate" : 0.001} 37 | buffer_params={ 38 | "n_samples" : 1000000, 39 | "n_batch" : 32} -------------------------------------------------------------------------------- /source/agents/ql.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from collections import defaultdict 3 | import numpy as np 4 | 5 | from agents.agent import Agent 6 | 7 | 8 | class QL(Agent): 9 | 10 | def __init__(self, learning_rate, *args, **kwargs): 11 | """ 12 | Creates a new tabular Q-learning agent. 
13 | 14 | Parameters 15 | ---------- 16 | learning_rate : float 17 | the learning rate to use in order to update Q-values 18 | """ 19 | super(QL, self).__init__(*args, **kwargs) 20 | self.alpha = learning_rate 21 | 22 | def get_Q_values(self, s, s_enc): 23 | return self.Q[s] 24 | 25 | def train_agent(self, s, s_enc, a, r, s1, s1_enc, gamma): 26 | target = r + gamma * np.max(self.Q[s1]) 27 | error = target - self.Q[s][a] 28 | self.Q[s][a] += self.alpha * error 29 | 30 | def set_active_training_task(self, index): 31 | super(QL, self).set_active_training_task(index) 32 | self.Q = defaultdict(lambda: np.random.uniform(low=-0.01, high=0.01, size=(self.n_actions,))) 33 | 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Successor Features Framework: Copyright (c) 2021 Mike Gimelfarb 4 | Wrappers to the PyBullet-Gym Package: Copyright (c) 2018 Benjamin Ellenberger 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
23 | -------------------------------------------------------------------------------- /source/configs/gridworld.cfg: -------------------------------------------------------------------------------- 1 | [GENERAL] 2 | n_samples=20000 3 | n_tasks=20 4 | n_trials=20 5 | 6 | [TASK] 7 | maze=[ 8 | ['1', ' ', ' ', ' ', ' ', '2', 'X', ' ', ' ', ' ', ' ', ' ', 'G'], 9 | [' ', ' ', ' ', ' ', ' ', ' ', 'X', ' ', ' ', ' ', ' ', ' ', ' '], 10 | [' ', ' ', ' ', ' ', ' ', ' ', '1', ' ', ' ', ' ', ' ', ' ', ' '], 11 | [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '], 12 | [' ', ' ', ' ', ' ', ' ', ' ', 'X', ' ', ' ', ' ', ' ', ' ', ' '], 13 | ['2', ' ', ' ', ' ', ' ', '3', 'X', ' ', ' ', ' ', ' ', ' ', ' '], 14 | ['X', 'X', '3', ' ', 'X', 'X', 'X', 'X', 'X', ' ', '1', 'X', 'X'], 15 | [' ', ' ', ' ', ' ', ' ', ' ', 'X', '2', ' ', ' ', ' ', ' ', '3'], 16 | [' ', ' ', ' ', ' ', ' ', ' ', 'X', ' ', ' ', ' ', ' ', ' ', ' '], 17 | [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '], 18 | [' ', ' ', ' ', ' ', ' ', ' ', '2', ' ', ' ', ' ', ' ', ' ', ' '], 19 | [' ', ' ', ' ', ' ', ' ', ' ', 'X', ' ', ' ', ' ', ' ', ' ', ' '], 20 | ['_', ' ', ' ', ' ', ' ', ' ', 'X', '3', ' ', ' ', ' ', ' ', '1']] 21 | 22 | [AGENT] 23 | gamma=0.95 24 | epsilon=0.15 25 | T=200 26 | print_ev=2000 27 | save_ev=200 28 | encoding=None 29 | 30 | [SFQL] 31 | learning_rate=0.5 32 | learning_rate_w=0.5 33 | use_true_reward=False 34 | 35 | [QL] 36 | learning_rate=0.5 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # successor-features-for-transfer 2 | A reusable framework and independent implementation for successor features (SF) for transfer in (deep) reinforcement learning using keras, based on [1]. 3 | 4 | Discrete four-room domain: 5 | 6 | ![](https://github.com/mike-gimelfarb/successor-features-for-transfer/blob/main/source/figures/sfql_return.png) 7 | 8 | Deep learning for reacher domain (MuJoCo): 9 | 10 | ![](https://github.com/mike-gimelfarb/successor-features-for-transfer/blob/main/source/figures/sfdqn_return.png) 11 | 12 | Currently supports: 13 | - tabular SF representations for discrete environments, based on an efficient hash table representation 14 | - deep neural network SF representations for large or continuous-state environments, based on keras; allows existing keras models or custom architectures (e.g. CNNs) as inputs for easy training and tuning 15 | - tasks with pre-defined state features only, although support for training features on-the-fly may be added later 16 | - tasks structured according to the OpenAI gym framework 17 | 18 | # Requirements 19 | - python 3.8 or later 20 | - tensorflow 2.3 or later 21 | - pybullet 3.0.8 and pybullet-gym 0.1 (for reacher domain) 22 | 23 | # References 24 | [1] Barreto, André, et al. "Successor features for transfer in reinforcement learning." Advances in neural information processing systems. 2017. 25 | [2] Dayan, Peter. "Improving generalization for temporal difference learning: The successor representation." Neural Computation 5.4 (1993): 613-624. 
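
# Example usage (sketch)
The snippet below is a condensed, illustrative version of `source/main_sfql.py`: it builds the tabular SFQL agent from `configs/gridworld.cfg` and trains it on a sequence of four-room tasks with randomly drawn shape rewards.

```python
import numpy as np

from agents.sfql import SFQL
from features.tabular import TabularSF
from tasks.gridworld import Shapes
from utils.config import parse_config_file

params = parse_config_file('gridworld.cfg')      # {section: {key: value}}
agent = SFQL(TabularSF(**params['SFQL']), **params['AGENT'])

agent.reset()
for _ in range(params['GENERAL']['n_tasks']):
    # each task re-draws the rewards of the three shape types '1', '2', '3'
    rewards = dict(zip(['1', '2', '3'], np.random.uniform(-1.0, 1.0, size=3)))
    task = Shapes(maze=np.array(params['TASK']['maze']), shape_rewards=rewards)
    agent.train_on_task(task, params['GENERAL']['n_samples'])
```

The full experiment, including the comparison against plain Q-learning and the plot saved to `figures/sfql_return.png`, is run by `source/main_sfql.py`.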
26 | -------------------------------------------------------------------------------- /source/features/tabular.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from collections import defaultdict 3 | from copy import deepcopy 4 | import numpy as np 5 | 6 | from features.successor import SF 7 | 8 | 9 | class TabularSF(SF): 10 | """ 11 | A successor feature representation implemented using lookup tables. Storage is lazy and implemented efficiently 12 | using defaultdict. 13 | """ 14 | 15 | def __init__(self, learning_rate, *args, 16 | noise_init=lambda size: np.random.uniform(-0.01, 0.01, size=size), **kwargs): 17 | """ 18 | Creates a new tabular representation of successor features. 19 | 20 | Parameters 21 | ---------- 22 | learning_rate : float 23 | the learning rate 24 | noise_init : function 25 | instruction to initialize action-values, defaults to Uniform[-0.01, 0.01] 26 | """ 27 | super(TabularSF, self).__init__(*args, **kwargs) 28 | self.alpha = learning_rate 29 | self.noise_init = noise_init 30 | 31 | def build_successor(self, task, source=None): 32 | if source is None or len(self.psi) == 0: 33 | n_actions = task.action_count() 34 | n_features = task.feature_dim() 35 | return defaultdict(lambda: self.noise_init((n_actions, n_features))) 36 | else: 37 | return deepcopy(self.psi[source]) 38 | 39 | def get_successor(self, state, policy_index): 40 | return np.expand_dims(self.psi[policy_index][state], axis=0) 41 | 42 | def get_successors(self, state): 43 | return np.expand_dims(np.array([psi[state] for psi in self.psi]), axis=0) 44 | 45 | def update_successor(self, transitions, policy_index): 46 | for state, action, phi, next_state, next_action, gamma in transitions: 47 | psi = self.psi[policy_index] 48 | targets = phi.flatten() + gamma * psi[next_state][next_action,:] 49 | errors = targets - psi[state][action,:] 50 | psi[state][action,:] = psi[state][action,:] + self.alpha * errors 51 | 52 | -------------------------------------------------------------------------------- /source/agents/sfql.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import numpy as np 3 | 4 | from agents.agent import Agent 5 | 6 | 7 | class SFQL(Agent): 8 | 9 | def __init__(self, lookup_table, *args, use_gpi=True, **kwargs): 10 | """ 11 | Creates a new tabular successor feature agent. 
12 | 13 | Parameters 14 | ---------- 15 | lookup_table : TabularSF 16 | a tabular successor feature representation 17 | use_gpi : boolean 18 | whether or not to use transfer learning (defaults to True) 19 | """ 20 | super(SFQL, self).__init__(*args, **kwargs) 21 | self.sf = lookup_table 22 | self.use_gpi = use_gpi 23 | 24 | def get_Q_values(self, s, s_enc): 25 | q, self.c = self.sf.GPI(s_enc, self.task_index, update_counters=self.use_gpi) 26 | if not self.use_gpi: 27 | self.c = self.task_index 28 | return q[:, self.c,:] 29 | 30 | def train_agent(self, s, s_enc, a, r, s1, s1_enc, gamma): 31 | 32 | # update w 33 | t = self.task_index 34 | phi = self.phi(s, a, s1) 35 | self.sf.update_reward(phi, r, t) 36 | 37 | # update SF for the current task t 38 | if self.use_gpi: 39 | q1, _ = self.sf.GPI(s1_enc, t) 40 | q1 = np.max(q1[0,:,:], axis=0) 41 | else: 42 | q1 = self.sf.GPE(s1_enc, t, t)[0,:] 43 | next_action = np.argmax(q1) 44 | transitions = [(s_enc, a, phi, s1_enc, next_action, gamma)] 45 | self.sf.update_successor(transitions, t) 46 | 47 | # update SF for source task c 48 | if self.c != t: 49 | q1 = self.sf.GPE(s1_enc, self.c, self.c) 50 | next_action = np.argmax(q1) 51 | transitions = [(s_enc, a, phi, s1_enc, next_action, gamma)] 52 | self.sf.update_successor(transitions, self.c) 53 | 54 | def reset(self): 55 | super(SFQL, self).reset() 56 | self.sf.reset() 57 | 58 | def add_training_task(self, task): 59 | super(SFQL, self).add_training_task(task) 60 | self.sf.add_training_task(task, -1) 61 | 62 | def get_progress_strings(self): 63 | sample_str, reward_str = super(SFQL, self).get_progress_strings() 64 | gpi_percent = self.sf.GPI_usage_percent(self.task_index) 65 | w_error = np.linalg.norm(self.sf.fit_w[self.task_index] - self.sf.true_w[self.task_index]) 66 | gpi_str = 'GPI% \t {:.4f} \t w_err \t {:.4f}'.format(gpi_percent, w_error) 67 | return sample_str, reward_str, gpi_str 68 | 69 | -------------------------------------------------------------------------------- /source/main_sfql.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | from agents.sfql import SFQL 6 | from agents.ql import QL 7 | from features.tabular import TabularSF 8 | from tasks.gridworld import Shapes 9 | from utils.config import parse_config_file 10 | from utils.stats import OnlineMeanVariance 11 | 12 | # general training params 13 | config_params = parse_config_file('gridworld.cfg') 14 | gen_params = config_params['GENERAL'] 15 | task_params = config_params['TASK'] 16 | agent_params = config_params['AGENT'] 17 | sfql_params = config_params['SFQL'] 18 | ql_params = config_params['QL'] 19 | 20 | 21 | # tasks 22 | def generate_task(): 23 | rewards = dict(zip(['1', '2', '3'], list(np.random.uniform(low=-1.0, high=1.0, size=3)))) 24 | return Shapes(maze=np.array(task_params['maze']), shape_rewards=rewards) 25 | 26 | 27 | # agents 28 | sfql = SFQL(TabularSF(**sfql_params), **agent_params) 29 | ql = QL(**agent_params, **ql_params) 30 | agents = [sfql, ql] 31 | names = ['SFQL', 'QLearning'] 32 | 33 | # train 34 | data_task_return = [OnlineMeanVariance() for _ in agents] 35 | n_trials = gen_params['n_trials'] 36 | n_samples = gen_params['n_samples'] 37 | n_tasks = gen_params['n_tasks'] 38 | for trial in range(n_trials): 39 | 40 | # train each agent on a set of tasks 41 | for agent in agents: 42 | agent.reset() 43 | for t in range(n_tasks): 44 | task = generate_task() 45 | for agent, name in zip(agents, 
names): 46 | print('\ntrial {}, solving with {}'.format(trial, name)) 47 | agent.train_on_task(task, n_samples) 48 | 49 | # update performance statistics 50 | for i, agent in enumerate(agents): 51 | data_task_return[i].update(agent.reward_hist) 52 | 53 | # plot the task return 54 | ticksize = 14 55 | textsize = 18 56 | figsize = (20, 10) 57 | 58 | plt.rc('font', size=textsize) # controls default text sizes 59 | plt.rc('axes', titlesize=textsize) # fontsize of the axes title 60 | plt.rc('axes', labelsize=textsize) # fontsize of the x and y labels 61 | plt.rc('xtick', labelsize=ticksize) # fontsize of the tick labels 62 | plt.rc('ytick', labelsize=ticksize) # fontsize of the tick labels 63 | plt.rc('legend', fontsize=ticksize) # legend fontsize 64 | 65 | plt.figure(figsize=(12, 6)) 66 | ax = plt.gca() 67 | for i, name in enumerate(names): 68 | mean = data_task_return[i].mean 69 | n_sample_per_tick = n_samples * n_tasks // mean.size 70 | x = np.arange(mean.size) * n_sample_per_tick 71 | se = data_task_return[i].calculate_standard_error() 72 | plt.plot(x, mean, label=name) 73 | ax.fill_between(x, mean - se, mean + se, alpha=0.3) 74 | plt.xlabel('sample') 75 | plt.ylabel('cumulative reward') 76 | plt.title('Cumulative Training Reward Per Task') 77 | plt.tight_layout() 78 | plt.legend(ncol=2, frameon=False) 79 | plt.savefig('figures/sfql_return.png') 80 | -------------------------------------------------------------------------------- /source/agents/buffer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import numpy as np 3 | 4 | 5 | class ReplayBuffer: 6 | 7 | def __init__(self, *args, n_samples=1000000, n_batch=32, **kwargs): 8 | """ 9 | Creates a new randomized replay buffer. 10 | 11 | Parameters 12 | ---------- 13 | n_samples : integer 14 | the maximum number of samples that can be stored in the buffer 15 | n_batch : integer 16 | the batch size 17 | """ 18 | self.n_samples = n_samples 19 | self.n_batch = n_batch 20 | 21 | def reset(self): 22 | """ 23 | Removes all samples currently stored in the buffer. 24 | """ 25 | self.buffer = np.empty(self.n_samples, dtype=object) 26 | self.index = 0 27 | self.size = 0 28 | 29 | def replay(self): 30 | """ 31 | Samples a batch of samples from the buffer randomly. If the number of samples 32 | currently in the buffer is less than the batch size, returns None. 33 | 34 | Returns 35 | ------- 36 | states : np.ndarray 37 | a collection of starting states of shape [n_batch, -1] 38 | actions : np.ndarray 39 | a collection of actions taken in the starting states of shape [n_batch,] 40 | rewards : np.ndarray: 41 | a collection of rewards (for DQN) or features (for SFDQN) obtained of shape [n_batch, -1] 42 | next_states : np.ndarray 43 | a collection of successor states of shape [n_batch, -1] 44 | gammas : np.ndarray 45 | a collection of discount factors to be applied in computing targets for training of shape [n_batch,] 46 | """ 47 | if self.size < self.n_batch: return None 48 | indices = np.random.randint(low=0, high=self.size, size=(self.n_batch,)) 49 | states, actions, rewards, next_states, gammas = zip(*self.buffer[indices]) 50 | states = np.vstack(states) 51 | actions = np.array(actions) 52 | rewards = np.vstack(rewards) 53 | next_states = np.vstack(next_states) 54 | gammas = np.array(gammas) 55 | return states, actions, rewards, next_states, gammas 56 | 57 | def append(self, state, action, reward, next_state, gamma): 58 | """ 59 | Adds the specified sample to the replay buffer. 
If the buffer is full, then the earliest added 60 | sample is removed, and the new sample is added. 61 | 62 | Parameters 63 | ---------- 64 | state : np.ndarray 65 | the encoded state of the task 66 | action : integer 67 | the action taken in state 68 | reward : float or np.ndarray 69 | the reward obtained in the current transition (for DQN) or state features (for SFDQN) 70 | next_state : np.ndarray 71 | the encoded successor state 72 | gamma : floag 73 | the effective discount factor to be applied in computing targets for training 74 | """ 75 | self.buffer[self.index] = (state, action, reward, next_state, gamma) 76 | self.size = min(self.size + 1, self.n_samples) 77 | self.index = (self.index + 1) % self.n_samples 78 | 79 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | .metadata 132 | bin/ 133 | tmp/ 134 | *.tmp 135 | *.bak 136 | *.swp 137 | *~.nib 138 | local.properties 139 | .settings/ 140 | .loadpath 141 | .recommenders 142 | 143 | # External tool builders 144 | .externalToolBuilders/ 145 | 146 | # Locally stored "Eclipse launch configurations" 147 | *.launch 148 | 149 | # PyDev specific (Python IDE for Eclipse) 150 | *.pydevproject 151 | 152 | # CDT-specific (C/C++ Development Tooling) 153 | .cproject 154 | 155 | # CDT- autotools 156 | .autotools 157 | 158 | # Java annotation processor (APT) 159 | .factorypath 160 | 161 | # PDT-specific (PHP Development Tools) 162 | .buildpath 163 | 164 | # sbteclipse plugin 165 | .target 166 | 167 | # Tern plugin 168 | .tern-project 169 | 170 | # TeXlipse plugin 171 | .texlipse 172 | 173 | # STS (Spring Tool Suite) 174 | .springBeans 175 | 176 | # Code Recommenders 177 | .recommenders/ 178 | 179 | # Annotation Processing 180 | .apt_generated/ 181 | .apt_generated_test/ 182 | 183 | # Scala IDE specific (Scala & Java development for Eclipse) 184 | .cache-main 185 | .scala_dependencies 186 | .worksheet 187 | 188 | # Uncomment this line if you wish to ignore the project description file. 189 | # Typically, this file would be tracked if it contains build/dependency configurations: 190 | #.project 191 | -------------------------------------------------------------------------------- /source/tasks/task.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | class Task: 3 | """ 4 | An abstract representation of an MDP with arbitrary state space and finite action space. 5 | """ 6 | 7 | def clone(self): 8 | """ 9 | Creates an identical copy of the current environment, for use in testing. 10 | 11 | Returns 12 | ------- 13 | Task : the copy of the current task 14 | """ 15 | raise NotImplementedError 16 | 17 | def initialize(self): 18 | """ 19 | Resets the state of the environment. 20 | 21 | Returns 22 | ------- 23 | object : the initial state of the MDP 24 | """ 25 | raise NotImplementedError 26 | 27 | def action_count(self): 28 | """ 29 | Returns the number of possible actions in the MDP. 30 | 31 | Returns 32 | ------- 33 | integer : number of possible actions 34 | """ 35 | raise NotImplementedError 36 | 37 | def transition(self, action): 38 | """ 39 | Applies the specified action in the environment, updating the state of the MDP. 
40 | 41 | Parameters 42 | ---------- 43 | action : integer 44 | the action to apply to the environment 45 | 46 | Returns 47 | ------- 48 | object : the next state of the MDP 49 | float : the immediate reward observed in the transition 50 | boolean : whether or not a terminal state has been reached 51 | """ 52 | raise NotImplementedError 53 | 54 | # =========================================================================== 55 | # STATE ENCODING FOR DEEP LEARNING 56 | # =========================================================================== 57 | def encode(self, state): 58 | """ 59 | Encodes the state of the MDP according to its canonical encoding. 60 | 61 | Parameters 62 | ---------- 63 | state : object 64 | the state of the MDP to encode 65 | 66 | Returns 67 | ------- 68 | np.ndarray : the encoding of the state 69 | """ 70 | raise NotImplementedError 71 | 72 | def encode_dim(self): 73 | """ 74 | Returns the dimension of the canonical state encoding. 75 | 76 | Returns 77 | ------- 78 | integer : the dimension of the canonical state encoding 79 | """ 80 | raise NotImplementedError 81 | 82 | # =========================================================================== 83 | # SUCCESSOR FEATURES 84 | # =========================================================================== 85 | def features(self, state, action, next_state): 86 | """ 87 | Computes the state features for the current environment, used for learning successor 88 | feature representations. First introduced in [1]. 89 | 90 | Parameters 91 | ---------- 92 | state : object 93 | the state of the MDP 94 | action : integer 95 | the action selected in the state 96 | next_state : object 97 | the next state (successor state) of the MDP 98 | 99 | Returns 100 | ------- 101 | np.ndarray : the state features of the transition 102 | 103 | References 104 | ---------- 105 | [1] Dayan, Peter. "Improving generalization for temporal difference learning: 106 | The successor representation." Neural Computation 5.4 (1993): 613-624. 107 | """ 108 | raise NotImplementedError 109 | 110 | def feature_dim(self): 111 | """ 112 | Returns the dimension of the state feature representation. 113 | 114 | Returns 115 | ------- 116 | integer : the dimension of the state feature representation 117 | """ 118 | raise NotImplementedError 119 | 120 | def get_w(self): 121 | """ 122 | Returns a vector of parameters that represents the reward function for the current task. 123 | Mathematically, given the state features phi(s,a,s') and reward parameters w, the reward function 124 | is represented as r(s,a,s') = < phi(s,a,s'), w >. 125 | 126 | Returns 127 | ------- 128 | np.ndarray : a linear parameterization of the reward function of the current MDP 129 | """ 130 | raise NotImplementedError 131 | 132 | -------------------------------------------------------------------------------- /source/features/deep.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import numpy as np 3 | from tensorflow.keras import backend as K, Model 4 | from tensorflow.keras.layers import concatenate, Input, Lambda 5 | 6 | from features.successor import SF 7 | 8 | 9 | class DeepSF(SF): 10 | """ 11 | A successor feature representation implemented using Keras. Accepts a wide variety of neural networks as 12 | function approximators. 13 | """ 14 | 15 | def __init__(self, keras_model_handle, *args, target_update_ev=1000, **kwargs): 16 | """ 17 | Creates a new deep representation of successor features. 
18 | 19 | Parameters 20 | ---------- 21 | keras_model_handle : function 22 | a function from an input tensor to a compiled Keras model for successor features 23 | the Keras model must have outputs reshaped to [None, n_actions, n_features], where 24 | None corresponds to the batch dimension 25 | n_actions is the number of actions of the MDP 26 | n_features is the number of state features to learn SFs 27 | target_update_ev : integer 28 | how often to update the target network, measured by the number of training calls 29 | """ 30 | super(DeepSF, self).__init__(*args, **kwargs) 31 | self.keras_model_handle = keras_model_handle 32 | self.target_update_ev = target_update_ev 33 | 34 | def reset(self): 35 | SF.reset(self) 36 | self.updates_since_target_updated = [] 37 | 38 | def build_successor(self, task, source=None): 39 | 40 | # input tensor for all networks is shared 41 | if self.n_tasks == 0: 42 | self.n_actions = task.action_count() 43 | self.n_features = task.feature_dim() 44 | self.inputs = Input(shape=(task.encode_dim(),)) 45 | 46 | # build SF network and copy its weights from previous task 47 | # output shape is assumed to be [n_batch, n_actions, n_features] 48 | model = self.keras_model_handle(self.inputs) 49 | if source is not None and self.n_tasks > 0: 50 | source_psi, _ = self.psi[source] 51 | model.set_weights(source_psi.get_weights()) 52 | 53 | # append predictions of all SF networks across tasks to allow fast prediction 54 | expand_output = Lambda(lambda x: K.expand_dims(x, axis=1))(model.output) 55 | if self.n_tasks == 0: 56 | self.all_outputs = expand_output 57 | else: 58 | self.all_outputs = concatenate([self.all_outputs, expand_output], axis=1) 59 | self.all_output_model = Model(inputs=self.inputs, outputs=self.all_outputs) 60 | self.all_output_model.compile('sgd', 'mse') # dummy compile so Keras doesn't complain 61 | 62 | # build target model and copy the weights 63 | target_model = self.keras_model_handle(self.inputs) 64 | target_model.set_weights(model.get_weights()) 65 | self.updates_since_target_updated.append(0) 66 | 67 | return model, target_model 68 | 69 | def get_successor(self, state, policy_index): 70 | psi, _ = self.psi[policy_index] 71 | return psi.predict_on_batch(state) 72 | 73 | def get_successors(self, state): 74 | return self.all_output_model.predict_on_batch(state) 75 | 76 | def update_successor(self, transitions, policy_index): 77 | if transitions is None: 78 | return 79 | states, actions, phis, next_states, gammas = transitions 80 | n_batch = len(gammas) 81 | indices = np.arange(n_batch) 82 | gammas = gammas.reshape((-1, 1)) 83 | 84 | # next actions come from GPI 85 | q1, _ = self.GPI(next_states, policy_index) 86 | next_actions = np.argmax(np.max(q1, axis=1), axis=-1) 87 | 88 | # compute the targets and TD errors 89 | psi, target_psi = self.psi[policy_index] 90 | current_psi = psi.predict_on_batch(states) 91 | targets = phis + gammas * target_psi.predict_on_batch(next_states)[indices, next_actions,:] 92 | 93 | # train the SF network 94 | current_psi[indices, actions,:] = targets 95 | psi.train_on_batch(states, current_psi) 96 | 97 | # update the target SF network 98 | self.updates_since_target_updated[policy_index] += 1 99 | if self.updates_since_target_updated[policy_index] >= self.target_update_ev: 100 | target_psi.set_weights(psi.get_weights()) 101 | self.updates_since_target_updated[policy_index] = 0 102 | 103 | -------------------------------------------------------------------------------- /source/agents/sfdqn.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import random 3 | import numpy as np 4 | 5 | from agents.agent import Agent 6 | 7 | 8 | class SFDQN(Agent): 9 | 10 | def __init__(self, deep_sf, buffer, *args, use_gpi=True, test_epsilon=0.03, **kwargs): 11 | """ 12 | Creates a new SFDQN agent per the specifications in the original paper. 13 | 14 | Parameters 15 | ---------- 16 | deep_sf : DeepSF 17 | instance of deep successor feature representation 18 | buffer : ReplayBuffer 19 | a replay buffer that implements randomized experience replay 20 | use_gpi : boolean 21 | whether or not to use transfer learning (defaults to True) 22 | test_epsilon : float 23 | the exploration parameter for epsilon greedy used during testing 24 | (defaults to 0.03 as in the paper) 25 | """ 26 | super(SFDQN, self).__init__(*args, **kwargs) 27 | self.sf = deep_sf 28 | self.buffer = buffer 29 | self.use_gpi = use_gpi 30 | self.test_epsilon = test_epsilon 31 | 32 | def get_Q_values(self, s, s_enc): 33 | q, c = self.sf.GPI(s_enc, self.task_index, update_counters=self.use_gpi) 34 | if not self.use_gpi: 35 | c = self.task_index 36 | self.c = c 37 | return q[:, c,:] 38 | 39 | def train_agent(self, s, s_enc, a, r, s1, s1_enc, gamma): 40 | 41 | # update w 42 | phi = self.phi(s, a, s1) 43 | self.sf.update_reward(phi, r, self.task_index) 44 | 45 | # remember this experience 46 | self.buffer.append(s_enc, a, phi, s1_enc, gamma) 47 | 48 | # update SFs 49 | transitions = self.buffer.replay() 50 | for index in range(self.n_tasks): 51 | self.sf.update_successor(transitions, index) 52 | 53 | def reset(self): 54 | super(SFDQN, self).reset() 55 | self.sf.reset() 56 | self.buffer.reset() 57 | 58 | def add_training_task(self, task): 59 | super(SFDQN, self).add_training_task(task) 60 | self.sf.add_training_task(task, source=None) 61 | 62 | def get_progress_strings(self): 63 | sample_str, reward_str = super(SFDQN, self).get_progress_strings() 64 | gpi_percent = self.sf.GPI_usage_percent(self.task_index) 65 | w_error = np.linalg.norm(self.sf.fit_w[self.task_index] - self.sf.true_w[self.task_index]) 66 | gpi_str = 'GPI% \t {:.4f} \t w_err \t {:.4f}'.format(gpi_percent, w_error) 67 | return sample_str, reward_str, gpi_str 68 | 69 | def train(self, train_tasks, n_samples, viewers=None, n_view_ev=None, test_tasks=[], n_test_ev=1000): 70 | if viewers is None: 71 | viewers = [None] * len(train_tasks) 72 | 73 | # add tasks 74 | self.reset() 75 | for train_task in train_tasks: 76 | self.add_training_task(train_task) 77 | 78 | # train each one 79 | return_data = [] 80 | for index, (train_task, viewer) in enumerate(zip(train_tasks, viewers)): 81 | self.set_active_training_task(index) 82 | for t in range(n_samples): 83 | 84 | # train 85 | self.next_sample(viewer, n_view_ev) 86 | 87 | # test 88 | if t % n_test_ev == 0: 89 | Rs = [] 90 | for test_task in test_tasks: 91 | R = self.test_agent(test_task) 92 | Rs.append(R) 93 | print('test performance: {}'.format('\t'.join(map('{:.4f}'.format, Rs)))) 94 | avg_R = np.mean(Rs) 95 | return_data.append(avg_R) 96 | return return_data 97 | 98 | def get_test_action(self, s_enc, w): 99 | if random.random() <= self.test_epsilon: 100 | a = random.randrange(self.n_actions) 101 | else: 102 | q, c = self.sf.GPI_w(s_enc, w) 103 | q = q[:, c,:] 104 | a = np.argmax(q) 105 | return a 106 | 107 | def test_agent(self, task): 108 | R = 0.0 109 | w = task.get_w() 110 | s = task.initialize() 111 | s_enc = self.encoding(s) 112 | for _ in range(self.T): 113 | a = 
self.get_test_action(s_enc, w) 114 | s1, r, done = task.transition(a) 115 | s1_enc = self.encoding(s1) 116 | s, s_enc = s1, s1_enc 117 | R += r 118 | if done: 119 | break 120 | return R 121 | 122 | -------------------------------------------------------------------------------- /source/agents/dqn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import numpy as np 3 | import random 4 | 5 | from agents.agent import Agent 6 | 7 | 8 | class DQN(Agent): 9 | 10 | def __init__(self, model_lambda, buffer, *args, target_update_ev=1000, test_epsilon=0.03, **kwargs): 11 | """ 12 | Creates a new DQN agent that supports universal value function approximation (UVFA). 13 | 14 | Parameters 15 | ---------- 16 | model_lambda : function 17 | returns a keras Model instance 18 | buffer : ReplayBuffer 19 | a replay buffer that implements randomized experience replay 20 | target_update_ev : integer 21 | how often to update the target network (defaults to 1000) 22 | test_epsilon : float 23 | the exploration parameter for epsilon greedy used during testing 24 | (defaults to 0.03 as in the paper) 25 | """ 26 | super(DQN, self).__init__(*args, **kwargs) 27 | self.model_lambda = model_lambda 28 | self.buffer = buffer 29 | self.target_update_ev = target_update_ev 30 | self.test_epsilon = test_epsilon 31 | 32 | def reset(self): 33 | Agent.reset(self) 34 | self.Q = self.model_lambda() 35 | self.target_Q = self.model_lambda() 36 | self.target_Q.set_weights(self.Q.get_weights()) 37 | self.buffer.reset() 38 | self.updates_since_target_updated = 0 39 | 40 | def get_Q_values(self, s, s_enc): 41 | return self.Q.predict_on_batch(s_enc) 42 | 43 | def train_agent(self, s, s_enc, a, r, s1, s1_enc, gamma): 44 | 45 | # remember this experience 46 | self.buffer.append(s_enc, a, r, s1_enc, gamma) 47 | 48 | # sample experience at random 49 | batch = self.buffer.replay() 50 | if batch is None: return 51 | states, actions, rewards, next_states, gammas = batch 52 | n_batch = self.buffer.n_batch 53 | indices = np.arange(n_batch) 54 | rewards = rewards.flatten() 55 | 56 | # main update 57 | next_actions = np.argmax(self.Q.predict_on_batch(next_states), axis=1) 58 | targets = self.Q.predict_on_batch(states) 59 | targets[indices, actions] = rewards + \ 60 | gammas * self.target_Q.predict_on_batch(next_states)[indices, next_actions] 61 | self.Q.train_on_batch(states, targets) 62 | 63 | # target update 64 | self.updates_since_target_updated += 1 65 | if self.updates_since_target_updated >= self.target_update_ev: 66 | self.target_Q.set_weights(self.Q.get_weights()) 67 | self.updates_since_target_updated = 0 68 | 69 | def train(self, train_tasks, n_samples, viewers=None, n_view_ev=None, test_tasks=[], n_test_ev=1000): 70 | if viewers is None: 71 | viewers = [None] * len(train_tasks) 72 | 73 | # add tasks 74 | self.reset() 75 | for train_task in train_tasks: 76 | self.add_training_task(train_task) 77 | 78 | # train each one 79 | return_data = [] 80 | for index, (train_task, viewer) in enumerate(zip(train_tasks, viewers)): 81 | self.set_active_training_task(index) 82 | for t in range(n_samples): 83 | 84 | # train 85 | self.next_sample(viewer, n_view_ev) 86 | 87 | # test 88 | if t % n_test_ev == 0: 89 | Rs = [] 90 | for test_task in test_tasks: 91 | R = self.test_agent(test_task) 92 | Rs.append(R) 93 | avg_R = np.mean(Rs) 94 | return_data.append(avg_R) 95 | print('test performance: {}'.format('\t'.join(map('{:.4f}'.format, Rs)))) 96 | return return_data 97 | 98 | def 
get_test_action(self, s_enc): 99 | if random.random() <= self.test_epsilon: 100 | a = random.randrange(self.n_actions) 101 | else: 102 | q = self.get_Q_values(s_enc, s_enc) 103 | a = np.argmax(q) 104 | return a 105 | 106 | def test_agent(self, task): 107 | R = 0. 108 | s = task.initialize() 109 | s_enc = self.encoding(s) 110 | for _ in range(self.T): 111 | a = self.get_test_action(s_enc) 112 | s1, r, done = task.transition(a) 113 | s1_enc = self.encoding(s1) 114 | s, s_enc = s1, s1_enc 115 | R += r 116 | if done: 117 | break 118 | return R 119 | -------------------------------------------------------------------------------- /source/main_sfdqn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | import tensorflow as tf 6 | tf.compat.v1.disable_eager_execution() 7 | for gpu in tf.config.experimental.list_physical_devices('GPU'): 8 | tf.config.experimental.set_memory_growth(gpu, True) 9 | from tensorflow.keras import layers, Model, optimizers 10 | 11 | from agents.dqn import DQN 12 | from agents.sfdqn import SFDQN 13 | from agents.buffer import ReplayBuffer 14 | from features.deep import DeepSF 15 | from tasks.reacher import Reacher 16 | from utils.config import parse_config_file 17 | 18 | # read parameters from config file 19 | config_params = parse_config_file('reacher.cfg') 20 | 21 | gen_params = config_params['GENERAL'] 22 | n_samples = gen_params['n_samples'] 23 | 24 | task_params = config_params['TASK'] 25 | goals = task_params['train_targets'] 26 | test_goals = task_params['test_targets'] 27 | all_goals = goals + test_goals 28 | 29 | agent_params = config_params['AGENT'] 30 | dqn_params = config_params['DQN'] 31 | sfdqn_params = config_params['SFDQN'] 32 | 33 | 34 | # tasks 35 | def generate_tasks(include_target): 36 | train_tasks = [Reacher(all_goals, i, include_target) for i in range(len(goals))] 37 | test_tasks = [Reacher(all_goals, i + len(goals), include_target) for i in range(len(test_goals))] 38 | return train_tasks, test_tasks 39 | 40 | 41 | # keras model 42 | def dqn_model_lambda(): 43 | keras_params = dqn_params['keras_params'] 44 | x = y = layers.Input(6) 45 | for n_neurons, activation in zip(keras_params['n_neurons'], keras_params['activations']): 46 | y = layers.Dense(n_neurons, activation=activation)(y) 47 | y = layers.Dense(9, activation='linear')(y) 48 | model = Model(inputs=x, outputs=y) 49 | sgd = optimizers.Adam(learning_rate=keras_params['learning_rate']) 50 | model.compile(sgd, 'mse') 51 | return model 52 | 53 | 54 | # keras model for the SF 55 | def sf_model_lambda(x): 56 | n_features = len(all_goals) 57 | keras_params = sfdqn_params['keras_params'] 58 | y = x 59 | for n_neurons, activation in zip(keras_params['n_neurons'], keras_params['activations']): 60 | y = layers.Dense(n_neurons, activation=activation)(y) 61 | y = layers.Dense(9 * n_features, activation='linear')(y) 62 | y = layers.Reshape((9, n_features))(y) 63 | model = Model(inputs=x, outputs=y) 64 | sgd = optimizers.Adam(learning_rate=keras_params['learning_rate']) 65 | model.compile(sgd, 'mse') 66 | return model 67 | 68 | 69 | def train(): 70 | 71 | # build SFDQN 72 | print('building SFDQN') 73 | deep_sf = DeepSF(keras_model_handle=sf_model_lambda, **sfdqn_params) 74 | sfdqn = SFDQN(deep_sf=deep_sf, buffer=ReplayBuffer(sfdqn_params['buffer_params']), 75 | **sfdqn_params, **agent_params) 76 | 77 | # train SFDQN 78 | print('training SFDQN') 79 | train_tasks, test_tasks = 
generate_tasks(False) 80 | sfdqn_perf = sfdqn.train(train_tasks, n_samples, test_tasks=test_tasks, n_test_ev=agent_params['n_test_ev']) 81 | 82 | # build DQN 83 | print('building DQN') 84 | dqn = DQN(model_lambda=dqn_model_lambda, buffer=ReplayBuffer(dqn_params['buffer_params']), 85 | **dqn_params, **agent_params) 86 | 87 | # training DQN 88 | print('training DQN') 89 | train_tasks, test_tasks = generate_tasks(True) 90 | dqn_perf = dqn.train(train_tasks, n_samples, test_tasks=test_tasks, n_test_ev=agent_params['n_test_ev']) 91 | 92 | # smooth data 93 | def smooth(y, box_pts): 94 | return np.convolve(y, np.ones(box_pts) / box_pts, mode='same') 95 | 96 | sfdqn_perf = smooth(sfdqn_perf, 10)[:-5] 97 | dqn_perf = smooth(dqn_perf, 10)[:-5] 98 | x = np.linspace(0, 4, sfdqn_perf.size) 99 | 100 | # reporting progress 101 | ticksize = 14 102 | textsize = 18 103 | plt.rc('font', size=textsize) # controls default text sizes 104 | plt.rc('axes', titlesize=textsize) # fontsize of the axes title 105 | plt.rc('axes', labelsize=textsize) # fontsize of the x and y labels 106 | plt.rc('xtick', labelsize=ticksize) # fontsize of the tick labels 107 | plt.rc('ytick', labelsize=ticksize) # fontsize of the tick labels 108 | plt.rc('legend', fontsize=ticksize) # legend fontsize 109 | 110 | plt.figure(figsize=(8, 6)) 111 | ax = plt.gca() 112 | ax.plot(x, sfdqn_perf, label='SFDQN') 113 | ax.plot(x, dqn_perf, label='DQN') 114 | plt.xlabel('training task index') 115 | plt.ylabel('averaged test episode reward') 116 | plt.title('Testing Reward Averaged over all Test Tasks') 117 | plt.tight_layout() 118 | plt.legend(frameon=False) 119 | plt.savefig('figures/sfdqn_return.png') 120 | 121 | 122 | train() 123 | -------------------------------------------------------------------------------- /source/tasks/reacher.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import numpy as np 3 | from pybulletgym.envs.roboschool.robots.robot_bases import MJCFBasedRobot 4 | from pybulletgym.envs.roboschool.envs.env_bases import BaseBulletEnv 5 | from pybulletgym.envs.roboschool.scenes.scene_bases import SingleRobotEmptyScene 6 | 7 | from tasks.task import Task 8 | 9 | 10 | class Reacher(Task): 11 | 12 | def __init__(self, target_positions, task_index, include_target_in_state=False): 13 | self.target_positions = target_positions 14 | self.task_index = task_index 15 | self.target_pos = target_positions[task_index] 16 | self.include_target_in_state = include_target_in_state 17 | self.env = ReacherBulletEnv(self.target_pos) 18 | 19 | # make the action lookup from integer to real action 20 | actions = [-1., 0., 1.] 
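        # (added comment) The two joint torques are each discretized to {-1, 0, +1}; the
        # nested loops below enumerate all 3 x 3 = 9 combinations, which is why
        # action_count() returns 9 and the Q- and SF-networks built in main_sfdqn.py
        # use 9-way output layers.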
21 | self.action_dict = dict() 22 | for a1 in actions: 23 | for a2 in actions: 24 | self.action_dict[len(self.action_dict)] = (a1, a2) 25 | 26 | def clone(self): 27 | return Reacher(self.target_positions, self.task_index, self.include_target_in_state) 28 | 29 | def initialize(self): 30 | # if self.task_index == 0: 31 | # self.env.render('human') 32 | state = self.env.reset() 33 | if self.include_target_in_state: 34 | return np.concatenate([state.flatten(), self.target_pos]) 35 | else: 36 | return state 37 | 38 | def action_count(self): 39 | return len(self.action_dict) 40 | 41 | def transition(self, action): 42 | real_action = self.action_dict[action] 43 | new_state, reward, done, _ = self.env.step(real_action) 44 | 45 | if self.include_target_in_state: 46 | return_state = np.concatenate([new_state, self.target_pos]) 47 | else: 48 | return_state = new_state 49 | 50 | return return_state, reward, done 51 | 52 | # =========================================================================== 53 | # STATE ENCODING FOR DEEP LEARNING 54 | # =========================================================================== 55 | def encode(self, state): 56 | return np.array(state).reshape((1, -1)) 57 | 58 | def encode_dim(self): 59 | if self.include_target_in_state: 60 | return 6 61 | else: 62 | return 4 63 | 64 | # =========================================================================== 65 | # SUCCESSOR FEATURES 66 | # =========================================================================== 67 | def features(self, state, action, next_state): 68 | phi = np.zeros((len(self.target_positions),)) 69 | for index, target in enumerate(self.target_positions): 70 | delta = np.linalg.norm(np.array(self.env.robot.fingertip.pose().xyz()[:2]) - np.array(target)) 71 | phi[index] = 1. - 4. * delta 72 | return phi 73 | 74 | def feature_dim(self): 75 | return len(self.target_positions) 76 | 77 | def get_w(self): 78 | w = np.zeros((len(self.target_positions), 1)) 79 | w[self.task_index, 0] = 1.0 80 | return w 81 | 82 | 83 | class ReacherBulletEnv(BaseBulletEnv): 84 | 85 | def __init__(self, target): 86 | self.robot = ReacherRobot(target) 87 | BaseBulletEnv.__init__(self, self.robot) 88 | 89 | def create_single_player_scene(self, bullet_client): 90 | return SingleRobotEmptyScene(bullet_client, gravity=0.0, timestep=0.0165, frame_skip=1) 91 | 92 | def step(self, a): 93 | assert (not self.scene.multiplayer) 94 | self.robot.apply_action(a) 95 | self.scene.global_step() 96 | 97 | state = self.robot.calc_state() # sets self.to_target_vec 98 | 99 | delta = np.linalg.norm( 100 | np.array(self.robot.fingertip.pose().xyz()) - np.array(self.robot.target.pose().xyz())) 101 | reward = 1. - 4. 
* delta 102 | self.HUD(state, a, False) 103 | 104 | return state, reward, False, {} 105 | 106 | def camera_adjust(self): 107 | x, y, z = self.robot.fingertip.pose().xyz() 108 | x *= 0.5 109 | y *= 0.5 110 | self.camera.move_and_look_at(0.3, 0.3, 0.3, x, y, z) 111 | 112 | 113 | class ReacherRobot(MJCFBasedRobot): 114 | TARG_LIMIT = 0.27 115 | 116 | def __init__(self, target): 117 | MJCFBasedRobot.__init__(self, 'reacher.xml', 'body0', action_dim=2, obs_dim=4) 118 | self.target_pos = target 119 | 120 | def robot_specific_reset(self, bullet_client): 121 | self.jdict["target_x"].reset_current_position(self.target_pos[0], 0) 122 | self.jdict["target_y"].reset_current_position(self.target_pos[1], 0) 123 | self.fingertip = self.parts["fingertip"] 124 | self.target = self.parts["target"] 125 | self.central_joint = self.jdict["joint0"] 126 | self.elbow_joint = self.jdict["joint1"] 127 | self.central_joint.reset_current_position(self.np_random.uniform(low=-3.14, high=3.14), 0) 128 | self.elbow_joint.reset_current_position(self.np_random.uniform(low=-3.14 / 2, high=3.14 / 2), 0) 129 | 130 | def apply_action(self, a): 131 | assert (np.isfinite(a).all()) 132 | self.central_joint.set_motor_torque(0.05 * float(np.clip(a[0], -1, +1))) 133 | self.elbow_joint.set_motor_torque(0.05 * float(np.clip(a[1], -1, +1))) 134 | 135 | def calc_state(self): 136 | theta, self.theta_dot = self.central_joint.current_relative_position() 137 | self.gamma, self.gamma_dot = self.elbow_joint.current_relative_position() 138 | # target_x, _ = self.jdict["target_x"].current_position() 139 | # target_y, _ = self.jdict["target_y"].current_position() 140 | self.to_target_vec = np.array(self.fingertip.pose().xyz()) - np.array(self.target.pose().xyz()) 141 | return np.array([ 142 | theta, 143 | self.theta_dot, 144 | self.gamma, 145 | self.gamma_dot 146 | ]) 147 | # 148 | # def calc_potential(self): 149 | # return -100 * np.linalg.norm(self.to_target_vec) 150 | 151 | -------------------------------------------------------------------------------- /source/tasks/gridworld.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import numpy as np 3 | import random 4 | 5 | from tasks.task import Task 6 | 7 | 8 | class Shapes(Task): 9 | """ 10 | A discretized version of the gridworld environment introduced in [1]. Here, an agent learns to 11 | collect shapes with positive reward, while avoid those with negative reward, and then travel to a fixed goal. 12 | The gridworld is split into four rooms separated by walls with passage-ways. 13 | 14 | References 15 | ---------- 16 | [1] Barreto, Andr�, et al. "Successor Features for Transfer in Reinforcement Learning." NIPS. 2017. 17 | """ 18 | 19 | LEFT, UP, RIGHT, DOWN = 0, 1, 2, 3 20 | 21 | def __init__(self, maze, shape_rewards): 22 | """ 23 | Creates a new instance of the shapes environment. 24 | 25 | Parameters 26 | ---------- 27 | maze : np.ndarray 28 | an array of string values representing the type of each cell in the environment: 29 | G indicates a goal state (terminal state) 30 | _ indicates an initial state (there can be multiple, and one is selected at random 31 | at the start of each episode) 32 | X indicates a barrier 33 | 0, 1, .... 9 indicates the type of shape to be placed in the corresponding cell 34 | entries containing other characters are treated as regular empty cells 35 | shape_rewards : dict 36 | a dictionary mapping the type of shape (0, 1, ... 
) to a corresponding reward to provide 37 | to the agent for collecting an object of that type 38 | """ 39 | self.height, self.width = maze.shape 40 | self.maze = maze 41 | self.shape_rewards = shape_rewards 42 | shape_types = sorted(list(shape_rewards.keys())) 43 | self.all_shapes = dict(zip(shape_types, range(len(shape_types)))) 44 | 45 | self.goal = None 46 | self.initial = [] 47 | self.occupied = set() 48 | self.shape_ids = dict() 49 | for c in range(self.width): 50 | for r in range(self.height): 51 | if maze[r, c] == 'G': 52 | self.goal = (r, c) 53 | elif maze[r, c] == '_': 54 | self.initial.append((r, c)) 55 | elif maze[r, c] == 'X': 56 | self.occupied.add((r, c)) 57 | elif maze[r, c] in {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'}: 58 | self.shape_ids[(r, c)] = len(self.shape_ids) 59 | 60 | def clone(self): 61 | return Shapes(self.maze, self.shape_rewards) 62 | 63 | def initialize(self): 64 | self.state = (random.choice(self.initial), tuple(0 for _ in range(len(self.shape_ids)))) 65 | return self.state 66 | 67 | def action_count(self): 68 | return 4 69 | 70 | def transition(self, action): 71 | (row, col), collected = self.state 72 | 73 | # perform the movement 74 | if action == Shapes.LEFT: 75 | col -= 1 76 | elif action == Shapes.UP: 77 | row -= 1 78 | elif action == Shapes.RIGHT: 79 | col += 1 80 | elif action == Shapes.DOWN: 81 | row += 1 82 | else: 83 | raise Exception('bad action {}'.format(action)) 84 | 85 | # out of bounds, cannot move 86 | if col < 0 or col >= self.width or row < 0 or row >= self.height: 87 | return self.state, 0., False 88 | 89 | # into a blocked cell, cannot move 90 | s1 = (row, col) 91 | if s1 in self.occupied: 92 | return self.state, 0., False 93 | 94 | # can now move 95 | self.state = (s1, collected) 96 | 97 | # into a goal cell 98 | if s1 == self.goal: 99 | return self.state, 1., True 100 | 101 | # into a shape cell 102 | if s1 in self.shape_ids: 103 | shape_id = self.shape_ids[s1] 104 | if collected[shape_id] == 1: 105 | 106 | # already collected this flag 107 | return self.state, 0., False 108 | else: 109 | 110 | # collect the new flag 111 | collected = list(collected) 112 | collected[shape_id] = 1 113 | collected = tuple(collected) 114 | self.state = (s1, collected) 115 | reward = self.shape_rewards[self.maze[row, col]] 116 | return self.state, reward, False 117 | 118 | # into an empty cell 119 | return self.state, 0., False 120 | 121 | # =========================================================================== 122 | # STATE ENCODING FOR DEEP LEARNING 123 | # =========================================================================== 124 | def encode(self, state): 125 | (y, x), coll = state 126 | n_state = self.width + self.height 127 | result = np.zeros((n_state + len(coll),)) 128 | result[y] = 1 129 | result[self.height + x] = 1 130 | result[n_state:] = np.array(coll) 131 | result = result.reshape((1, -1)) 132 | return result 133 | 134 | def encode_dim(self): 135 | return self.width + self.height + len(self.shape_ids) 136 | 137 | # =========================================================================== 138 | # SUCCESSOR FEATURES 139 | # =========================================================================== 140 | def features(self, state, action, next_state): 141 | s1, _ = next_state 142 | _, collected = state 143 | nc = len(self.all_shapes) 144 | phi = np.zeros((nc + 1,)) 145 | if s1 in self.shape_ids: 146 | if collected[self.shape_ids[s1]] != 1: 147 | y, x = s1 148 | shape_index = self.all_shapes[self.maze[y, x]] 149 | 
phi[shape_index] = 1. 150 | elif s1 == self.goal: 151 | phi[nc] = 1. 152 | return phi 153 | 154 | def feature_dim(self): 155 | return len(self.all_shapes) + 1 156 | 157 | def get_w(self): 158 | ns = len(self.all_shapes) 159 | w = np.zeros((ns + 1, 1)) 160 | for shape, shape_index in self.all_shapes.items(): 161 | w[shape_index, 0] = self.shape_rewards[shape] 162 | w[ns, 0] = 1. 163 | return w 164 | 165 | -------------------------------------------------------------------------------- /source/agents/agent.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import random 3 | import numpy as np 4 | 5 | 6 | class Agent: 7 | 8 | def __init__(self, gamma, T, encoding, *args, epsilon=0.1, epsilon_decay=1., epsilon_min=0., 9 | print_ev=1000, save_ev=100, **kwargs): 10 | """ 11 | Creates a new abstract reinforcement learning agent. 12 | 13 | Parameters 14 | ---------- 15 | gamma : float 16 | the discount factor in [0, 1] 17 | T : integer 18 | the maximum length of an episode 19 | encoding : function 20 | encodes the state of the task instance into a numpy array 21 | epsilon : float 22 | the initial exploration parameter for epsilon greedy (defaults to 0.1) 23 | epsilon_decay : float 24 | the amount to anneal epsilon in each time step (defaults to 1, no annealing) 25 | epsilon_min : float 26 | the minimum allowed value of epsilon (defaults to 0) 27 | print_ev : integer 28 | how often to print learning progress 29 | save_ev : 30 | how often to save learning progress to internal memory 31 | """ 32 | self.gamma = gamma 33 | self.T = T 34 | if encoding is None: 35 | encoding = lambda s: s 36 | self.encoding = encoding 37 | self.epsilon_init = epsilon 38 | self.epsilon_decay = epsilon_decay 39 | self.epsilon_min = epsilon_min 40 | self.print_ev = print_ev 41 | self.save_ev = save_ev 42 | if len(args) != 0 or len(kwargs) != 0: 43 | print(self.__class__.__name__ + ' ignoring parameters ' + str(args) + ' and ' + str(kwargs)) 44 | 45 | def get_Q_values(self, s, s_enc): 46 | """ 47 | Returns the value function evaluated in the specified state. 48 | An array of size [n_batch, n_actions], where: 49 | n_batch is the number of states provided 50 | n_actions is the number of possible actions in the current task 51 | 52 | Parameters 53 | ---------- 54 | s : iterable of object 55 | raw states of the task 56 | s_enc : np.ndarray 57 | collection of encoded states of the shape [n_batch, -1] 58 | 59 | Returns 60 | ------- 61 | np.ndarray : array of the shape [n_batch, n_actions] returning the estimated 62 | Q-values of the current instance 63 | """ 64 | raise NotImplementedError 65 | 66 | def train_agent(self, s, s_enc, a, r, s1, s1_enc, gamma): 67 | """ 68 | Trains the current agent on the provided transition. 
69 | 
70 |         Parameters
71 |         ----------
72 |         s : object
73 |             the raw state of the task
74 |         s_enc : np.ndarray
75 |             the encoded state of the task
76 |         a : integer
77 |             the action taken in state s
78 |         r : float
79 |             the reward obtained in the current transition
80 |         s1 : object
81 |             the raw successor state of the task
82 |         s1_enc : np.ndarray
83 |             the encoded next state s1 of the task
84 |         gamma : float
85 |             the discount factor to apply to the transition; should be zero if the transition is terminal
86 |         """
87 |         raise NotImplementedError
88 |     
89 |     # ===========================================================================
90 |     # TASK MANAGEMENT
91 |     # ===========================================================================
92 |     def reset(self):
93 |         """
94 |         Resets the agent, including all value functions and internal memory/history.
95 |         """
96 |         self.tasks = []
97 |         self.phis = []
98 |         
99 |         # reset counter history
100 |         self.cum_reward = 0.
101 |         self.reward_hist = []
102 |         self.cum_reward_hist = []
103 |     
104 |     def add_training_task(self, task):
105 |         """
106 |         Adds a training task to be trained by the agent.
107 |         """
108 |         self.tasks.append(task)
109 |         self.n_tasks = len(self.tasks)
110 |         self.phis.append(task.features)
111 |         if self.n_tasks == 1:
112 |             self.n_actions = task.action_count()
113 |             self.n_features = task.feature_dim()
114 |             if self.encoding == 'task':  # the special value 'task' defers encoding to the task's own encode()
115 |                 self.encoding = task.encode
116 |     
117 |     def set_active_training_task(self, index):
118 |         """
119 |         Sets the task at the requested index as the current task the agent will train on.
120 |         The index is based on the order in which the training task was added to the agent.
121 |         """
122 |         
123 |         # set the task
124 |         self.task_index = index
125 |         self.active_task = self.tasks[index]
126 |         self.phi = self.phis[index]
127 |         
128 |         # reset task-dependent counters
129 |         self.s = self.s_enc = None
130 |         self.new_episode = True
131 |         self.episode, self.episode_reward = 0, 0.
132 |         self.steps_since_last_episode, self.reward_since_last_episode = 0, 0.
133 |         self.steps, self.reward = 0, 0.
134 |         self.epsilon = self.epsilon_init
135 |         self.episode_reward_hist = []
136 |     
137 |     # ===========================================================================
138 |     # TRAINING
139 |     # ===========================================================================
140 |     def _epsilon_greedy(self, q):
141 |         assert q.size == self.n_actions
142 |         
143 |         # with probability epsilon, explore with a uniformly random action; otherwise act greedily
144 |         if random.random() <= self.epsilon:
145 |             a = random.randrange(self.n_actions)
146 |         else:
147 |             a = np.argmax(q)
148 |         
149 |         # decrease the exploration gradually
150 |         self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)
151 |         
152 |         return a
153 |     
154 |     def get_progress_strings(self):
155 |         """
156 |         Returns a pair of strings that display the agent's learning progress. These include
157 |         the current training task index, steps and episodes of training, the exploration parameter,
158 |         the previous episode reward and the cumulative reward, and other information
159 |         depending on the current implementation.
160 | """ 161 | sample_str = 'task \t {} \t steps \t {} \t episodes \t {} \t eps \t {:.4f}'.format( 162 | self.task_index, self.steps, self.episode, self.epsilon) 163 | reward_str = 'ep_reward \t {:.4f} \t reward \t {:.4f}'.format( 164 | self.episode_reward, self.reward) 165 | return sample_str, reward_str 166 | 167 | def next_sample(self, viewer=None, n_view_ev=None): 168 | """ 169 | Updates the agent by performing one interaction with the current training environment. 170 | This function performs all interactions with the environment, data and storage manipulations, 171 | training the agent, and updating all history. 172 | 173 | Parameters 174 | ---------- 175 | viewer : object 176 | a viewer that displays the agent's exploration behavior on the task based on its update() method 177 | (defaults to None) 178 | n_view_ev : integer 179 | how often (in training episodes) to invoke the viewer to display agent's learned behavior 180 | (defaults to None) 181 | """ 182 | 183 | # start a new episode 184 | if self.new_episode: 185 | self.s = self.active_task.initialize() 186 | self.s_enc = self.encoding(self.s) 187 | self.new_episode = False 188 | self.episode += 1 189 | self.steps_since_last_episode = 0 190 | self.episode_reward = self.reward_since_last_episode 191 | self.reward_since_last_episode = 0. 192 | if self.episode > 1: 193 | self.episode_reward_hist.append(self.episode_reward) 194 | 195 | # compute the Q-values in the current state 196 | q = self.get_Q_values(self.s, self.s_enc) 197 | 198 | # choose an action using the epsilon-greedy policy 199 | a = self._epsilon_greedy(q) 200 | 201 | # take action a and observe reward r and next state s' 202 | s1, r, terminal = self.active_task.transition(a) 203 | s1_enc = self.encoding(s1) 204 | if terminal: 205 | gamma = 0. 206 | self.new_episode = True 207 | else: 208 | gamma = self.gamma 209 | 210 | # train the agent 211 | self.train_agent(self.s, self.s_enc, a, r, s1, s1_enc, gamma) 212 | 213 | # update counters 214 | self.s, self.s_enc = s1, s1_enc 215 | self.steps += 1 216 | self.reward += r 217 | self.steps_since_last_episode += 1 218 | self.reward_since_last_episode += r 219 | self.cum_reward += r 220 | 221 | if self.steps_since_last_episode >= self.T: 222 | self.new_episode = True 223 | 224 | if self.steps % self.save_ev == 0: 225 | self.reward_hist.append(self.reward) 226 | self.cum_reward_hist.append(self.cum_reward) 227 | 228 | # viewing 229 | if viewer is not None and self.episode % n_view_ev == 0: 230 | viewer.update() 231 | 232 | # printing 233 | if self.steps % self.print_ev == 0: 234 | print('\t'.join(self.get_progress_strings())) 235 | 236 | def train_on_task(self, train_task, n_samples, viewer=None, n_view_ev=None): 237 | """ 238 | Trains the agent on the current task. 
239 | 
240 |         Parameters
241 |         ----------
242 |         train_task : Task
243 |             the training task instance
244 |         n_samples : integer
245 |             how many samples should be generated and used to train the agent
246 |         viewer : object
247 |             a viewer that displays the agent's exploration behavior on the task based on its update() method
248 |             (defaults to None)
249 |         n_view_ev : integer
250 |             how often (in training episodes) to invoke the viewer to display the agent's learned behavior
251 |             (defaults to None)
252 |         """
253 |         self.add_training_task(train_task)
254 |         self.set_active_training_task(self.n_tasks - 1)
255 |         for _ in range(n_samples):
256 |             self.next_sample(viewer, n_view_ev)
257 |     
258 |     def train(self, train_tasks, n_samples, viewers=None, n_view_ev=None):
259 |         """
260 |         Trains the agent on a set of tasks.
261 |         
262 |         Parameters
263 |         ----------
264 |         train_tasks : iterable of Task
265 |             the training task instances
266 |         n_samples : integer
267 |             how many samples should be generated and used to train the agent on each task
268 |         viewers : iterable of object
269 |             viewers that display the agent's exploration behavior on each task based on their update() methods
270 |             (defaults to None)
271 |         n_view_ev : integer
272 |             how often (in training episodes) to invoke each viewer to display the agent's learned behavior
273 |             (defaults to None)
274 |         """
275 |         if viewers is None:
276 |             viewers = [None] * len(train_tasks)
277 |         self.reset()
278 |         for train_task, viewer in zip(train_tasks, viewers):
279 |             self.train_on_task(train_task, n_samples, viewer, n_view_ev)
280 | 
281 | 
--------------------------------------------------------------------------------
/source/features/successor.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | import numpy as np
3 | 
4 | 
5 | class SF:
6 |     
7 |     def __init__(self, learning_rate_w, *args, use_true_reward=False, **kwargs):
8 |         """
9 |         Creates a new abstract successor feature representation.
10 |         
11 |         Parameters
12 |         ----------
13 |         learning_rate_w : float
14 |             the learning rate to use for learning the reward weights using gradient descent
15 |         use_true_reward : boolean
16 |             whether to use the true reward weights from the environment or to learn them
17 |             using gradient descent
18 |         """
19 |         self.alpha_w = learning_rate_w
20 |         self.use_true_reward = use_true_reward
21 |         if len(args) != 0 or len(kwargs) != 0:
22 |             print(self.__class__.__name__ + ' ignoring parameters ' + str(args) + ' and ' + str(kwargs))
23 |     
24 |     def build_successor(self, task, source=None):
25 |         """
26 |         Builds a new successor feature map for the specified task. This method should not be called
27 |         directly; call add_training_task instead.
28 |         
29 |         Parameters
30 |         ----------
31 |         task : Task
32 |             a new MDP environment for which to learn successor features
33 |         source : integer
34 |             if specified and not None, the parameters of the successor features for the task at the source
35 |             index should be copied to the new successor features, as suggested in [1]
36 |         
37 |         Returns
38 |         -------
39 |         object : the successor feature representation for the new task, which can be a Keras model,
40 |             a lookup table (dictionary) or another learning representation
41 |         """
42 |         raise NotImplementedError
43 |     
44 |     def get_successor(self, state, policy_index):
45 |         """
46 |         Evaluates the successor features in given states for the specified task.
47 | 48 | Parameters 49 | ---------- 50 | state : object 51 | a state or collection of states of the MDP 52 | policy_index : integer 53 | the index of the task whose successor features to evaluate 54 | 55 | Returns 56 | ------- 57 | np.ndarray : the evaluation of the successor features, which is of shape 58 | [n_batch, n_actions, n_features], where 59 | n_batch is the number of states in the state argument 60 | n_actions is the number of actions of the MDP 61 | n_features is the number of features in the SF representation 62 | """ 63 | raise NotImplementedError 64 | 65 | def get_successors(self, state): 66 | """ 67 | Evaluates the successor features in given states for all tasks. 68 | 69 | Parameters 70 | ---------- 71 | state : object 72 | a state or collection of states of the MDP 73 | 74 | Returns 75 | ------- 76 | np.ndarray : the evaluation of the successor features, which is of shape 77 | [n_batch, n_tasks, n_actions, n_features], where 78 | n_batch is the number of states in the state argument 79 | n_tasks is the number of tasks 80 | n_actions is the number of actions of the MDP 81 | n_features is the number of features in the SF representation 82 | """ 83 | raise NotImplementedError 84 | 85 | def update_successor(self, transitions, policy_index): 86 | """ 87 | Updates the successor representation by training it on the given transition. 88 | 89 | Parameters 90 | ---------- 91 | transitions : object 92 | collection of transitions 93 | policy_index : integer 94 | the index of the task whose successor features to update 95 | """ 96 | raise NotImplementedError 97 | 98 | def reset(self): 99 | """ 100 | Removes all trained successor feature representations from the current object, all learned rewards, 101 | and all task information. 102 | """ 103 | self.n_tasks = 0 104 | self.psi = [] 105 | self.true_w = [] 106 | self.fit_w = [] 107 | self.gpi_counters = [] 108 | 109 | def add_training_task(self, task, source=None): 110 | """ 111 | Adds a successor feature representation for the specified task. 112 | 113 | Parameters 114 | ---------- 115 | task : Task 116 | a new MDP environment for which to learn successor features 117 | source : integer 118 | if specified and not None, the parameters of the successor features for the task at the source 119 | index should be copied to the new successor features, as suggested in [1] 120 | """ 121 | 122 | # add successor features to the library 123 | self.psi.append(self.build_successor(task, source)) 124 | self.n_tasks = len(self.psi) 125 | 126 | # build new reward function 127 | true_w = task.get_w() 128 | self.true_w.append(true_w) 129 | if self.use_true_reward: 130 | fit_w = true_w 131 | else: 132 | n_features = task.feature_dim() 133 | fit_w = np.random.uniform(low=-0.01, high=0.01, size=(n_features, 1)) 134 | self.fit_w.append(fit_w) 135 | 136 | # add statistics 137 | for i in range(len(self.gpi_counters)): 138 | self.gpi_counters[i] = np.append(self.gpi_counters[i], 0) 139 | self.gpi_counters.append(np.zeros((self.n_tasks,), dtype=int)) 140 | 141 | def update_reward(self, phi, r, task_index, exact=False): 142 | """ 143 | Updates the reward parameters for the given task based on the observed reward sample 144 | from the environment. 
145 | 
146 |         Parameters
147 |         ----------
148 |         phi : np.ndarray
149 |             the state features
150 |         r : float
151 |             the observed reward from the MDP
152 |         task_index : integer
153 |             the index of the task from which this reward was sampled
154 |         exact : boolean
155 |             if True, checks that the sampled reward matches the reward implied by the true weights (and raises otherwise)
156 |         """
157 |         
158 |         # update the reward weights with a stochastic gradient step on the squared linear-regression error
159 |         w = self.fit_w[task_index]
160 |         phi = phi.reshape(w.shape)
161 |         r_fit = np.sum(phi * w)
162 |         self.fit_w[task_index] = w + self.alpha_w * (r - r_fit) * phi
163 |         
164 |         # validate reward
165 |         r_true = np.sum(phi * self.true_w[task_index])
166 |         if exact and not np.allclose(r, r_true):
167 |             raise Exception('sampled reward {} != linear reward {} - please check task {}!'.format(
168 |                 r, r_true, task_index))
169 |     
170 |     def GPE_w(self, state, policy_index, w):
171 |         """
172 |         Implements generalized policy evaluation according to [1]. In summary, this uses the
173 |         learned reward parameters of one task and the successor features of a policy to estimate the Q-values of
174 |         the policy if it were executed in that task.
175 |         
176 |         Parameters
177 |         ----------
178 |         state : object
179 |             a state or collection of states of the MDP
180 |         policy_index : integer
181 |             the index of the task whose policy to evaluate
182 |         w : numpy array
183 |             reward parameters of the task in which to evaluate the policy
184 |         
185 |         Returns
186 |         -------
187 |         np.ndarray : the estimated Q-values of shape [n_batch, n_actions], where
188 |             n_batch is the number of states in the state argument
189 |             n_actions is the number of actions in the MDP
190 |         """
191 |         psi = self.get_successor(state, policy_index)
192 |         q = (psi @ w)[:, :, 0]  # shape (n_batch, n_actions)
193 |         return q
194 |     
195 |     def GPE(self, state, policy_index, task_index):
196 |         """
197 |         Implements generalized policy evaluation according to [1]. In summary, this uses the
198 |         learned reward parameters of one task and the successor features of a policy to estimate the Q-values of
199 |         the policy if it were executed in that task.
200 |         
201 |         Parameters
202 |         ----------
203 |         state : object
204 |             a state or collection of states of the MDP
205 |         policy_index : integer
206 |             the index of the task whose policy to evaluate
207 |         task_index : integer
208 |             the index of the task (i.e. its reward function) in which to evaluate the policy
209 |         
210 |         Returns
211 |         -------
212 |         np.ndarray : the estimated Q-values of shape [n_batch, n_actions], where
213 |             n_batch is the number of states in the state argument
214 |             n_actions is the number of actions in the MDP
215 |         """
216 |         return self.GPE_w(state, policy_index, self.fit_w[task_index])
217 |     
218 |     def GPI_w(self, state, w):
219 |         """
220 |         Implements generalized policy improvement according to [1].
221 | 
222 |         Parameters
223 |         ----------
224 |         state : object
225 |             a state or collection of states of the MDP
226 |         w : numpy array
227 |             the reward parameters of the task to control
228 |         
229 |         Returns
230 |         -------
231 |         np.ndarray : the Q-values of every stored policy evaluated under w, from which GPI selects actions,
232 |             of shape [n_batch, n_tasks, n_actions], where:
233 |             n_batch is the number of states in the state argument
234 |             n_tasks is the number of tasks
235 |             n_actions is the number of actions in the MDP
236 |         np.ndarray : the task (policy) that attains the maximum Q-value in each state of the batch under GPI
237 |         """
238 |         psi = self.get_successors(state)
239 |         q = (psi @ w)[:, :, :, 0]  # shape (n_batch, n_tasks, n_actions)
240 |         task = np.squeeze(np.argmax(np.max(q, axis=2), axis=1))  # shape (n_batch,)
241 |         return q, task
242 |     
243 |     def GPI(self, state, task_index, update_counters=False):
244 |         """
245 |         Implements generalized policy improvement according to [1].
246 |         
247 |         Parameters
248 |         ----------
249 |         state : object
250 |             a state or collection of states of the MDP
251 |         task_index : integer
252 |             the index of the task in which the GPI action will be used
253 |         update_counters : boolean
254 |             whether or not to keep track of which policies are active in GPI
255 |         
256 |         Returns
257 |         -------
258 |         np.ndarray : the Q-values of every stored policy evaluated under the fitted reward weights of the task,
259 |             of shape [n_batch, n_tasks, n_actions], where:
260 |             n_batch is the number of states in the state argument
261 |             n_tasks is the number of tasks
262 |             n_actions is the number of actions in the MDP
263 |         np.ndarray : the task (policy) that attains the maximum Q-value in each state of the batch under GPI
264 |         """
265 |         q, task = self.GPI_w(state, self.fit_w[task_index])
266 |         if update_counters:
267 |             self.gpi_counters[task_index][task] += 1
268 |         return q, task
269 |     
270 |     def GPI_usage_percent(self, task_index):
271 |         """
272 |         Computes the fraction of actions that were transferred from other tasks' policies by GPI.
273 |         
274 |         Parameters
275 |         ----------
276 |         task_index : integer
277 |             the index of the task
278 |         
279 |         Returns
280 |         -------
281 |         float : the fraction of GPI decisions in the given task for which the active policy
282 |             belonged to a different task
283 |         """
284 |         counts = self.gpi_counters[task_index]
285 |         return 1. - (float(counts[task_index]) / np.sum(counts))
286 | 
--------------------------------------------------------------------------------
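
Usage sketch for /source/tasks/gridworld.py (the Shapes task listed earlier). This is a minimal, hypothetical example and is not part of the repository: the maze layout and shape rewards below are made up, and it assumes the source/ directory is on the Python path so that tasks.gridworld is importable. It exercises transition(), encode() and features(), and checks that every observed reward is reproduced by the linear model phi(s, a, s') . w returned by get_w().

# Hypothetical usage of the Shapes gridworld task (maze and rewards invented for illustration).
import random

import numpy as np

from tasks.gridworld import Shapes  # assumes source/ is on the Python path

# '_' are possible start cells, 'X' is a wall, 'G' is the goal, digits are shape types
maze = np.array([['_', ' ', '1'],
                 [' ', 'X', ' '],
                 ['2', ' ', 'G']])
shape_rewards = {'1': 1.0, '2': 0.5}  # reward for collecting each shape type

task = Shapes(maze, shape_rewards)
w = task.get_w()  # true reward weights: one per shape type, plus one for reaching the goal

s = task.initialize()
assert task.encode(s).shape == (1, task.encode_dim())  # one-hot row, one-hot column, collected flags

for _ in range(100):
    a = random.choice([Shapes.LEFT, Shapes.UP, Shapes.RIGHT, Shapes.DOWN])
    s1, r, done = task.transition(a)
    phi = task.features(s, a, s1)
    # the reward is linear in the features: r = phi . w
    assert np.isclose(r, float(phi @ w))
    s = task.initialize() if done else s1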
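The SF class in /source/features/successor.py leaves build_successor and get_successor(s) abstract, but its two core computations are concrete: the reward-weight update w <- w + alpha_w * (r - phi . w) * phi used in update_reward, and the evaluation of every stored policy under one task's reward weights used in GPI_w. The snippet below is a self-contained numpy sketch of exactly those two steps on randomly generated placeholder arrays; it does not call the repository code, and all array shapes simply follow the docstrings above.

# Stand-alone numpy illustration of the two core SF computations:
# (1) the reward-weight update used in SF.update_reward, and
# (2) the Q-value / active-policy computation used in SF.GPI_w.
# All arrays are random placeholders; shapes follow the docstrings in successor.py.
import numpy as np

rng = np.random.default_rng(0)
n_batch, n_tasks, n_actions, n_features = 1, 3, 4, 5
alpha_w = 0.5

# (1) one gradient step on the squared error of the linear reward model r ~ phi . w
w = np.zeros((n_features, 1))
phi = rng.random((n_features, 1))
r = 0.7
r_fit = np.sum(phi * w)
w = w + alpha_w * (r - r_fit) * phi                  # same update as update_reward

# (2) GPI: evaluate every stored policy's successor features under w and pick,
# per state, the policy whose greedy action has the highest value
psi = rng.random((n_batch, n_tasks, n_actions, n_features))
q = (psi @ w)[:, :, :, 0]                            # shape (n_batch, n_tasks, n_actions)
best_task = np.squeeze(np.argmax(np.max(q, axis=2), axis=1))
gpi_action = np.argmax(np.max(q, axis=1), axis=-1)   # greedy GPI action per state
print(q.shape, int(best_task), gpi_action)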