├── source ├── agents │ ├── __init__.py │ ├── ql.py │ ├── sfql.py │ ├── buffer.py │ ├── sfdqn.py │ ├── dqn.py │ └── agent.py ├── features │ ├── __init__.py │ ├── tabular.py │ ├── deep.py │ └── successor.py ├── tasks │ ├── __init__.py │ ├── task.py │ ├── reacher.py │ └── gridworld.py ├── utils │ ├── __init__.py │ ├── config.py │ └── stats.py ├── figures │ ├── sfql_return.png │ └── sfdqn_return.png ├── configs │ ├── reacher.cfg │ └── gridworld.cfg ├── main_sfql.py └── main_sfdqn.py ├── .settings └── org.eclipse.core.resources.prefs ├── LICENSE ├── README.md └── .gitignore /source/agents/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /source/features/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /source/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /source/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /source/figures/sfql_return.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mike-gimelfarb/deep-successor-features-for-transfer/HEAD/source/figures/sfql_return.png -------------------------------------------------------------------------------- /source/figures/sfdqn_return.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mike-gimelfarb/deep-successor-features-for-transfer/HEAD/source/figures/sfdqn_return.png -------------------------------------------------------------------------------- /source/utils/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from ast import literal_eval 3 | from collections import defaultdict 4 | import configparser 5 | import os 6 | from pathlib import Path 7 | 8 | 9 | def parse_config_file(name): 10 | raw_path = Path(__file__).parent.parent 11 | config_path = os.path.join(raw_path, 'configs', name) 12 | config = configparser.RawConfigParser() 13 | config.optionxform = str 14 | config.read(config_path) 15 | section_dict = defaultdict() 16 | for section in config.sections(): 17 | section_dict[section] = {k: literal_eval(v) for k, v in config.items(section)} 18 | return section_dict 19 | -------------------------------------------------------------------------------- /source/utils/stats.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import numpy as np 3 | 4 | 5 | class OnlineMeanVariance: 6 | 7 | def __init__(self): 8 | self.count = 0 9 | self.mean = 0. 10 | self.M2 = 0. 11 | 12 | def update(self, x): 13 | x = np.array(x) 14 | self.count += 1 15 | delta = x - self.mean 16 | self.mean = self.mean + delta / float(self.count) 17 | delta2 = x - self.mean 18 | self.M2 = self.M2 + delta * delta2 19 | 20 | def calculate_variance(self): 21 | return self.M2 / (self.count - 1.) 
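    # Note (added comment): update() implements Welford's online algorithm: `mean` holds the
    # running sample mean and `M2` the running sum of squared deviations, so
    # calculate_variance() returns the unbiased sample variance M2 / (count - 1).
    # main_sfql.py feeds one reward-history array per trial into update() and uses
    # calculate_standard_error() to draw the shaded error bands around the learning curves.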
22 | 23 | def calculate_standard_error(self): 24 | return np.sqrt(self.calculate_variance() / float(self.count)) 25 | 26 | -------------------------------------------------------------------------------- /.settings/org.eclipse.core.resources.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | encoding//source/agents/agent.py=UTF-8 3 | encoding//source/agents/buffer.py=UTF-8 4 | encoding//source/agents/dqn.py=UTF-8 5 | encoding//source/agents/ql.py=UTF-8 6 | encoding//source/agents/sfdqn.py=UTF-8 7 | encoding//source/agents/sfql.py=UTF-8 8 | encoding//source/features/deep.py=UTF-8 9 | encoding//source/features/successor.py=UTF-8 10 | encoding//source/features/tabular.py=UTF-8 11 | encoding//source/main_sfdqn.py=UTF-8 12 | encoding//source/main_sfql.py=UTF-8 13 | encoding//source/tasks/gridworld.py=UTF-8 14 | encoding//source/tasks/reacher.py=UTF-8 15 | encoding//source/tasks/task.py=UTF-8 16 | encoding//source/utils/config.py=UTF-8 17 | encoding//source/utils/stats.py=UTF-8 18 | -------------------------------------------------------------------------------- /source/configs/reacher.cfg: -------------------------------------------------------------------------------- 1 | [GENERAL] 2 | n_samples=100000 3 | 4 | [TASK] 5 | train_targets=[(0.14, 0.0), (-0.14, 0.0), (0.0, 0.14), (0.0, -0.14)] 6 | test_targets=[(0.22, 0.0), (-0.22, 0.0), (0.0, 0.22), (0.0, -0.22), (0.1, 0.1), (0.1, -0.1), (-0.1, 0.1), (-0.1, -0.1)] 7 | 8 | [AGENT] 9 | gamma=0.9 10 | epsilon=0.1 11 | test_epsilon=0.03 12 | T=500 13 | print_ev=1000 14 | save_ev=200 15 | n_test_ev=1000 16 | encoding="task" 17 | 18 | [DQN] 19 | target_update_ev=1000 20 | keras_params={ 21 | "n_neurons" : [256, 256], 22 | "activations" : ["relu", "relu"], 23 | "learning_rate" : 0.001} 24 | buffer_params={ 25 | "n_samples" : 1000000, 26 | "n_batch" : 32} 27 | 28 | [SFDQN] 29 | learning_rate_w=0.5 30 | use_true_reward=True 31 | use_gpi=True 32 | target_update_ev=1000 33 | keras_params={ 34 | "n_neurons" : [256, 256], 35 | "activations" : ["relu", "relu"], 36 | "learning_rate" : 0.001} 37 | buffer_params={ 38 | "n_samples" : 1000000, 39 | "n_batch" : 32} -------------------------------------------------------------------------------- /source/agents/ql.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from collections import defaultdict 3 | import numpy as np 4 | 5 | from agents.agent import Agent 6 | 7 | 8 | class QL(Agent): 9 | 10 | def __init__(self, learning_rate, *args, **kwargs): 11 | """ 12 | Creates a new tabular Q-learning agent. 
13 | 14 | Parameters 15 | ---------- 16 | learning_rate : float 17 | the learning rate to use in order to update Q-values 18 | """ 19 | super(QL, self).__init__(*args, **kwargs) 20 | self.alpha = learning_rate 21 | 22 | def get_Q_values(self, s, s_enc): 23 | return self.Q[s] 24 | 25 | def train_agent(self, s, s_enc, a, r, s1, s1_enc, gamma): 26 | target = r + gamma * np.max(self.Q[s1]) 27 | error = target - self.Q[s][a] 28 | self.Q[s][a] += self.alpha * error 29 | 30 | def set_active_training_task(self, index): 31 | super(QL, self).set_active_training_task(index) 32 | self.Q = defaultdict(lambda: np.random.uniform(low=-0.01, high=0.01, size=(self.n_actions,))) 33 | 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Successor Features Framework: Copyright (c) 2021 Mike Gimelfarb 4 | Wrappers to the PyBullet-Gym Package: Copyright (c) 2018 Benjamin Ellenberger 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
23 | -------------------------------------------------------------------------------- /source/configs/gridworld.cfg: -------------------------------------------------------------------------------- 1 | [GENERAL] 2 | n_samples=20000 3 | n_tasks=20 4 | n_trials=20 5 | 6 | [TASK] 7 | maze=[ 8 | ['1', ' ', ' ', ' ', ' ', '2', 'X', ' ', ' ', ' ', ' ', ' ', 'G'], 9 | [' ', ' ', ' ', ' ', ' ', ' ', 'X', ' ', ' ', ' ', ' ', ' ', ' '], 10 | [' ', ' ', ' ', ' ', ' ', ' ', '1', ' ', ' ', ' ', ' ', ' ', ' '], 11 | [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '], 12 | [' ', ' ', ' ', ' ', ' ', ' ', 'X', ' ', ' ', ' ', ' ', ' ', ' '], 13 | ['2', ' ', ' ', ' ', ' ', '3', 'X', ' ', ' ', ' ', ' ', ' ', ' '], 14 | ['X', 'X', '3', ' ', 'X', 'X', 'X', 'X', 'X', ' ', '1', 'X', 'X'], 15 | [' ', ' ', ' ', ' ', ' ', ' ', 'X', '2', ' ', ' ', ' ', ' ', '3'], 16 | [' ', ' ', ' ', ' ', ' ', ' ', 'X', ' ', ' ', ' ', ' ', ' ', ' '], 17 | [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '], 18 | [' ', ' ', ' ', ' ', ' ', ' ', '2', ' ', ' ', ' ', ' ', ' ', ' '], 19 | [' ', ' ', ' ', ' ', ' ', ' ', 'X', ' ', ' ', ' ', ' ', ' ', ' '], 20 | ['_', ' ', ' ', ' ', ' ', ' ', 'X', '3', ' ', ' ', ' ', ' ', '1']] 21 | 22 | [AGENT] 23 | gamma=0.95 24 | epsilon=0.15 25 | T=200 26 | print_ev=2000 27 | save_ev=200 28 | encoding=None 29 | 30 | [SFQL] 31 | learning_rate=0.5 32 | learning_rate_w=0.5 33 | use_true_reward=False 34 | 35 | [QL] 36 | learning_rate=0.5 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # successor-features-for-transfer 2 | A reusable framework and independent implementation for successor features (SF) for transfer in (deep) reinforcement learning using keras, based on [1]. 3 | 4 | Discrete four-room domain: 5 | 6 | ![](https://github.com/mike-gimelfarb/successor-features-for-transfer/blob/main/source/figures/sfql_return.png) 7 | 8 | Deep learning for reacher domain (MuJoCo): 9 | 10 | ![](https://github.com/mike-gimelfarb/successor-features-for-transfer/blob/main/source/figures/sfdqn_return.png) 11 | 12 | Currently supports: 13 | - tabular SF representations for discrete environments, based on an efficient hash table representation 14 | - deep neural network SF representations for large or continuous-state environments, based on keras; allows existing keras models or custom architectures (e.g. CNNs) as inputs for easy training and tuning 15 | - tasks with pre-defined state features only, although support for training features on-the-fly may be added later 16 | - tasks structured according to the OpenAI gym framework 17 | 18 | # Requirements 19 | - python 3.8 or later 20 | - tensorflow 2.3 or later 21 | - pybullet 3.0.8 and pybullet-gym 0.1 (for reacher domain) 22 | 23 | # References 24 | [1] Barreto, André, et al. "Successor features for transfer in reinforcement learning." Advances in neural information processing systems. 2017. 25 | [2] Dayan, Peter. "Improving generalization for temporal difference learning: The successor representation." Neural Computation 5.4 (1993): 613-624. 
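
# Example usage (sketch)
The snippet below is a condensed, illustrative version of `source/main_sfql.py`: it builds the tabular SFQL agent from `configs/gridworld.cfg` and trains it on a sequence of four-room tasks with randomly drawn shape rewards.

```python
import numpy as np

from agents.sfql import SFQL
from features.tabular import TabularSF
from tasks.gridworld import Shapes
from utils.config import parse_config_file

params = parse_config_file('gridworld.cfg')      # {section: {key: value}}
agent = SFQL(TabularSF(**params['SFQL']), **params['AGENT'])

agent.reset()
for _ in range(params['GENERAL']['n_tasks']):
    # each task re-draws the rewards of the three shape types '1', '2', '3'
    rewards = dict(zip(['1', '2', '3'], np.random.uniform(-1.0, 1.0, size=3)))
    task = Shapes(maze=np.array(params['TASK']['maze']), shape_rewards=rewards)
    agent.train_on_task(task, params['GENERAL']['n_samples'])
```

The full experiment, including the comparison against plain Q-learning and the plot saved to `figures/sfql_return.png`, is run by `source/main_sfql.py`.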
26 | -------------------------------------------------------------------------------- /source/features/tabular.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from collections import defaultdict 3 | from copy import deepcopy 4 | import numpy as np 5 | 6 | from features.successor import SF 7 | 8 | 9 | class TabularSF(SF): 10 | """ 11 | A successor feature representation implemented using lookup tables. Storage is lazy and implemented efficiently 12 | using defaultdict. 13 | """ 14 | 15 | def __init__(self, learning_rate, *args, 16 | noise_init=lambda size: np.random.uniform(-0.01, 0.01, size=size), **kwargs): 17 | """ 18 | Creates a new tabular representation of successor features. 19 | 20 | Parameters 21 | ---------- 22 | learning_rate : float 23 | the learning rate 24 | noise_init : function 25 | instruction to initialize action-values, defaults to Uniform[-0.01, 0.01] 26 | """ 27 | super(TabularSF, self).__init__(*args, **kwargs) 28 | self.alpha = learning_rate 29 | self.noise_init = noise_init 30 | 31 | def build_successor(self, task, source=None): 32 | if source is None or len(self.psi) == 0: 33 | n_actions = task.action_count() 34 | n_features = task.feature_dim() 35 | return defaultdict(lambda: self.noise_init((n_actions, n_features))) 36 | else: 37 | return deepcopy(self.psi[source]) 38 | 39 | def get_successor(self, state, policy_index): 40 | return np.expand_dims(self.psi[policy_index][state], axis=0) 41 | 42 | def get_successors(self, state): 43 | return np.expand_dims(np.array([psi[state] for psi in self.psi]), axis=0) 44 | 45 | def update_successor(self, transitions, policy_index): 46 | for state, action, phi, next_state, next_action, gamma in transitions: 47 | psi = self.psi[policy_index] 48 | targets = phi.flatten() + gamma * psi[next_state][next_action,:] 49 | errors = targets - psi[state][action,:] 50 | psi[state][action,:] = psi[state][action,:] + self.alpha * errors 51 | 52 | -------------------------------------------------------------------------------- /source/agents/sfql.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import numpy as np 3 | 4 | from agents.agent import Agent 5 | 6 | 7 | class SFQL(Agent): 8 | 9 | def __init__(self, lookup_table, *args, use_gpi=True, **kwargs): 10 | """ 11 | Creates a new tabular successor feature agent. 
12 | 13 | Parameters 14 | ---------- 15 | lookup_table : TabularSF 16 | a tabular successor feature representation 17 | use_gpi : boolean 18 | whether or not to use transfer learning (defaults to True) 19 | """ 20 | super(SFQL, self).__init__(*args, **kwargs) 21 | self.sf = lookup_table 22 | self.use_gpi = use_gpi 23 | 24 | def get_Q_values(self, s, s_enc): 25 | q, self.c = self.sf.GPI(s_enc, self.task_index, update_counters=self.use_gpi) 26 | if not self.use_gpi: 27 | self.c = self.task_index 28 | return q[:, self.c,:] 29 | 30 | def train_agent(self, s, s_enc, a, r, s1, s1_enc, gamma): 31 | 32 | # update w 33 | t = self.task_index 34 | phi = self.phi(s, a, s1) 35 | self.sf.update_reward(phi, r, t) 36 | 37 | # update SF for the current task t 38 | if self.use_gpi: 39 | q1, _ = self.sf.GPI(s1_enc, t) 40 | q1 = np.max(q1[0,:,:], axis=0) 41 | else: 42 | q1 = self.sf.GPE(s1_enc, t, t)[0,:] 43 | next_action = np.argmax(q1) 44 | transitions = [(s_enc, a, phi, s1_enc, next_action, gamma)] 45 | self.sf.update_successor(transitions, t) 46 | 47 | # update SF for source task c 48 | if self.c != t: 49 | q1 = self.sf.GPE(s1_enc, self.c, self.c) 50 | next_action = np.argmax(q1) 51 | transitions = [(s_enc, a, phi, s1_enc, next_action, gamma)] 52 | self.sf.update_successor(transitions, self.c) 53 | 54 | def reset(self): 55 | super(SFQL, self).reset() 56 | self.sf.reset() 57 | 58 | def add_training_task(self, task): 59 | super(SFQL, self).add_training_task(task) 60 | self.sf.add_training_task(task, -1) 61 | 62 | def get_progress_strings(self): 63 | sample_str, reward_str = super(SFQL, self).get_progress_strings() 64 | gpi_percent = self.sf.GPI_usage_percent(self.task_index) 65 | w_error = np.linalg.norm(self.sf.fit_w[self.task_index] - self.sf.true_w[self.task_index]) 66 | gpi_str = 'GPI% \t {:.4f} \t w_err \t {:.4f}'.format(gpi_percent, w_error) 67 | return sample_str, reward_str, gpi_str 68 | 69 | -------------------------------------------------------------------------------- /source/main_sfql.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | from agents.sfql import SFQL 6 | from agents.ql import QL 7 | from features.tabular import TabularSF 8 | from tasks.gridworld import Shapes 9 | from utils.config import parse_config_file 10 | from utils.stats import OnlineMeanVariance 11 | 12 | # general training params 13 | config_params = parse_config_file('gridworld.cfg') 14 | gen_params = config_params['GENERAL'] 15 | task_params = config_params['TASK'] 16 | agent_params = config_params['AGENT'] 17 | sfql_params = config_params['SFQL'] 18 | ql_params = config_params['QL'] 19 | 20 | 21 | # tasks 22 | def generate_task(): 23 | rewards = dict(zip(['1', '2', '3'], list(np.random.uniform(low=-1.0, high=1.0, size=3)))) 24 | return Shapes(maze=np.array(task_params['maze']), shape_rewards=rewards) 25 | 26 | 27 | # agents 28 | sfql = SFQL(TabularSF(**sfql_params), **agent_params) 29 | ql = QL(**agent_params, **ql_params) 30 | agents = [sfql, ql] 31 | names = ['SFQL', 'QLearning'] 32 | 33 | # train 34 | data_task_return = [OnlineMeanVariance() for _ in agents] 35 | n_trials = gen_params['n_trials'] 36 | n_samples = gen_params['n_samples'] 37 | n_tasks = gen_params['n_tasks'] 38 | for trial in range(n_trials): 39 | 40 | # train each agent on a set of tasks 41 | for agent in agents: 42 | agent.reset() 43 | for t in range(n_tasks): 44 | task = generate_task() 45 | for agent, name in zip(agents, 
names): 46 | print('\ntrial {}, solving with {}'.format(trial, name)) 47 | agent.train_on_task(task, n_samples) 48 | 49 | # update performance statistics 50 | for i, agent in enumerate(agents): 51 | data_task_return[i].update(agent.reward_hist) 52 | 53 | # plot the task return 54 | ticksize = 14 55 | textsize = 18 56 | figsize = (20, 10) 57 | 58 | plt.rc('font', size=textsize) # controls default text sizes 59 | plt.rc('axes', titlesize=textsize) # fontsize of the axes title 60 | plt.rc('axes', labelsize=textsize) # fontsize of the x and y labels 61 | plt.rc('xtick', labelsize=ticksize) # fontsize of the tick labels 62 | plt.rc('ytick', labelsize=ticksize) # fontsize of the tick labels 63 | plt.rc('legend', fontsize=ticksize) # legend fontsize 64 | 65 | plt.figure(figsize=(12, 6)) 66 | ax = plt.gca() 67 | for i, name in enumerate(names): 68 | mean = data_task_return[i].mean 69 | n_sample_per_tick = n_samples * n_tasks // mean.size 70 | x = np.arange(mean.size) * n_sample_per_tick 71 | se = data_task_return[i].calculate_standard_error() 72 | plt.plot(x, mean, label=name) 73 | ax.fill_between(x, mean - se, mean + se, alpha=0.3) 74 | plt.xlabel('sample') 75 | plt.ylabel('cumulative reward') 76 | plt.title('Cumulative Training Reward Per Task') 77 | plt.tight_layout() 78 | plt.legend(ncol=2, frameon=False) 79 | plt.savefig('figures/sfql_return.png') 80 | -------------------------------------------------------------------------------- /source/agents/buffer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import numpy as np 3 | 4 | 5 | class ReplayBuffer: 6 | 7 | def __init__(self, *args, n_samples=1000000, n_batch=32, **kwargs): 8 | """ 9 | Creates a new randomized replay buffer. 10 | 11 | Parameters 12 | ---------- 13 | n_samples : integer 14 | the maximum number of samples that can be stored in the buffer 15 | n_batch : integer 16 | the batch size 17 | """ 18 | self.n_samples = n_samples 19 | self.n_batch = n_batch 20 | 21 | def reset(self): 22 | """ 23 | Removes all samples currently stored in the buffer. 24 | """ 25 | self.buffer = np.empty(self.n_samples, dtype=object) 26 | self.index = 0 27 | self.size = 0 28 | 29 | def replay(self): 30 | """ 31 | Samples a batch of samples from the buffer randomly. If the number of samples 32 | currently in the buffer is less than the batch size, returns None. 33 | 34 | Returns 35 | ------- 36 | states : np.ndarray 37 | a collection of starting states of shape [n_batch, -1] 38 | actions : np.ndarray 39 | a collection of actions taken in the starting states of shape [n_batch,] 40 | rewards : np.ndarray: 41 | a collection of rewards (for DQN) or features (for SFDQN) obtained of shape [n_batch, -1] 42 | next_states : np.ndarray 43 | a collection of successor states of shape [n_batch, -1] 44 | gammas : np.ndarray 45 | a collection of discount factors to be applied in computing targets for training of shape [n_batch,] 46 | """ 47 | if self.size < self.n_batch: return None 48 | indices = np.random.randint(low=0, high=self.size, size=(self.n_batch,)) 49 | states, actions, rewards, next_states, gammas = zip(*self.buffer[indices]) 50 | states = np.vstack(states) 51 | actions = np.array(actions) 52 | rewards = np.vstack(rewards) 53 | next_states = np.vstack(next_states) 54 | gammas = np.array(gammas) 55 | return states, actions, rewards, next_states, gammas 56 | 57 | def append(self, state, action, reward, next_state, gamma): 58 | """ 59 | Adds the specified sample to the replay buffer. 
If the buffer is full, then the earliest added 60 | sample is removed, and the new sample is added. 61 | 62 | Parameters 63 | ---------- 64 | state : np.ndarray 65 | the encoded state of the task 66 | action : integer 67 | the action taken in state 68 | reward : float or np.ndarray 69 | the reward obtained in the current transition (for DQN) or state features (for SFDQN) 70 | next_state : np.ndarray 71 | the encoded successor state 72 | gamma : floag 73 | the effective discount factor to be applied in computing targets for training 74 | """ 75 | self.buffer[self.index] = (state, action, reward, next_state, gamma) 76 | self.size = min(self.size + 1, self.n_samples) 77 | self.index = (self.index + 1) % self.n_samples 78 | 79 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | .metadata 132 | bin/ 133 | tmp/ 134 | *.tmp 135 | *.bak 136 | *.swp 137 | *~.nib 138 | local.properties 139 | .settings/ 140 | .loadpath 141 | .recommenders 142 | 143 | # External tool builders 144 | .externalToolBuilders/ 145 | 146 | # Locally stored "Eclipse launch configurations" 147 | *.launch 148 | 149 | # PyDev specific (Python IDE for Eclipse) 150 | *.pydevproject 151 | 152 | # CDT-specific (C/C++ Development Tooling) 153 | .cproject 154 | 155 | # CDT- autotools 156 | .autotools 157 | 158 | # Java annotation processor (APT) 159 | .factorypath 160 | 161 | # PDT-specific (PHP Development Tools) 162 | .buildpath 163 | 164 | # sbteclipse plugin 165 | .target 166 | 167 | # Tern plugin 168 | .tern-project 169 | 170 | # TeXlipse plugin 171 | .texlipse 172 | 173 | # STS (Spring Tool Suite) 174 | .springBeans 175 | 176 | # Code Recommenders 177 | .recommenders/ 178 | 179 | # Annotation Processing 180 | .apt_generated/ 181 | .apt_generated_test/ 182 | 183 | # Scala IDE specific (Scala & Java development for Eclipse) 184 | .cache-main 185 | .scala_dependencies 186 | .worksheet 187 | 188 | # Uncomment this line if you wish to ignore the project description file. 189 | # Typically, this file would be tracked if it contains build/dependency configurations: 190 | #.project 191 | -------------------------------------------------------------------------------- /source/tasks/task.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | class Task: 3 | """ 4 | An abstract representation of an MDP with arbitrary state space and finite action space. 5 | """ 6 | 7 | def clone(self): 8 | """ 9 | Creates an identical copy of the current environment, for use in testing. 10 | 11 | Returns 12 | ------- 13 | Task : the copy of the current task 14 | """ 15 | raise NotImplementedError 16 | 17 | def initialize(self): 18 | """ 19 | Resets the state of the environment. 20 | 21 | Returns 22 | ------- 23 | object : the initial state of the MDP 24 | """ 25 | raise NotImplementedError 26 | 27 | def action_count(self): 28 | """ 29 | Returns the number of possible actions in the MDP. 30 | 31 | Returns 32 | ------- 33 | integer : number of possible actions 34 | """ 35 | raise NotImplementedError 36 | 37 | def transition(self, action): 38 | """ 39 | Applies the specified action in the environment, updating the state of the MDP. 
40 | 41 | Parameters 42 | ---------- 43 | action : integer 44 | the action to apply to the environment 45 | 46 | Returns 47 | ------- 48 | object : the next state of the MDP 49 | float : the immediate reward observed in the transition 50 | boolean : whether or not a terminal state has been reached 51 | """ 52 | raise NotImplementedError 53 | 54 | # =========================================================================== 55 | # STATE ENCODING FOR DEEP LEARNING 56 | # =========================================================================== 57 | def encode(self, state): 58 | """ 59 | Encodes the state of the MDP according to its canonical encoding. 60 | 61 | Parameters 62 | ---------- 63 | state : object 64 | the state of the MDP to encode 65 | 66 | Returns 67 | ------- 68 | np.ndarray : the encoding of the state 69 | """ 70 | raise NotImplementedError 71 | 72 | def encode_dim(self): 73 | """ 74 | Returns the dimension of the canonical state encoding. 75 | 76 | Returns 77 | ------- 78 | integer : the dimension of the canonical state encoding 79 | """ 80 | raise NotImplementedError 81 | 82 | # =========================================================================== 83 | # SUCCESSOR FEATURES 84 | # =========================================================================== 85 | def features(self, state, action, next_state): 86 | """ 87 | Computes the state features for the current environment, used for learning successor 88 | feature representations. First introduced in [1]. 89 | 90 | Parameters 91 | ---------- 92 | state : object 93 | the state of the MDP 94 | action : integer 95 | the action selected in the state 96 | next_state : object 97 | the next state (successor state) of the MDP 98 | 99 | Returns 100 | ------- 101 | np.ndarray : the state features of the transition 102 | 103 | References 104 | ---------- 105 | [1] Dayan, Peter. "Improving generalization for temporal difference learning: 106 | The successor representation." Neural Computation 5.4 (1993): 613-624. 107 | """ 108 | raise NotImplementedError 109 | 110 | def feature_dim(self): 111 | """ 112 | Returns the dimension of the state feature representation. 113 | 114 | Returns 115 | ------- 116 | integer : the dimension of the state feature representation 117 | """ 118 | raise NotImplementedError 119 | 120 | def get_w(self): 121 | """ 122 | Returns a vector of parameters that represents the reward function for the current task. 123 | Mathematically, given the state features phi(s,a,s') and reward parameters w, the reward function 124 | is represented as r(s,a,s') = < phi(s,a,s'), w >. 125 | 126 | Returns 127 | ------- 128 | np.ndarray : a linear parameterization of the reward function of the current MDP 129 | """ 130 | raise NotImplementedError 131 | 132 | -------------------------------------------------------------------------------- /source/features/deep.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import numpy as np 3 | from tensorflow.keras import backend as K, Model 4 | from tensorflow.keras.layers import concatenate, Input, Lambda 5 | 6 | from features.successor import SF 7 | 8 | 9 | class DeepSF(SF): 10 | """ 11 | A successor feature representation implemented using Keras. Accepts a wide variety of neural networks as 12 | function approximators. 13 | """ 14 | 15 | def __init__(self, keras_model_handle, *args, target_update_ev=1000, **kwargs): 16 | """ 17 | Creates a new deep representation of successor features. 
18 | 19 | Parameters 20 | ---------- 21 | keras_model_handle : function 22 | a function from an input tensor to a compiled Keras model for successor features 23 | the Keras model must have outputs reshaped to [None, n_actions, n_features], where 24 | None corresponds to the batch dimension 25 | n_actions is the number of actions of the MDP 26 | n_features is the number of state features to learn SFs 27 | target_update_ev : integer 28 | how often to update the target network, measured by the number of training calls 29 | """ 30 | super(DeepSF, self).__init__(*args, **kwargs) 31 | self.keras_model_handle = keras_model_handle 32 | self.target_update_ev = target_update_ev 33 | 34 | def reset(self): 35 | SF.reset(self) 36 | self.updates_since_target_updated = [] 37 | 38 | def build_successor(self, task, source=None): 39 | 40 | # input tensor for all networks is shared 41 | if self.n_tasks == 0: 42 | self.n_actions = task.action_count() 43 | self.n_features = task.feature_dim() 44 | self.inputs = Input(shape=(task.encode_dim(),)) 45 | 46 | # build SF network and copy its weights from previous task 47 | # output shape is assumed to be [n_batch, n_actions, n_features] 48 | model = self.keras_model_handle(self.inputs) 49 | if source is not None and self.n_tasks > 0: 50 | source_psi, _ = self.psi[source] 51 | model.set_weights(source_psi.get_weights()) 52 | 53 | # append predictions of all SF networks across tasks to allow fast prediction 54 | expand_output = Lambda(lambda x: K.expand_dims(x, axis=1))(model.output) 55 | if self.n_tasks == 0: 56 | self.all_outputs = expand_output 57 | else: 58 | self.all_outputs = concatenate([self.all_outputs, expand_output], axis=1) 59 | self.all_output_model = Model(inputs=self.inputs, outputs=self.all_outputs) 60 | self.all_output_model.compile('sgd', 'mse') # dummy compile so Keras doesn't complain 61 | 62 | # build target model and copy the weights 63 | target_model = self.keras_model_handle(self.inputs) 64 | target_model.set_weights(model.get_weights()) 65 | self.updates_since_target_updated.append(0) 66 | 67 | return model, target_model 68 | 69 | def get_successor(self, state, policy_index): 70 | psi, _ = self.psi[policy_index] 71 | return psi.predict_on_batch(state) 72 | 73 | def get_successors(self, state): 74 | return self.all_output_model.predict_on_batch(state) 75 | 76 | def update_successor(self, transitions, policy_index): 77 | if transitions is None: 78 | return 79 | states, actions, phis, next_states, gammas = transitions 80 | n_batch = len(gammas) 81 | indices = np.arange(n_batch) 82 | gammas = gammas.reshape((-1, 1)) 83 | 84 | # next actions come from GPI 85 | q1, _ = self.GPI(next_states, policy_index) 86 | next_actions = np.argmax(np.max(q1, axis=1), axis=-1) 87 | 88 | # compute the targets and TD errors 89 | psi, target_psi = self.psi[policy_index] 90 | current_psi = psi.predict_on_batch(states) 91 | targets = phis + gammas * target_psi.predict_on_batch(next_states)[indices, next_actions,:] 92 | 93 | # train the SF network 94 | current_psi[indices, actions,:] = targets 95 | psi.train_on_batch(states, current_psi) 96 | 97 | # update the target SF network 98 | self.updates_since_target_updated[policy_index] += 1 99 | if self.updates_since_target_updated[policy_index] >= self.target_update_ev: 100 | target_psi.set_weights(psi.get_weights()) 101 | self.updates_since_target_updated[policy_index] = 0 102 | 103 | -------------------------------------------------------------------------------- /source/agents/sfdqn.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import random 3 | import numpy as np 4 | 5 | from agents.agent import Agent 6 | 7 | 8 | class SFDQN(Agent): 9 | 10 | def __init__(self, deep_sf, buffer, *args, use_gpi=True, test_epsilon=0.03, **kwargs): 11 | """ 12 | Creates a new SFDQN agent per the specifications in the original paper. 13 | 14 | Parameters 15 | ---------- 16 | deep_sf : DeepSF 17 | instance of deep successor feature representation 18 | buffer : ReplayBuffer 19 | a replay buffer that implements randomized experience replay 20 | use_gpi : boolean 21 | whether or not to use transfer learning (defaults to True) 22 | test_epsilon : float 23 | the exploration parameter for epsilon greedy used during testing 24 | (defaults to 0.03 as in the paper) 25 | """ 26 | super(SFDQN, self).__init__(*args, **kwargs) 27 | self.sf = deep_sf 28 | self.buffer = buffer 29 | self.use_gpi = use_gpi 30 | self.test_epsilon = test_epsilon 31 | 32 | def get_Q_values(self, s, s_enc): 33 | q, c = self.sf.GPI(s_enc, self.task_index, update_counters=self.use_gpi) 34 | if not self.use_gpi: 35 | c = self.task_index 36 | self.c = c 37 | return q[:, c,:] 38 | 39 | def train_agent(self, s, s_enc, a, r, s1, s1_enc, gamma): 40 | 41 | # update w 42 | phi = self.phi(s, a, s1) 43 | self.sf.update_reward(phi, r, self.task_index) 44 | 45 | # remember this experience 46 | self.buffer.append(s_enc, a, phi, s1_enc, gamma) 47 | 48 | # update SFs 49 | transitions = self.buffer.replay() 50 | for index in range(self.n_tasks): 51 | self.sf.update_successor(transitions, index) 52 | 53 | def reset(self): 54 | super(SFDQN, self).reset() 55 | self.sf.reset() 56 | self.buffer.reset() 57 | 58 | def add_training_task(self, task): 59 | super(SFDQN, self).add_training_task(task) 60 | self.sf.add_training_task(task, source=None) 61 | 62 | def get_progress_strings(self): 63 | sample_str, reward_str = super(SFDQN, self).get_progress_strings() 64 | gpi_percent = self.sf.GPI_usage_percent(self.task_index) 65 | w_error = np.linalg.norm(self.sf.fit_w[self.task_index] - self.sf.true_w[self.task_index]) 66 | gpi_str = 'GPI% \t {:.4f} \t w_err \t {:.4f}'.format(gpi_percent, w_error) 67 | return sample_str, reward_str, gpi_str 68 | 69 | def train(self, train_tasks, n_samples, viewers=None, n_view_ev=None, test_tasks=[], n_test_ev=1000): 70 | if viewers is None: 71 | viewers = [None] * len(train_tasks) 72 | 73 | # add tasks 74 | self.reset() 75 | for train_task in train_tasks: 76 | self.add_training_task(train_task) 77 | 78 | # train each one 79 | return_data = [] 80 | for index, (train_task, viewer) in enumerate(zip(train_tasks, viewers)): 81 | self.set_active_training_task(index) 82 | for t in range(n_samples): 83 | 84 | # train 85 | self.next_sample(viewer, n_view_ev) 86 | 87 | # test 88 | if t % n_test_ev == 0: 89 | Rs = [] 90 | for test_task in test_tasks: 91 | R = self.test_agent(test_task) 92 | Rs.append(R) 93 | print('test performance: {}'.format('\t'.join(map('{:.4f}'.format, Rs)))) 94 | avg_R = np.mean(Rs) 95 | return_data.append(avg_R) 96 | return return_data 97 | 98 | def get_test_action(self, s_enc, w): 99 | if random.random() <= self.test_epsilon: 100 | a = random.randrange(self.n_actions) 101 | else: 102 | q, c = self.sf.GPI_w(s_enc, w) 103 | q = q[:, c,:] 104 | a = np.argmax(q) 105 | return a 106 | 107 | def test_agent(self, task): 108 | R = 0.0 109 | w = task.get_w() 110 | s = task.initialize() 111 | s_enc = self.encoding(s) 112 | for _ in range(self.T): 113 | a = 
self.get_test_action(s_enc, w) 114 | s1, r, done = task.transition(a) 115 | s1_enc = self.encoding(s1) 116 | s, s_enc = s1, s1_enc 117 | R += r 118 | if done: 119 | break 120 | return R 121 | 122 | -------------------------------------------------------------------------------- /source/agents/dqn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import numpy as np 3 | import random 4 | 5 | from agents.agent import Agent 6 | 7 | 8 | class DQN(Agent): 9 | 10 | def __init__(self, model_lambda, buffer, *args, target_update_ev=1000, test_epsilon=0.03, **kwargs): 11 | """ 12 | Creates a new DQN agent that supports universal value function approximation (UVFA). 13 | 14 | Parameters 15 | ---------- 16 | model_lambda : function 17 | returns a keras Model instance 18 | buffer : ReplayBuffer 19 | a replay buffer that implements randomized experience replay 20 | target_update_ev : integer 21 | how often to update the target network (defaults to 1000) 22 | test_epsilon : float 23 | the exploration parameter for epsilon greedy used during testing 24 | (defaults to 0.03 as in the paper) 25 | """ 26 | super(DQN, self).__init__(*args, **kwargs) 27 | self.model_lambda = model_lambda 28 | self.buffer = buffer 29 | self.target_update_ev = target_update_ev 30 | self.test_epsilon = test_epsilon 31 | 32 | def reset(self): 33 | Agent.reset(self) 34 | self.Q = self.model_lambda() 35 | self.target_Q = self.model_lambda() 36 | self.target_Q.set_weights(self.Q.get_weights()) 37 | self.buffer.reset() 38 | self.updates_since_target_updated = 0 39 | 40 | def get_Q_values(self, s, s_enc): 41 | return self.Q.predict_on_batch(s_enc) 42 | 43 | def train_agent(self, s, s_enc, a, r, s1, s1_enc, gamma): 44 | 45 | # remember this experience 46 | self.buffer.append(s_enc, a, r, s1_enc, gamma) 47 | 48 | # sample experience at random 49 | batch = self.buffer.replay() 50 | if batch is None: return 51 | states, actions, rewards, next_states, gammas = batch 52 | n_batch = self.buffer.n_batch 53 | indices = np.arange(n_batch) 54 | rewards = rewards.flatten() 55 | 56 | # main update 57 | next_actions = np.argmax(self.Q.predict_on_batch(next_states), axis=1) 58 | targets = self.Q.predict_on_batch(states) 59 | targets[indices, actions] = rewards + \ 60 | gammas * self.target_Q.predict_on_batch(next_states)[indices, next_actions] 61 | self.Q.train_on_batch(states, targets) 62 | 63 | # target update 64 | self.updates_since_target_updated += 1 65 | if self.updates_since_target_updated >= self.target_update_ev: 66 | self.target_Q.set_weights(self.Q.get_weights()) 67 | self.updates_since_target_updated = 0 68 | 69 | def train(self, train_tasks, n_samples, viewers=None, n_view_ev=None, test_tasks=[], n_test_ev=1000): 70 | if viewers is None: 71 | viewers = [None] * len(train_tasks) 72 | 73 | # add tasks 74 | self.reset() 75 | for train_task in train_tasks: 76 | self.add_training_task(train_task) 77 | 78 | # train each one 79 | return_data = [] 80 | for index, (train_task, viewer) in enumerate(zip(train_tasks, viewers)): 81 | self.set_active_training_task(index) 82 | for t in range(n_samples): 83 | 84 | # train 85 | self.next_sample(viewer, n_view_ev) 86 | 87 | # test 88 | if t % n_test_ev == 0: 89 | Rs = [] 90 | for test_task in test_tasks: 91 | R = self.test_agent(test_task) 92 | Rs.append(R) 93 | avg_R = np.mean(Rs) 94 | return_data.append(avg_R) 95 | print('test performance: {}'.format('\t'.join(map('{:.4f}'.format, Rs)))) 96 | return return_data 97 | 98 | def 
get_test_action(self, s_enc): 99 | if random.random() <= self.test_epsilon: 100 | a = random.randrange(self.n_actions) 101 | else: 102 | q = self.get_Q_values(s_enc, s_enc) 103 | a = np.argmax(q) 104 | return a 105 | 106 | def test_agent(self, task): 107 | R = 0. 108 | s = task.initialize() 109 | s_enc = self.encoding(s) 110 | for _ in range(self.T): 111 | a = self.get_test_action(s_enc) 112 | s1, r, done = task.transition(a) 113 | s1_enc = self.encoding(s1) 114 | s, s_enc = s1, s1_enc 115 | R += r 116 | if done: 117 | break 118 | return R 119 | -------------------------------------------------------------------------------- /source/main_sfdqn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | import tensorflow as tf 6 | tf.compat.v1.disable_eager_execution() 7 | for gpu in tf.config.experimental.list_physical_devices('GPU'): 8 | tf.config.experimental.set_memory_growth(gpu, True) 9 | from tensorflow.keras import layers, Model, optimizers 10 | 11 | from agents.dqn import DQN 12 | from agents.sfdqn import SFDQN 13 | from agents.buffer import ReplayBuffer 14 | from features.deep import DeepSF 15 | from tasks.reacher import Reacher 16 | from utils.config import parse_config_file 17 | 18 | # read parameters from config file 19 | config_params = parse_config_file('reacher.cfg') 20 | 21 | gen_params = config_params['GENERAL'] 22 | n_samples = gen_params['n_samples'] 23 | 24 | task_params = config_params['TASK'] 25 | goals = task_params['train_targets'] 26 | test_goals = task_params['test_targets'] 27 | all_goals = goals + test_goals 28 | 29 | agent_params = config_params['AGENT'] 30 | dqn_params = config_params['DQN'] 31 | sfdqn_params = config_params['SFDQN'] 32 | 33 | 34 | # tasks 35 | def generate_tasks(include_target): 36 | train_tasks = [Reacher(all_goals, i, include_target) for i in range(len(goals))] 37 | test_tasks = [Reacher(all_goals, i + len(goals), include_target) for i in range(len(test_goals))] 38 | return train_tasks, test_tasks 39 | 40 | 41 | # keras model 42 | def dqn_model_lambda(): 43 | keras_params = dqn_params['keras_params'] 44 | x = y = layers.Input(6) 45 | for n_neurons, activation in zip(keras_params['n_neurons'], keras_params['activations']): 46 | y = layers.Dense(n_neurons, activation=activation)(y) 47 | y = layers.Dense(9, activation='linear')(y) 48 | model = Model(inputs=x, outputs=y) 49 | sgd = optimizers.Adam(learning_rate=keras_params['learning_rate']) 50 | model.compile(sgd, 'mse') 51 | return model 52 | 53 | 54 | # keras model for the SF 55 | def sf_model_lambda(x): 56 | n_features = len(all_goals) 57 | keras_params = sfdqn_params['keras_params'] 58 | y = x 59 | for n_neurons, activation in zip(keras_params['n_neurons'], keras_params['activations']): 60 | y = layers.Dense(n_neurons, activation=activation)(y) 61 | y = layers.Dense(9 * n_features, activation='linear')(y) 62 | y = layers.Reshape((9, n_features))(y) 63 | model = Model(inputs=x, outputs=y) 64 | sgd = optimizers.Adam(learning_rate=keras_params['learning_rate']) 65 | model.compile(sgd, 'mse') 66 | return model 67 | 68 | 69 | def train(): 70 | 71 | # build SFDQN 72 | print('building SFDQN') 73 | deep_sf = DeepSF(keras_model_handle=sf_model_lambda, **sfdqn_params) 74 | sfdqn = SFDQN(deep_sf=deep_sf, buffer=ReplayBuffer(sfdqn_params['buffer_params']), 75 | **sfdqn_params, **agent_params) 76 | 77 | # train SFDQN 78 | print('training SFDQN') 79 | train_tasks, test_tasks = 
generate_tasks(False) 80 | sfdqn_perf = sfdqn.train(train_tasks, n_samples, test_tasks=test_tasks, n_test_ev=agent_params['n_test_ev']) 81 | 82 | # build DQN 83 | print('building DQN') 84 | dqn = DQN(model_lambda=dqn_model_lambda, buffer=ReplayBuffer(dqn_params['buffer_params']), 85 | **dqn_params, **agent_params) 86 | 87 | # training DQN 88 | print('training DQN') 89 | train_tasks, test_tasks = generate_tasks(True) 90 | dqn_perf = dqn.train(train_tasks, n_samples, test_tasks=test_tasks, n_test_ev=agent_params['n_test_ev']) 91 | 92 | # smooth data 93 | def smooth(y, box_pts): 94 | return np.convolve(y, np.ones(box_pts) / box_pts, mode='same') 95 | 96 | sfdqn_perf = smooth(sfdqn_perf, 10)[:-5] 97 | dqn_perf = smooth(dqn_perf, 10)[:-5] 98 | x = np.linspace(0, 4, sfdqn_perf.size) 99 | 100 | # reporting progress 101 | ticksize = 14 102 | textsize = 18 103 | plt.rc('font', size=textsize) # controls default text sizes 104 | plt.rc('axes', titlesize=textsize) # fontsize of the axes title 105 | plt.rc('axes', labelsize=textsize) # fontsize of the x and y labels 106 | plt.rc('xtick', labelsize=ticksize) # fontsize of the tick labels 107 | plt.rc('ytick', labelsize=ticksize) # fontsize of the tick labels 108 | plt.rc('legend', fontsize=ticksize) # legend fontsize 109 | 110 | plt.figure(figsize=(8, 6)) 111 | ax = plt.gca() 112 | ax.plot(x, sfdqn_perf, label='SFDQN') 113 | ax.plot(x, dqn_perf, label='DQN') 114 | plt.xlabel('training task index') 115 | plt.ylabel('averaged test episode reward') 116 | plt.title('Testing Reward Averaged over all Test Tasks') 117 | plt.tight_layout() 118 | plt.legend(frameon=False) 119 | plt.savefig('figures/sfdqn_return.png') 120 | 121 | 122 | train() 123 | -------------------------------------------------------------------------------- /source/tasks/reacher.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import numpy as np 3 | from pybulletgym.envs.roboschool.robots.robot_bases import MJCFBasedRobot 4 | from pybulletgym.envs.roboschool.envs.env_bases import BaseBulletEnv 5 | from pybulletgym.envs.roboschool.scenes.scene_bases import SingleRobotEmptyScene 6 | 7 | from tasks.task import Task 8 | 9 | 10 | class Reacher(Task): 11 | 12 | def __init__(self, target_positions, task_index, include_target_in_state=False): 13 | self.target_positions = target_positions 14 | self.task_index = task_index 15 | self.target_pos = target_positions[task_index] 16 | self.include_target_in_state = include_target_in_state 17 | self.env = ReacherBulletEnv(self.target_pos) 18 | 19 | # make the action lookup from integer to real action 20 | actions = [-1., 0., 1.] 
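        # (added comment) The two joint torques are each discretized to {-1, 0, +1}; the
        # nested loops below enumerate all 3 x 3 = 9 combinations, which is why
        # action_count() returns 9 and the Q- and SF-networks built in main_sfdqn.py
        # use 9-way output layers.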
21 | self.action_dict = dict() 22 | for a1 in actions: 23 | for a2 in actions: 24 | self.action_dict[len(self.action_dict)] = (a1, a2) 25 | 26 | def clone(self): 27 | return Reacher(self.target_positions, self.task_index, self.include_target_in_state) 28 | 29 | def initialize(self): 30 | # if self.task_index == 0: 31 | # self.env.render('human') 32 | state = self.env.reset() 33 | if self.include_target_in_state: 34 | return np.concatenate([state.flatten(), self.target_pos]) 35 | else: 36 | return state 37 | 38 | def action_count(self): 39 | return len(self.action_dict) 40 | 41 | def transition(self, action): 42 | real_action = self.action_dict[action] 43 | new_state, reward, done, _ = self.env.step(real_action) 44 | 45 | if self.include_target_in_state: 46 | return_state = np.concatenate([new_state, self.target_pos]) 47 | else: 48 | return_state = new_state 49 | 50 | return return_state, reward, done 51 | 52 | # =========================================================================== 53 | # STATE ENCODING FOR DEEP LEARNING 54 | # =========================================================================== 55 | def encode(self, state): 56 | return np.array(state).reshape((1, -1)) 57 | 58 | def encode_dim(self): 59 | if self.include_target_in_state: 60 | return 6 61 | else: 62 | return 4 63 | 64 | # =========================================================================== 65 | # SUCCESSOR FEATURES 66 | # =========================================================================== 67 | def features(self, state, action, next_state): 68 | phi = np.zeros((len(self.target_positions),)) 69 | for index, target in enumerate(self.target_positions): 70 | delta = np.linalg.norm(np.array(self.env.robot.fingertip.pose().xyz()[:2]) - np.array(target)) 71 | phi[index] = 1. - 4. * delta 72 | return phi 73 | 74 | def feature_dim(self): 75 | return len(self.target_positions) 76 | 77 | def get_w(self): 78 | w = np.zeros((len(self.target_positions), 1)) 79 | w[self.task_index, 0] = 1.0 80 | return w 81 | 82 | 83 | class ReacherBulletEnv(BaseBulletEnv): 84 | 85 | def __init__(self, target): 86 | self.robot = ReacherRobot(target) 87 | BaseBulletEnv.__init__(self, self.robot) 88 | 89 | def create_single_player_scene(self, bullet_client): 90 | return SingleRobotEmptyScene(bullet_client, gravity=0.0, timestep=0.0165, frame_skip=1) 91 | 92 | def step(self, a): 93 | assert (not self.scene.multiplayer) 94 | self.robot.apply_action(a) 95 | self.scene.global_step() 96 | 97 | state = self.robot.calc_state() # sets self.to_target_vec 98 | 99 | delta = np.linalg.norm( 100 | np.array(self.robot.fingertip.pose().xyz()) - np.array(self.robot.target.pose().xyz())) 101 | reward = 1. - 4. 
* delta 102 | self.HUD(state, a, False) 103 | 104 | return state, reward, False, {} 105 | 106 | def camera_adjust(self): 107 | x, y, z = self.robot.fingertip.pose().xyz() 108 | x *= 0.5 109 | y *= 0.5 110 | self.camera.move_and_look_at(0.3, 0.3, 0.3, x, y, z) 111 | 112 | 113 | class ReacherRobot(MJCFBasedRobot): 114 | TARG_LIMIT = 0.27 115 | 116 | def __init__(self, target): 117 | MJCFBasedRobot.__init__(self, 'reacher.xml', 'body0', action_dim=2, obs_dim=4) 118 | self.target_pos = target 119 | 120 | def robot_specific_reset(self, bullet_client): 121 | self.jdict["target_x"].reset_current_position(self.target_pos[0], 0) 122 | self.jdict["target_y"].reset_current_position(self.target_pos[1], 0) 123 | self.fingertip = self.parts["fingertip"] 124 | self.target = self.parts["target"] 125 | self.central_joint = self.jdict["joint0"] 126 | self.elbow_joint = self.jdict["joint1"] 127 | self.central_joint.reset_current_position(self.np_random.uniform(low=-3.14, high=3.14), 0) 128 | self.elbow_joint.reset_current_position(self.np_random.uniform(low=-3.14 / 2, high=3.14 / 2), 0) 129 | 130 | def apply_action(self, a): 131 | assert (np.isfinite(a).all()) 132 | self.central_joint.set_motor_torque(0.05 * float(np.clip(a[0], -1, +1))) 133 | self.elbow_joint.set_motor_torque(0.05 * float(np.clip(a[1], -1, +1))) 134 | 135 | def calc_state(self): 136 | theta, self.theta_dot = self.central_joint.current_relative_position() 137 | self.gamma, self.gamma_dot = self.elbow_joint.current_relative_position() 138 | # target_x, _ = self.jdict["target_x"].current_position() 139 | # target_y, _ = self.jdict["target_y"].current_position() 140 | self.to_target_vec = np.array(self.fingertip.pose().xyz()) - np.array(self.target.pose().xyz()) 141 | return np.array([ 142 | theta, 143 | self.theta_dot, 144 | self.gamma, 145 | self.gamma_dot 146 | ]) 147 | # 148 | # def calc_potential(self): 149 | # return -100 * np.linalg.norm(self.to_target_vec) 150 | 151 | -------------------------------------------------------------------------------- /source/tasks/gridworld.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import numpy as np 3 | import random 4 | 5 | from tasks.task import Task 6 | 7 | 8 | class Shapes(Task): 9 | """ 10 | A discretized version of the gridworld environment introduced in [1]. Here, an agent learns to 11 | collect shapes with positive reward, while avoid those with negative reward, and then travel to a fixed goal. 12 | The gridworld is split into four rooms separated by walls with passage-ways. 13 | 14 | References 15 | ---------- 16 | [1] Barreto, Andr�, et al. "Successor Features for Transfer in Reinforcement Learning." NIPS. 2017. 17 | """ 18 | 19 | LEFT, UP, RIGHT, DOWN = 0, 1, 2, 3 20 | 21 | def __init__(self, maze, shape_rewards): 22 | """ 23 | Creates a new instance of the shapes environment. 24 | 25 | Parameters 26 | ---------- 27 | maze : np.ndarray 28 | an array of string values representing the type of each cell in the environment: 29 | G indicates a goal state (terminal state) 30 | _ indicates an initial state (there can be multiple, and one is selected at random 31 | at the start of each episode) 32 | X indicates a barrier 33 | 0, 1, .... 9 indicates the type of shape to be placed in the corresponding cell 34 | entries containing other characters are treated as regular empty cells 35 | shape_rewards : dict 36 | a dictionary mapping the type of shape (0, 1, ... 
) to a corresponding reward to provide 37 | to the agent for collecting an object of that type 38 | """ 39 | self.height, self.width = maze.shape 40 | self.maze = maze 41 | self.shape_rewards = shape_rewards 42 | shape_types = sorted(list(shape_rewards.keys())) 43 | self.all_shapes = dict(zip(shape_types, range(len(shape_types)))) 44 | 45 | self.goal = None 46 | self.initial = [] 47 | self.occupied = set() 48 | self.shape_ids = dict() 49 | for c in range(self.width): 50 | for r in range(self.height): 51 | if maze[r, c] == 'G': 52 | self.goal = (r, c) 53 | elif maze[r, c] == '_': 54 | self.initial.append((r, c)) 55 | elif maze[r, c] == 'X': 56 | self.occupied.add((r, c)) 57 | elif maze[r, c] in {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'}: 58 | self.shape_ids[(r, c)] = len(self.shape_ids) 59 | 60 | def clone(self): 61 | return Shapes(self.maze, self.shape_rewards) 62 | 63 | def initialize(self): 64 | self.state = (random.choice(self.initial), tuple(0 for _ in range(len(self.shape_ids)))) 65 | return self.state 66 | 67 | def action_count(self): 68 | return 4 69 | 70 | def transition(self, action): 71 | (row, col), collected = self.state 72 | 73 | # perform the movement 74 | if action == Shapes.LEFT: 75 | col -= 1 76 | elif action == Shapes.UP: 77 | row -= 1 78 | elif action == Shapes.RIGHT: 79 | col += 1 80 | elif action == Shapes.DOWN: 81 | row += 1 82 | else: 83 | raise Exception('bad action {}'.format(action)) 84 | 85 | # out of bounds, cannot move 86 | if col < 0 or col >= self.width or row < 0 or row >= self.height: 87 | return self.state, 0., False 88 | 89 | # into a blocked cell, cannot move 90 | s1 = (row, col) 91 | if s1 in self.occupied: 92 | return self.state, 0., False 93 | 94 | # can now move 95 | self.state = (s1, collected) 96 | 97 | # into a goal cell 98 | if s1 == self.goal: 99 | return self.state, 1., True 100 | 101 | # into a shape cell 102 | if s1 in self.shape_ids: 103 | shape_id = self.shape_ids[s1] 104 | if collected[shape_id] == 1: 105 | 106 | # already collected this flag 107 | return self.state, 0., False 108 | else: 109 | 110 | # collect the new flag 111 | collected = list(collected) 112 | collected[shape_id] = 1 113 | collected = tuple(collected) 114 | self.state = (s1, collected) 115 | reward = self.shape_rewards[self.maze[row, col]] 116 | return self.state, reward, False 117 | 118 | # into an empty cell 119 | return self.state, 0., False 120 | 121 | # =========================================================================== 122 | # STATE ENCODING FOR DEEP LEARNING 123 | # =========================================================================== 124 | def encode(self, state): 125 | (y, x), coll = state 126 | n_state = self.width + self.height 127 | result = np.zeros((n_state + len(coll),)) 128 | result[y] = 1 129 | result[self.height + x] = 1 130 | result[n_state:] = np.array(coll) 131 | result = result.reshape((1, -1)) 132 | return result 133 | 134 | def encode_dim(self): 135 | return self.width + self.height + len(self.shape_ids) 136 | 137 | # =========================================================================== 138 | # SUCCESSOR FEATURES 139 | # =========================================================================== 140 | def features(self, state, action, next_state): 141 | s1, _ = next_state 142 | _, collected = state 143 | nc = len(self.all_shapes) 144 | phi = np.zeros((nc + 1,)) 145 | if s1 in self.shape_ids: 146 | if collected[self.shape_ids[s1]] != 1: 147 | y, x = s1 148 | shape_index = self.all_shapes[self.maze[y, x]] 149 | 
phi[shape_index] = 1. 150 | elif s1 == self.goal: 151 | phi[nc] = 1. 152 | return phi 153 | 154 | def feature_dim(self): 155 | return len(self.all_shapes) + 1 156 | 157 | def get_w(self): 158 | ns = len(self.all_shapes) 159 | w = np.zeros((ns + 1, 1)) 160 | for shape, shape_index in self.all_shapes.items(): 161 | w[shape_index, 0] = self.shape_rewards[shape] 162 | w[ns, 0] = 1. 163 | return w 164 | 165 | -------------------------------------------------------------------------------- /source/agents/agent.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import random 3 | import numpy as np 4 | 5 | 6 | class Agent: 7 | 8 | def __init__(self, gamma, T, encoding, *args, epsilon=0.1, epsilon_decay=1., epsilon_min=0., 9 | print_ev=1000, save_ev=100, **kwargs): 10 | """ 11 | Creates a new abstract reinforcement learning agent. 12 | 13 | Parameters 14 | ---------- 15 | gamma : float 16 | the discount factor in [0, 1] 17 | T : integer 18 | the maximum length of an episode 19 | encoding : function 20 | encodes the state of the task instance into a numpy array 21 | epsilon : float 22 | the initial exploration parameter for epsilon greedy (defaults to 0.1) 23 | epsilon_decay : float 24 | the amount to anneal epsilon in each time step (defaults to 1, no annealing) 25 | epsilon_min : float 26 | the minimum allowed value of epsilon (defaults to 0) 27 | print_ev : integer 28 | how often to print learning progress 29 | save_ev : 30 | how often to save learning progress to internal memory 31 | """ 32 | self.gamma = gamma 33 | self.T = T 34 | if encoding is None: 35 | encoding = lambda s: s 36 | self.encoding = encoding 37 | self.epsilon_init = epsilon 38 | self.epsilon_decay = epsilon_decay 39 | self.epsilon_min = epsilon_min 40 | self.print_ev = print_ev 41 | self.save_ev = save_ev 42 | if len(args) != 0 or len(kwargs) != 0: 43 | print(self.__class__.__name__ + ' ignoring parameters ' + str(args) + ' and ' + str(kwargs)) 44 | 45 | def get_Q_values(self, s, s_enc): 46 | """ 47 | Returns the value function evaluated in the specified state. 48 | An array of size [n_batch, n_actions], where: 49 | n_batch is the number of states provided 50 | n_actions is the number of possible actions in the current task 51 | 52 | Parameters 53 | ---------- 54 | s : iterable of object 55 | raw states of the task 56 | s_enc : np.ndarray 57 | collection of encoded states of the shape [n_batch, -1] 58 | 59 | Returns 60 | ------- 61 | np.ndarray : array of the shape [n_batch, n_actions] returning the estimated 62 | Q-values of the current instance 63 | """ 64 | raise NotImplementedError 65 | 66 | def train_agent(self, s, s_enc, a, r, s1, s1_enc, gamma): 67 | """ 68 | Trains the current agent on the provided transition. 
69 | 
70 |         Parameters
71 |         ----------
72 |         s : object
73 |             the raw state of the task
74 |         s_enc : np.ndarray
75 |             the encoded state of the task
76 |         a : integer
77 |             the action taken in state s
78 |         r : float
79 |             the reward obtained in the current transition
80 |         s1 : object
81 |             the raw successor state of the task
82 |         s1_enc : np.ndarray
83 |             the encoded next state s1 of the task
84 |         gamma : float
85 |             the discount factor to apply to the transition; should be zero if the transition is terminal
86 |         """
87 |         raise NotImplementedError
88 |     
89 |     # ===========================================================================
90 |     # TASK MANAGEMENT
91 |     # ===========================================================================
92 |     def reset(self):
93 |         """
94 |         Resets the agent, including all value functions and internal memory/history.
95 |         """
96 |         self.tasks = []
97 |         self.phis = []
98 |         
99 |         # reset counter history
100 |         self.cum_reward = 0.
101 |         self.reward_hist = []
102 |         self.cum_reward_hist = []
103 |     
104 |     def add_training_task(self, task):
105 |         """
106 |         Adds a training task to be trained by the agent.
107 |         """
108 |         self.tasks.append(task)
109 |         self.n_tasks = len(self.tasks)
110 |         self.phis.append(task.features)
111 |         if self.n_tasks == 1:
112 |             self.n_actions = task.action_count()
113 |             self.n_features = task.feature_dim()
114 |             if self.encoding == 'task':  # the special value 'task' defers encoding to the task's own encode()
115 |                 self.encoding = task.encode
116 |     
117 |     def set_active_training_task(self, index):
118 |         """
119 |         Sets the task at the requested index as the current task the agent will train on.
120 |         The index is based on the order in which the training task was added to the agent.
121 |         """
122 |         
123 |         # set the task
124 |         self.task_index = index
125 |         self.active_task = self.tasks[index]
126 |         self.phi = self.phis[index]
127 |         
128 |         # reset task-dependent counters
129 |         self.s = self.s_enc = None
130 |         self.new_episode = True
131 |         self.episode, self.episode_reward = 0, 0.
132 |         self.steps_since_last_episode, self.reward_since_last_episode = 0, 0.
133 |         self.steps, self.reward = 0, 0.
134 |         self.epsilon = self.epsilon_init
135 |         self.episode_reward_hist = []
136 |     
137 |     # ===========================================================================
138 |     # TRAINING
139 |     # ===========================================================================
140 |     def _epsilon_greedy(self, q):
141 |         assert q.size == self.n_actions
142 |         
143 |         # with probability epsilon, explore with a uniformly random action; otherwise act greedily
144 |         if random.random() <= self.epsilon:
145 |             a = random.randrange(self.n_actions)
146 |         else:
147 |             a = np.argmax(q)
148 |         
149 |         # decrease the exploration gradually
150 |         self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)
151 |         
152 |         return a
153 |     
154 |     def get_progress_strings(self):
155 |         """
156 |         Returns a pair of strings that display the agent's learning progress. These include
157 |         the current training task index, steps and episodes of training, the exploration parameter,
158 |         the previous episode reward and the cumulative reward, and other information
159 |         depending on the current implementation.
160 | """ 161 | sample_str = 'task \t {} \t steps \t {} \t episodes \t {} \t eps \t {:.4f}'.format( 162 | self.task_index, self.steps, self.episode, self.epsilon) 163 | reward_str = 'ep_reward \t {:.4f} \t reward \t {:.4f}'.format( 164 | self.episode_reward, self.reward) 165 | return sample_str, reward_str 166 | 167 | def next_sample(self, viewer=None, n_view_ev=None): 168 | """ 169 | Updates the agent by performing one interaction with the current training environment. 170 | This function performs all interactions with the environment, data and storage manipulations, 171 | training the agent, and updating all history. 172 | 173 | Parameters 174 | ---------- 175 | viewer : object 176 | a viewer that displays the agent's exploration behavior on the task based on its update() method 177 | (defaults to None) 178 | n_view_ev : integer 179 | how often (in training episodes) to invoke the viewer to display agent's learned behavior 180 | (defaults to None) 181 | """ 182 | 183 | # start a new episode 184 | if self.new_episode: 185 | self.s = self.active_task.initialize() 186 | self.s_enc = self.encoding(self.s) 187 | self.new_episode = False 188 | self.episode += 1 189 | self.steps_since_last_episode = 0 190 | self.episode_reward = self.reward_since_last_episode 191 | self.reward_since_last_episode = 0. 192 | if self.episode > 1: 193 | self.episode_reward_hist.append(self.episode_reward) 194 | 195 | # compute the Q-values in the current state 196 | q = self.get_Q_values(self.s, self.s_enc) 197 | 198 | # choose an action using the epsilon-greedy policy 199 | a = self._epsilon_greedy(q) 200 | 201 | # take action a and observe reward r and next state s' 202 | s1, r, terminal = self.active_task.transition(a) 203 | s1_enc = self.encoding(s1) 204 | if terminal: 205 | gamma = 0. 206 | self.new_episode = True 207 | else: 208 | gamma = self.gamma 209 | 210 | # train the agent 211 | self.train_agent(self.s, self.s_enc, a, r, s1, s1_enc, gamma) 212 | 213 | # update counters 214 | self.s, self.s_enc = s1, s1_enc 215 | self.steps += 1 216 | self.reward += r 217 | self.steps_since_last_episode += 1 218 | self.reward_since_last_episode += r 219 | self.cum_reward += r 220 | 221 | if self.steps_since_last_episode >= self.T: 222 | self.new_episode = True 223 | 224 | if self.steps % self.save_ev == 0: 225 | self.reward_hist.append(self.reward) 226 | self.cum_reward_hist.append(self.cum_reward) 227 | 228 | # viewing 229 | if viewer is not None and self.episode % n_view_ev == 0: 230 | viewer.update() 231 | 232 | # printing 233 | if self.steps % self.print_ev == 0: 234 | print('\t'.join(self.get_progress_strings())) 235 | 236 | def train_on_task(self, train_task, n_samples, viewer=None, n_view_ev=None): 237 | """ 238 | Trains the agent on the current task. 
239 | 
240 |         Parameters
241 |         ----------
242 |         train_task : Task
243 |             the training task instance
244 |         n_samples : integer
245 |             how many samples should be generated and used to train the agent
246 |         viewer : object
247 |             a viewer that displays the agent's exploration behavior on the task based on its update() method
248 |             (defaults to None)
249 |         n_view_ev : integer
250 |             how often (in training episodes) to invoke the viewer to display the agent's learned behavior
251 |             (defaults to None)
252 |         """
253 |         self.add_training_task(train_task)
254 |         self.set_active_training_task(self.n_tasks - 1)
255 |         for _ in range(n_samples):
256 |             self.next_sample(viewer, n_view_ev)
257 |     
258 |     def train(self, train_tasks, n_samples, viewers=None, n_view_ev=None):
259 |         """
260 |         Trains the agent on a set of tasks.
261 |         
262 |         Parameters
263 |         ----------
264 |         train_tasks : iterable of Task
265 |             the training task instances
266 |         n_samples : integer
267 |             how many samples should be generated and used to train the agent on each task
268 |         viewers : iterable of object
269 |             viewers that display the agent's exploration behavior on each task based on their update() methods
270 |             (defaults to None)
271 |         n_view_ev : integer
272 |             how often (in training episodes) to invoke each viewer to display the agent's learned behavior
273 |             (defaults to None)
274 |         """
275 |         if viewers is None:
276 |             viewers = [None] * len(train_tasks)
277 |         self.reset()
278 |         for train_task, viewer in zip(train_tasks, viewers):
279 |             self.train_on_task(train_task, n_samples, viewer, n_view_ev)
280 | 
281 | 
--------------------------------------------------------------------------------
/source/features/successor.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | import numpy as np
3 | 
4 | 
5 | class SF:
6 |     
7 |     def __init__(self, learning_rate_w, *args, use_true_reward=False, **kwargs):
8 |         """
9 |         Creates a new abstract successor feature representation.
10 |         
11 |         Parameters
12 |         ----------
13 |         learning_rate_w : float
14 |             the learning rate to use for learning the reward weights using gradient descent
15 |         use_true_reward : boolean
16 |             whether to use the true reward weights from the environment or to learn them
17 |             using gradient descent
18 |         """
19 |         self.alpha_w = learning_rate_w
20 |         self.use_true_reward = use_true_reward
21 |         if len(args) != 0 or len(kwargs) != 0:
22 |             print(self.__class__.__name__ + ' ignoring parameters ' + str(args) + ' and ' + str(kwargs))
23 |     
24 |     def build_successor(self, task, source=None):
25 |         """
26 |         Builds a new successor feature map for the specified task. This method should not be called
27 |         directly; call add_training_task instead.
28 |         
29 |         Parameters
30 |         ----------
31 |         task : Task
32 |             a new MDP environment for which to learn successor features
33 |         source : integer
34 |             if specified and not None, the parameters of the successor features for the task at the source
35 |             index should be copied to the new successor features, as suggested in [1]
36 |         
37 |         Returns
38 |         -------
39 |         object : the successor feature representation for the new task, which can be a Keras model,
40 |             a lookup table (dictionary) or another learning representation
41 |         """
42 |         raise NotImplementedError
43 |     
44 |     def get_successor(self, state, policy_index):
45 |         """
46 |         Evaluates the successor features in given states for the specified task.
47 | 48 | Parameters 49 | ---------- 50 | state : object 51 | a state or collection of states of the MDP 52 | policy_index : integer 53 | the index of the task whose successor features to evaluate 54 | 55 | Returns 56 | ------- 57 | np.ndarray : the evaluation of the successor features, which is of shape 58 | [n_batch, n_actions, n_features], where 59 | n_batch is the number of states in the state argument 60 | n_actions is the number of actions of the MDP 61 | n_features is the number of features in the SF representation 62 | """ 63 | raise NotImplementedError 64 | 65 | def get_successors(self, state): 66 | """ 67 | Evaluates the successor features in given states for all tasks. 68 | 69 | Parameters 70 | ---------- 71 | state : object 72 | a state or collection of states of the MDP 73 | 74 | Returns 75 | ------- 76 | np.ndarray : the evaluation of the successor features, which is of shape 77 | [n_batch, n_tasks, n_actions, n_features], where 78 | n_batch is the number of states in the state argument 79 | n_tasks is the number of tasks 80 | n_actions is the number of actions of the MDP 81 | n_features is the number of features in the SF representation 82 | """ 83 | raise NotImplementedError 84 | 85 | def update_successor(self, transitions, policy_index): 86 | """ 87 | Updates the successor representation by training it on the given transition. 88 | 89 | Parameters 90 | ---------- 91 | transitions : object 92 | collection of transitions 93 | policy_index : integer 94 | the index of the task whose successor features to update 95 | """ 96 | raise NotImplementedError 97 | 98 | def reset(self): 99 | """ 100 | Removes all trained successor feature representations from the current object, all learned rewards, 101 | and all task information. 102 | """ 103 | self.n_tasks = 0 104 | self.psi = [] 105 | self.true_w = [] 106 | self.fit_w = [] 107 | self.gpi_counters = [] 108 | 109 | def add_training_task(self, task, source=None): 110 | """ 111 | Adds a successor feature representation for the specified task. 112 | 113 | Parameters 114 | ---------- 115 | task : Task 116 | a new MDP environment for which to learn successor features 117 | source : integer 118 | if specified and not None, the parameters of the successor features for the task at the source 119 | index should be copied to the new successor features, as suggested in [1] 120 | """ 121 | 122 | # add successor features to the library 123 | self.psi.append(self.build_successor(task, source)) 124 | self.n_tasks = len(self.psi) 125 | 126 | # build new reward function 127 | true_w = task.get_w() 128 | self.true_w.append(true_w) 129 | if self.use_true_reward: 130 | fit_w = true_w 131 | else: 132 | n_features = task.feature_dim() 133 | fit_w = np.random.uniform(low=-0.01, high=0.01, size=(n_features, 1)) 134 | self.fit_w.append(fit_w) 135 | 136 | # add statistics 137 | for i in range(len(self.gpi_counters)): 138 | self.gpi_counters[i] = np.append(self.gpi_counters[i], 0) 139 | self.gpi_counters.append(np.zeros((self.n_tasks,), dtype=int)) 140 | 141 | def update_reward(self, phi, r, task_index, exact=False): 142 | """ 143 | Updates the reward parameters for the given task based on the observed reward sample 144 | from the environment. 
145 | 
146 |         Parameters
147 |         ----------
148 |         phi : np.ndarray
149 |             the state features
150 |         r : float
151 |             the observed reward from the MDP
152 |         task_index : integer
153 |             the index of the task from which this reward was sampled
154 |         exact : boolean
155 |             if True, checks that the sampled reward matches the reward implied by the true weights (and raises otherwise)
156 |         """
157 |         
158 |         # update the reward weights with a stochastic gradient step on the squared linear-regression error
159 |         w = self.fit_w[task_index]
160 |         phi = phi.reshape(w.shape)
161 |         r_fit = np.sum(phi * w)
162 |         self.fit_w[task_index] = w + self.alpha_w * (r - r_fit) * phi
163 |         
164 |         # validate reward
165 |         r_true = np.sum(phi * self.true_w[task_index])
166 |         if exact and not np.allclose(r, r_true):
167 |             raise Exception('sampled reward {} != linear reward {} - please check task {}!'.format(
168 |                 r, r_true, task_index))
169 |     
170 |     def GPE_w(self, state, policy_index, w):
171 |         """
172 |         Implements generalized policy evaluation according to [1]. In summary, this uses the
173 |         learned reward parameters of one task and the successor features of a policy to estimate the Q-values of
174 |         the policy if it were executed in that task.
175 |         
176 |         Parameters
177 |         ----------
178 |         state : object
179 |             a state or collection of states of the MDP
180 |         policy_index : integer
181 |             the index of the task whose policy to evaluate
182 |         w : numpy array
183 |             reward parameters of the task in which to evaluate the policy
184 |         
185 |         Returns
186 |         -------
187 |         np.ndarray : the estimated Q-values of shape [n_batch, n_actions], where
188 |             n_batch is the number of states in the state argument
189 |             n_actions is the number of actions in the MDP
190 |         """
191 |         psi = self.get_successor(state, policy_index)
192 |         q = (psi @ w)[:, :, 0]  # shape (n_batch, n_actions)
193 |         return q
194 |     
195 |     def GPE(self, state, policy_index, task_index):
196 |         """
197 |         Implements generalized policy evaluation according to [1]. In summary, this uses the
198 |         learned reward parameters of one task and the successor features of a policy to estimate the Q-values of
199 |         the policy if it were executed in that task.
200 |         
201 |         Parameters
202 |         ----------
203 |         state : object
204 |             a state or collection of states of the MDP
205 |         policy_index : integer
206 |             the index of the task whose policy to evaluate
207 |         task_index : integer
208 |             the index of the task (i.e. its reward function) in which to evaluate the policy
209 |         
210 |         Returns
211 |         -------
212 |         np.ndarray : the estimated Q-values of shape [n_batch, n_actions], where
213 |             n_batch is the number of states in the state argument
214 |             n_actions is the number of actions in the MDP
215 |         """
216 |         return self.GPE_w(state, policy_index, self.fit_w[task_index])
217 |     
218 |     def GPI_w(self, state, w):
219 |         """
220 |         Implements generalized policy improvement according to [1].
221 | 
222 |         Parameters
223 |         ----------
224 |         state : object
225 |             a state or collection of states of the MDP
226 |         w : numpy array
227 |             the reward parameters of the task to control
228 |         
229 |         Returns
230 |         -------
231 |         np.ndarray : the Q-values of every stored policy evaluated under w, from which GPI selects actions,
232 |             of shape [n_batch, n_tasks, n_actions], where:
233 |             n_batch is the number of states in the state argument
234 |             n_tasks is the number of tasks
235 |             n_actions is the number of actions in the MDP
236 |         np.ndarray : the task (policy) that attains the maximum Q-value in each state of the batch under GPI
237 |         """
238 |         psi = self.get_successors(state)
239 |         q = (psi @ w)[:, :, :, 0]  # shape (n_batch, n_tasks, n_actions)
240 |         task = np.squeeze(np.argmax(np.max(q, axis=2), axis=1))  # shape (n_batch,)
241 |         return q, task
242 |     
243 |     def GPI(self, state, task_index, update_counters=False):
244 |         """
245 |         Implements generalized policy improvement according to [1].
246 |         
247 |         Parameters
248 |         ----------
249 |         state : object
250 |             a state or collection of states of the MDP
251 |         task_index : integer
252 |             the index of the task in which the GPI action will be used
253 |         update_counters : boolean
254 |             whether or not to keep track of which policies are active in GPI
255 |         
256 |         Returns
257 |         -------
258 |         np.ndarray : the Q-values of every stored policy evaluated under the fitted reward weights of the task,
259 |             of shape [n_batch, n_tasks, n_actions], where:
260 |             n_batch is the number of states in the state argument
261 |             n_tasks is the number of tasks
262 |             n_actions is the number of actions in the MDP
263 |         np.ndarray : the task (policy) that attains the maximum Q-value in each state of the batch under GPI
264 |         """
265 |         q, task = self.GPI_w(state, self.fit_w[task_index])
266 |         if update_counters:
267 |             self.gpi_counters[task_index][task] += 1
268 |         return q, task
269 |     
270 |     def GPI_usage_percent(self, task_index):
271 |         """
272 |         Computes the fraction of actions that were transferred from other tasks' policies by GPI.
273 |         
274 |         Parameters
275 |         ----------
276 |         task_index : integer
277 |             the index of the task
278 |         
279 |         Returns
280 |         -------
281 |         float : the fraction of GPI decisions in the given task for which the active policy
282 |             belonged to a different task
283 |         """
284 |         counts = self.gpi_counters[task_index]
285 |         return 1. - (float(counts[task_index]) / np.sum(counts))
286 | 
--------------------------------------------------------------------------------
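
Usage sketch for /source/tasks/gridworld.py (the Shapes task listed earlier). This is a minimal, hypothetical example and is not part of the repository: the maze layout and shape rewards below are made up, and it assumes the source/ directory is on the Python path so that tasks.gridworld is importable. It exercises transition(), encode() and features(), and checks that every observed reward is reproduced by the linear model phi(s, a, s') . w returned by get_w().

# Hypothetical usage of the Shapes gridworld task (maze and rewards invented for illustration).
import random

import numpy as np

from tasks.gridworld import Shapes  # assumes source/ is on the Python path

# '_' are possible start cells, 'X' is a wall, 'G' is the goal, digits are shape types
maze = np.array([['_', ' ', '1'],
                 [' ', 'X', ' '],
                 ['2', ' ', 'G']])
shape_rewards = {'1': 1.0, '2': 0.5}  # reward for collecting each shape type

task = Shapes(maze, shape_rewards)
w = task.get_w()  # true reward weights: one per shape type, plus one for reaching the goal

s = task.initialize()
assert task.encode(s).shape == (1, task.encode_dim())  # one-hot row, one-hot column, collected flags

for _ in range(100):
    a = random.choice([Shapes.LEFT, Shapes.UP, Shapes.RIGHT, Shapes.DOWN])
    s1, r, done = task.transition(a)
    phi = task.features(s, a, s1)
    # the reward is linear in the features: r = phi . w
    assert np.isclose(r, float(phi @ w))
    s = task.initialize() if done else s1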
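The SF class in /source/features/successor.py leaves build_successor and get_successor(s) abstract, but its two core computations are concrete: the reward-weight update w <- w + alpha_w * (r - phi . w) * phi used in update_reward, and the evaluation of every stored policy under one task's reward weights used in GPI_w. The snippet below is a self-contained numpy sketch of exactly those two steps on randomly generated placeholder arrays; it does not call the repository code, and all array shapes simply follow the docstrings above.

# Stand-alone numpy illustration of the two core SF computations:
# (1) the reward-weight update used in SF.update_reward, and
# (2) the Q-value / active-policy computation used in SF.GPI_w.
# All arrays are random placeholders; shapes follow the docstrings in successor.py.
import numpy as np

rng = np.random.default_rng(0)
n_batch, n_tasks, n_actions, n_features = 1, 3, 4, 5
alpha_w = 0.5

# (1) one gradient step on the squared error of the linear reward model r ~ phi . w
w = np.zeros((n_features, 1))
phi = rng.random((n_features, 1))
r = 0.7
r_fit = np.sum(phi * w)
w = w + alpha_w * (r - r_fit) * phi                  # same update as update_reward

# (2) GPI: evaluate every stored policy's successor features under w and pick,
# per state, the policy whose greedy action has the highest value
psi = rng.random((n_batch, n_tasks, n_actions, n_features))
q = (psi @ w)[:, :, :, 0]                            # shape (n_batch, n_tasks, n_actions)
best_task = np.squeeze(np.argmax(np.max(q, axis=2), axis=1))
gpi_action = np.argmax(np.max(q, axis=1), axis=-1)   # greedy GPI action per state
print(q.shape, int(best_task), gpi_action)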