├── irl ├── __init__.py ├── mdp │ ├── __init__.py │ ├── gridworld_test.py │ ├── objectworld.py │ └── gridworld.py ├── value_iteration.py ├── linear_irl.py ├── maxent.py └── deep_maxent.py ├── .gitignore ├── LICENSE ├── examples ├── lp_gridworld.py ├── maxent_gridworld.py ├── lp_large_gridworld.py ├── maxent_objectworld.py ├── deep_maxent_objectworld.py └── experiments.py └── README.md /irl/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /irl/mdp/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Matthew Alger 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /irl/mdp/gridworld_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unit tests for the gridworld MDP. 
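These tests import `gridworld` directly rather than through the `irl.mdp` package, so they are most easily run from inside irl/mdp/, e.g. `python gridworld_test.py`, which invokes the `unittest.main()` call at the bottom of this file.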
3 | 4 | Matthew Alger, 2016 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | import unittest 9 | 10 | import numpy as np 11 | import numpy.random as rn 12 | 13 | import gridworld 14 | 15 | 16 | def make_random_gridworld(): 17 | grid_size = rn.randint(2, 15) 18 | wind = rn.uniform(0.0, 1.0) 19 | discount = rn.uniform(0.0, 1.0) 20 | return gridworld.Gridworld(grid_size, wind, discount) 21 | 22 | 23 | class TestTransitionProbability(unittest.TestCase): 24 | """Tests for Gridworld.transition_probability.""" 25 | 26 | def test_sums_to_one(self): 27 | """Tests that the sum of transition probabilities is approximately 1.""" 28 | # This is a simple fuzz-test. 29 | for _ in range(40): 30 | gw = make_random_gridworld() 31 | self.assertTrue( 32 | np.isclose(gw.transition_probability.sum(axis=2), 1).all(), 33 | 'Probabilities don\'t sum to 1: {}'.format(gw)) 34 | 35 | def test_manual_sums_to_one(self): 36 | """Tests issue #1 on GitHub.""" 37 | gw = gridworld.Gridworld(5, 0.3, 0.2) 38 | self.assertTrue( 39 | np.isclose(gw.transition_probability.sum(axis=2), 1).all()) 40 | 41 | if __name__ == '__main__': 42 | unittest.main() -------------------------------------------------------------------------------- /examples/lp_gridworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run linear programming inverse reinforcement learning on the gridworld MDP. 3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | 11 | import irl.linear_irl as linear_irl 12 | import irl.mdp.gridworld as gridworld 13 | 14 | def main(grid_size, discount): 15 | """ 16 | Run linear programming inverse reinforcement learning on the gridworld MDP. 17 | 18 | Plots the reward function. 19 | 20 | grid_size: Grid size. int. 21 | discount: MDP discount factor. float. 22 | """ 23 | 24 | wind = 0.3 25 | trajectory_length = 3*grid_size 26 | 27 | gw = gridworld.Gridworld(grid_size, wind, discount) 28 | 29 | ground_r = np.array([gw.reward(s) for s in range(gw.n_states)]) 30 | policy = [gw.optimal_policy_deterministic(s) for s in range(gw.n_states)] 31 | r = linear_irl.irl(gw.n_states, gw.n_actions, gw.transition_probability, 32 | policy, gw.discount, 1, 5) 33 | 34 | plt.subplot(1, 2, 1) 35 | plt.pcolor(ground_r.reshape((grid_size, grid_size))) 36 | plt.colorbar() 37 | plt.title("Groundtruth reward") 38 | plt.subplot(1, 2, 2) 39 | plt.pcolor(r.reshape((grid_size, grid_size))) 40 | plt.colorbar() 41 | plt.title("Recovered reward") 42 | plt.show() 43 | 44 | if __name__ == '__main__': 45 | main(5, 0.2) 46 | -------------------------------------------------------------------------------- /examples/maxent_gridworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run maximum entropy inverse reinforcement learning on the gridworld MDP. 3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | 11 | import irl.maxent as maxent 12 | import irl.mdp.gridworld as gridworld 13 | 14 | def main(grid_size, discount, n_trajectories, epochs, learning_rate): 15 | """ 16 | Run maximum entropy inverse reinforcement learning on the gridworld MDP. 17 | 18 | Plots the reward function. 19 | 20 | grid_size: Grid size. int. 21 | discount: MDP discount factor. float. 22 | n_trajectories: Number of sampled trajectories. int. 23 | epochs: Gradient descent iterations. int. 24 | learning_rate: Gradient descent learning rate. float. 
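For example, the __main__ block at the bottom of this file runs main(5, 0.01, 20, 200, 0.01): a 5x5 grid, discount 0.01, 20 trajectories, 200 epochs and a learning rate of 0.01.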
25 | """ 26 | 27 | wind = 0.3 28 | trajectory_length = 3*grid_size 29 | 30 | gw = gridworld.Gridworld(grid_size, wind, discount) 31 | trajectories = gw.generate_trajectories(n_trajectories, 32 | trajectory_length, 33 | gw.optimal_policy) 34 | feature_matrix = gw.feature_matrix() 35 | ground_r = np.array([gw.reward(s) for s in range(gw.n_states)]) 36 | r = maxent.irl(feature_matrix, gw.n_actions, discount, 37 | gw.transition_probability, trajectories, epochs, learning_rate) 38 | 39 | plt.subplot(1, 2, 1) 40 | plt.pcolor(ground_r.reshape((grid_size, grid_size))) 41 | plt.colorbar() 42 | plt.title("Groundtruth reward") 43 | plt.subplot(1, 2, 2) 44 | plt.pcolor(r.reshape((grid_size, grid_size))) 45 | plt.colorbar() 46 | plt.title("Recovered reward") 47 | plt.show() 48 | 49 | if __name__ == '__main__': 50 | main(5, 0.01, 20, 200, 0.01) 51 | -------------------------------------------------------------------------------- /examples/lp_large_gridworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run large state space linear programming inverse reinforcement learning on the 3 | gridworld MDP. 4 | 5 | Matthew Alger, 2015 6 | matthew.alger@anu.edu.au 7 | """ 8 | 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | 12 | import irl.linear_irl as linear_irl 13 | import irl.mdp.gridworld as gridworld 14 | from irl.value_iteration import value 15 | 16 | def main(grid_size, discount): 17 | """ 18 | Run large state space linear programming inverse reinforcement learning on 19 | the gridworld MDP. 20 | 21 | Plots the reward function. 22 | 23 | grid_size: Grid size. int. 24 | discount: MDP discount factor. float. 25 | """ 26 | 27 | wind = 0.3 28 | trajectory_length = 3*grid_size 29 | 30 | gw = gridworld.Gridworld(grid_size, wind, discount) 31 | 32 | ground_r = np.array([gw.reward(s) for s in range(gw.n_states)]) 33 | policy = [gw.optimal_policy_deterministic(s) for s in range(gw.n_states)] 34 | 35 | # Need a value function for each basis function. 36 | feature_matrix = gw.feature_matrix() 37 | values = [] 38 | for dim in range(feature_matrix.shape[1]): 39 | reward = feature_matrix[:, dim] 40 | values.append(value(policy, gw.n_states, gw.transition_probability, 41 | reward, gw.discount)) 42 | values = np.array(values) 43 | 44 | r = linear_irl.large_irl(values, gw.transition_probability, 45 | feature_matrix, gw.n_states, gw.n_actions, policy) 46 | 47 | plt.subplot(1, 2, 1) 48 | plt.pcolor(ground_r.reshape((grid_size, grid_size))) 49 | plt.colorbar() 50 | plt.title("Groundtruth reward") 51 | plt.subplot(1, 2, 2) 52 | plt.pcolor(r.reshape((grid_size, grid_size))) 53 | plt.colorbar() 54 | plt.title("Recovered reward") 55 | plt.show() 56 | 57 | if __name__ == '__main__': 58 | main(10, 0.9) 59 | -------------------------------------------------------------------------------- /examples/maxent_objectworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run maximum entropy inverse reinforcement learning on the objectworld MDP. 3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | 11 | import irl.maxent as maxent 12 | import irl.mdp.objectworld as objectworld 13 | from irl.value_iteration import find_policy 14 | 15 | def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs, 16 | learning_rate): 17 | """ 18 | Run maximum entropy inverse reinforcement learning on the objectworld MDP. 
19 | 20 | Plots the reward function. 21 | 22 | grid_size: Grid size. int. 23 | discount: MDP discount factor. float. 24 | n_objects: Number of objects. int. 25 | n_colours: Number of colours. int. 26 | n_trajectories: Number of sampled trajectories. int. 27 | epochs: Gradient descent iterations. int. 28 | learning_rate: Gradient descent learning rate. float. 29 | """ 30 | 31 | wind = 0.3 32 | trajectory_length = 8 33 | 34 | ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind, 35 | discount) 36 | ground_r = np.array([ow.reward(s) for s in range(ow.n_states)]) 37 | policy = find_policy(ow.n_states, ow.n_actions, ow.transition_probability, 38 | ground_r, ow.discount, stochastic=False) 39 | trajectories = ow.generate_trajectories(n_trajectories, 40 | trajectory_length, 41 | lambda s: policy[s]) 42 | feature_matrix = ow.feature_matrix(discrete=False) 43 | r = maxent.irl(feature_matrix, ow.n_actions, discount, 44 | ow.transition_probability, trajectories, epochs, learning_rate) 45 | 46 | plt.subplot(1, 2, 1) 47 | plt.pcolor(ground_r.reshape((grid_size, grid_size))) 48 | plt.colorbar() 49 | plt.title("Groundtruth reward") 50 | plt.subplot(1, 2, 2) 51 | plt.pcolor(r.reshape((grid_size, grid_size))) 52 | plt.colorbar() 53 | plt.title("Recovered reward") 54 | plt.show() 55 | 56 | if __name__ == '__main__': 57 | main(10, 0.9, 15, 2, 20, 50, 0.01) 58 | -------------------------------------------------------------------------------- /examples/deep_maxent_objectworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run maximum entropy inverse reinforcement learning on the objectworld MDP. 3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | 11 | import irl.deep_maxent as deep_maxent 12 | import irl.mdp.objectworld as objectworld 13 | from irl.value_iteration import find_policy 14 | 15 | def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs, 16 | learning_rate, structure): 17 | """ 18 | Run deep maximum entropy inverse reinforcement learning on the objectworld 19 | MDP. 20 | 21 | Plots the reward function. 22 | 23 | grid_size: Grid size. int. 24 | discount: MDP discount factor. float. 25 | n_objects: Number of objects. int. 26 | n_colours: Number of colours. int. 27 | n_trajectories: Number of sampled trajectories. int. 28 | epochs: Gradient descent iterations. int. 29 | learning_rate: Gradient descent learning rate. float. 30 | structure: Neural network structure. Tuple of hidden layer dimensions, e.g., 31 | () is no neural network (linear maximum entropy) and (3, 4) is two 32 | hidden layers with dimensions 3 and 4. 
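For example, the __main__ block at the bottom of this file passes structure=(3, 3), i.e. two hidden layers of three units each.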
33 | """ 34 | 35 | wind = 0.3 36 | trajectory_length = 8 37 | l1 = l2 = 0 38 | 39 | ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind, 40 | discount) 41 | ground_r = np.array([ow.reward(s) for s in range(ow.n_states)]) 42 | policy = find_policy(ow.n_states, ow.n_actions, ow.transition_probability, 43 | ground_r, ow.discount, stochastic=False) 44 | trajectories = ow.generate_trajectories(n_trajectories, 45 | trajectory_length, 46 | lambda s: policy[s]) 47 | feature_matrix = ow.feature_matrix(discrete=False) 48 | r = deep_maxent.irl((feature_matrix.shape[1],) + structure, feature_matrix, 49 | ow.n_actions, discount, ow.transition_probability, trajectories, epochs, 50 | learning_rate, l1=l1, l2=l2) 51 | 52 | plt.subplot(1, 2, 1) 53 | plt.pcolor(ground_r.reshape((grid_size, grid_size))) 54 | plt.colorbar() 55 | plt.title("Groundtruth reward") 56 | plt.subplot(1, 2, 2) 57 | plt.pcolor(r.reshape((grid_size, grid_size))) 58 | plt.colorbar() 59 | plt.title("Recovered reward") 60 | plt.show() 61 | 62 | if __name__ == '__main__': 63 | main(10, 0.9, 15, 2, 20, 50, 0.01, (3, 3)) 64 | -------------------------------------------------------------------------------- /irl/value_iteration.py: -------------------------------------------------------------------------------- 1 | """ 2 | Find the value function associated with a policy. Based on Sutton & Barto, 1998. 3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | import numpy as np 9 | 10 | def value(policy, n_states, transition_probabilities, reward, discount, 11 | threshold=1e-2): 12 | """ 13 | Find the value function associated with a policy. 14 | 15 | policy: List of action ints for each state. 16 | n_states: Number of states. int. 17 | transition_probabilities: Function taking (state, action, state) to 18 | transition probabilities. 19 | reward: Vector of rewards for each state. 20 | discount: MDP discount factor. float. 21 | threshold: Convergence threshold, default 1e-2. float. 22 | -> Array of values for each state 23 | """ 24 | v = np.zeros(n_states) 25 | 26 | diff = float("inf") 27 | while diff > threshold: 28 | diff = 0 29 | for s in range(n_states): 30 | vs = v[s] 31 | a = policy[s] 32 | v[s] = sum(transition_probabilities[s, a, k] * 33 | (reward[k] + discount * v[k]) 34 | for k in range(n_states)) 35 | diff = max(diff, abs(vs - v[s])) 36 | 37 | return v 38 | 39 | def optimal_value(n_states, n_actions, transition_probabilities, reward, 40 | discount, threshold=1e-2): 41 | """ 42 | Find the optimal value function. 43 | 44 | n_states: Number of states. int. 45 | n_actions: Number of actions. int. 46 | transition_probabilities: Function taking (state, action, state) to 47 | transition probabilities. 48 | reward: Vector of rewards for each state. 49 | discount: MDP discount factor. float. 50 | threshold: Convergence threshold, default 1e-2. float. 51 | -> Array of values for each state 52 | """ 53 | 54 | v = np.zeros(n_states) 55 | 56 | diff = float("inf") 57 | while diff > threshold: 58 | diff = 0 59 | for s in range(n_states): 60 | max_v = float("-inf") 61 | for a in range(n_actions): 62 | tp = transition_probabilities[s, a, :] 63 | max_v = max(max_v, np.dot(tp, reward + discount*v)) 64 | 65 | new_diff = abs(v[s] - max_v) 66 | if new_diff > diff: 67 | diff = new_diff 68 | v[s] = max_v 69 | 70 | return v 71 | 72 | def find_policy(n_states, n_actions, transition_probabilities, reward, discount, 73 | threshold=1e-2, v=None, stochastic=True): 74 | """ 75 | Find the optimal policy. 
76 | 77 | n_states: Number of states. int. 78 | n_actions: Number of actions. int. 79 | transition_probabilities: Function taking (state, action, state) to 80 | transition probabilities. 81 | reward: Vector of rewards for each state. 82 | discount: MDP discount factor. float. 83 | threshold: Convergence threshold, default 1e-2. float. 84 | v: Value function (if known). Default None. 85 | stochastic: Whether the policy should be stochastic. Default True. 86 | -> Action probabilities for each state or action int for each state 87 | (depending on stochasticity). 88 | """ 89 | 90 | if v is None: 91 | v = optimal_value(n_states, n_actions, transition_probabilities, reward, 92 | discount, threshold) 93 | 94 | if stochastic: 95 | # Get Q using equation 9.2 from Ziebart's thesis. 96 | Q = np.zeros((n_states, n_actions)) 97 | for i in range(n_states): 98 | for j in range(n_actions): 99 | p = transition_probabilities[i, j, :] 100 | Q[i, j] = p.dot(reward + discount*v) 101 | Q -= Q.max(axis=1).reshape((n_states, 1)) # For numerical stability. 102 | Q = np.exp(Q)/np.exp(Q).sum(axis=1).reshape((n_states, 1)) 103 | return Q 104 | 105 | def _policy(s): 106 | return max(range(n_actions), 107 | key=lambda a: sum(transition_probabilities[s, a, k] * 108 | (reward[k] + discount * v[k]) 109 | for k in range(n_states))) 110 | policy = np.array([_policy(s) for s in range(n_states)]) 111 | return policy 112 | 113 | if __name__ == '__main__': 114 | # Quick unit test using gridworld. 115 | import mdp.gridworld as gridworld 116 | gw = gridworld.Gridworld(3, 0.3, 0.9) 117 | v = value([gw.optimal_policy_deterministic(s) for s in range(gw.n_states)], 118 | gw.n_states, 119 | gw.transition_probability, 120 | [gw.reward(s) for s in range(gw.n_states)], 121 | gw.discount) 122 | assert np.isclose(v, 123 | [5.7194282, 6.46706692, 6.42589811, 124 | 6.46706692, 7.47058224, 7.96505174, 125 | 6.42589811, 7.96505174, 8.19268666], 1).all() 126 | opt_v = optimal_value(gw.n_states, 127 | gw.n_actions, 128 | gw.transition_probability, 129 | [gw.reward(s) for s in range(gw.n_states)], 130 | gw.discount) 131 | assert np.isclose(v, opt_v).all() 132 | -------------------------------------------------------------------------------- /irl/mdp/objectworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements the objectworld MDP described in Levine et al. 2011. 3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | import math 9 | from itertools import product 10 | 11 | import numpy as np 12 | import numpy.random as rn 13 | 14 | from .gridworld import Gridworld 15 | 16 | class OWObject(object): 17 | """ 18 | Object in objectworld. 19 | """ 20 | 21 | def __init__(self, inner_colour, outer_colour): 22 | """ 23 | inner_colour: Inner colour of object. int. 24 | outer_colour: Outer colour of object. int. 25 | -> OWObject 26 | """ 27 | 28 | self.inner_colour = inner_colour 29 | self.outer_colour = outer_colour 30 | 31 | def __str__(self): 32 | """ 33 | A string representation of this object. 34 | 35 | -> __str__ 36 | """ 37 | 38 | return "<OWObject (In: {}) (Out: {})>".format(self.inner_colour, 39 | self.outer_colour) 40 | 41 | class Objectworld(Gridworld): 42 | """ 43 | Objectworld MDP. 44 | """ 45 | 46 | def __init__(self, grid_size, n_objects, n_colours, wind, discount): 47 | """ 48 | grid_size: Grid size. int. 49 | n_objects: Number of objects in the world. int. 50 | n_colours: Number of colours to colour objects with. int. 51 | wind: Chance of moving randomly. float. 52 | discount: MDP discount.
float. 53 | -> Objectworld 54 | """ 55 | 56 | super().__init__(grid_size, wind, discount) 57 | 58 | self.actions = ((1, 0), (0, 1), (-1, 0), (0, -1), (0, 0)) 59 | self.n_actions = len(self.actions) 60 | self.n_objects = n_objects 61 | self.n_colours = n_colours 62 | 63 | # Generate objects. 64 | self.objects = {} 65 | for _ in range(self.n_objects): 66 | obj = OWObject(rn.randint(self.n_colours), 67 | rn.randint(self.n_colours)) 68 | 69 | while True: 70 | x = rn.randint(self.grid_size) 71 | y = rn.randint(self.grid_size) 72 | 73 | if (x, y) not in self.objects: 74 | break 75 | 76 | self.objects[x, y] = obj 77 | 78 | # Preconstruct the transition probability array. 79 | self.transition_probability = np.array( 80 | [[[self._transition_probability(i, j, k) 81 | for k in range(self.n_states)] 82 | for j in range(self.n_actions)] 83 | for i in range(self.n_states)]) 84 | 85 | def feature_vector(self, i, discrete=True): 86 | """ 87 | Get the feature vector associated with a state integer. 88 | 89 | i: State int. 90 | discrete: Whether the feature vectors should be discrete (default True). 91 | bool. 92 | -> Feature vector. 93 | """ 94 | 95 | sx, sy = self.int_to_point(i) 96 | 97 | nearest_inner = {} # colour: distance 98 | nearest_outer = {} # colour: distance 99 | 100 | for y in range(self.grid_size): 101 | for x in range(self.grid_size): 102 | if (x, y) in self.objects: 103 | dist = math.hypot((x - sx), (y - sy)) 104 | obj = self.objects[x, y] 105 | if obj.inner_colour in nearest_inner: 106 | if dist < nearest_inner[obj.inner_colour]: 107 | nearest_inner[obj.inner_colour] = dist 108 | else: 109 | nearest_inner[obj.inner_colour] = dist 110 | if obj.outer_colour in nearest_outer: 111 | if dist < nearest_outer[obj.outer_colour]: 112 | nearest_outer[obj.outer_colour] = dist 113 | else: 114 | nearest_outer[obj.outer_colour] = dist 115 | 116 | # Need to ensure that all colours are represented. 117 | for c in range(self.n_colours): 118 | if c not in nearest_inner: 119 | nearest_inner[c] = 0 120 | if c not in nearest_outer: 121 | nearest_outer[c] = 0 122 | 123 | if discrete: 124 | state = np.zeros((2*self.n_colours*self.grid_size,)) 125 | i = 0 126 | for c in range(self.n_colours): 127 | for d in range(1, self.grid_size+1): 128 | if nearest_inner[c] < d: 129 | state[i] = 1 130 | i += 1 131 | if nearest_outer[c] < d: 132 | state[i] = 1 133 | i += 1 134 | assert i == 2*self.n_colours*self.grid_size 135 | assert (state >= 0).all() 136 | else: 137 | # Continuous features. 138 | state = np.zeros((2*self.n_colours)) 139 | i = 0 140 | for c in range(self.n_colours): 141 | state[i] = nearest_inner[c] 142 | i += 1 143 | state[i] = nearest_outer[c] 144 | i += 1 145 | 146 | return state 147 | 148 | def feature_matrix(self, discrete=True): 149 | """ 150 | Get the feature matrix for this objectworld. 151 | 152 | discrete: Whether the feature vectors should be discrete (default True). 153 | bool. 154 | -> NumPy array with shape (n_states, n_states). 155 | """ 156 | 157 | return np.array([self.feature_vector(i, discrete) 158 | for i in range(self.n_states)]) 159 | 160 | def reward(self, state_int): 161 | """ 162 | Get the reward for a state int. 163 | 164 | state_int: State int. 
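Following the objectworld construction implemented below, the reward is 1 if the state is within Manhattan distance 3 of an outer-colour-0 object and within distance 2 of an outer-colour-1 object, -1 if it is only within distance 3 of an outer-colour-0 object, and 0 otherwise.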
165 | -> reward float 166 | """ 167 | 168 | x, y = self.int_to_point(state_int) 169 | 170 | near_c0 = False 171 | near_c1 = False 172 | for (dx, dy) in product(range(-3, 4), range(-3, 4)): 173 | if 0 <= x + dx < self.grid_size and 0 <= y + dy < self.grid_size: 174 | if (abs(dx) + abs(dy) <= 3 and 175 | (x+dx, y+dy) in self.objects and 176 | self.objects[x+dx, y+dy].outer_colour == 0): 177 | near_c0 = True 178 | if (abs(dx) + abs(dy) <= 2 and 179 | (x+dx, y+dy) in self.objects and 180 | self.objects[x+dx, y+dy].outer_colour == 1): 181 | near_c1 = True 182 | 183 | if near_c0 and near_c1: 184 | return 1 185 | if near_c0: 186 | return -1 187 | return 0 188 | 189 | def generate_trajectories(self, n_trajectories, trajectory_length, policy): 190 | """ 191 | Generate n_trajectories trajectories with length trajectory_length. 192 | 193 | n_trajectories: Number of trajectories. int. 194 | trajectory_length: Length of an episode. int. 195 | policy: Map from state integers to action integers. 196 | -> [[(state int, action int, reward float)]] 197 | """ 198 | 199 | return super().generate_trajectories(n_trajectories, trajectory_length, 200 | policy, 201 | True) 202 | 203 | def optimal_policy(self, state_int): 204 | raise NotImplementedError( 205 | "Optimal policy is not implemented for Objectworld.") 206 | def optimal_policy_deterministic(self, state_int): 207 | raise NotImplementedError( 208 | "Optimal policy is not implemented for Objectworld.") 209 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Inverse Reinforcement Learning 2 | 3 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.555999.svg)](https://doi.org/10.5281/zenodo.555999) 4 | 5 | Implements selected inverse reinforcement learning (IRL) algorithms as part of COMP3710, supervised by Dr Mayank Daswani and Dr Marcus Hutter. My final report is available [here](https://alger.au/pdfs/irl.pdf) and describes the implemented algorithms. 6 | 7 | If you use this code in your work, you can cite it as follows: 8 | ```bibtex 9 | @misc{alger16, 10 | author = {Matthew Alger}, 11 | title = {Inverse Reinforcement Learning}, 12 | year = 2016, 13 | doi = {10.5281/zenodo.555999}, 14 | url = {https://doi.org/10.5281/zenodo.555999} 15 | } 16 | ``` 17 | 18 | ## Algorithms implemented 19 | 20 | - Linear programming IRL. From Ng & Russell, 2000. Small state space and large state space linear programming IRL. 21 | - Maximum entropy IRL. From Ziebart et al., 2008. 22 | - Deep maximum entropy IRL. From Wulfmeier et al., 2015; original derivation. 23 | 24 | Additionally, the following MDP domains are implemented: 25 | - Gridworld (Sutton, 1998) 26 | - Objectworld (Levine et al., 2011) 27 | 28 | ## Requirements 29 | - NumPy 30 | - SciPy 31 | - CVXOPT 32 | - Theano 33 | - MatPlotLib (for examples) 34 | 35 | ## Module documentation 36 | 37 | Following is a brief list of functions and classes exported by modules. Full documentation is included in the docstrings of each function or class; only functions and classes intended for use outside the module are documented here. 38 | 39 | ### linear_irl 40 | 41 | Implements linear programming inverse reinforcement learning (Ng & Russell, 2000). 42 | 43 | **Functions:** 44 | 45 | - `irl(n_states, n_actions, transition_probability, policy, discount, Rmax, l1)`: Find a reward function with inverse RL. 
46 | - `large_irl(value, transition_probability, feature_matrix, n_states, n_actions, policy)`: Find the reward in a large state space. 47 | 48 | ### maxent 49 | 50 | Implements maximum entropy inverse reinforcement learning (Ziebart et al., 2008). 51 | 52 | **Functions:** 53 | 54 | - `irl(feature_matrix, n_actions, discount, transition_probability, trajectories, epochs, learning_rate)`: Find the reward function for the given trajectories. 55 | - `find_svf(n_states, trajectories)`: Find the state visitation frequency from trajectories. 56 | - `find_feature_expectations(feature_matrix, trajectories)`: Find the feature expectations for the given trajectories. This is the average path feature vector. 57 | - `find_expected_svf(n_states, r, n_actions, discount, transition_probability, trajectories)`: Find the expected state visitation frequencies using algorithm 1 from Ziebart et al. 2008. 58 | - `expected_value_difference(n_states, n_actions, transition_probability, reward, discount, p_start_state, optimal_value, true_reward)`: Calculate the expected value difference, which is a proxy for how good a recovered reward function is. 59 | 60 | ### deep_maxent 61 | 62 | Implements deep maximum entropy inverse reinforcement learning based on Ziebart et al., 2008 and Wulfmeier et al., 2015, using symbolic methods with Theano. 63 | 64 | **Functions:** 65 | 66 | - `irl(structure, feature_matrix, n_actions, discount, transition_probability, trajectories, epochs, learning_rate, initialisation="normal", l1=0.1, l2=0.1)`: Find the reward function for the given trajectories. 67 | - `find_svf(n_states, trajectories)`: Find the state visitation frequency from trajectories. 68 | - `find_expected_svf(n_states, r, n_actions, discount, transition_probability, trajectories)`: Find the expected state visitation frequencies using algorithm 1 from Ziebart et al. 2008. 69 | 70 | ### value_iteration 71 | 72 | Find the value function associated with a policy. Based on Sutton & Barto, 1998. 73 | 74 | **Functions:** 75 | 76 | - `value(policy, n_states, transition_probabilities, reward, discount, threshold=1e-2)`: Find the value function associated with a policy. 77 | - `optimal_value(n_states, n_actions, transition_probabilities, reward, discount, threshold=1e-2)`: Find the optimal value function. 78 | - `find_policy(n_states, n_actions, transition_probabilities, reward, discount, threshold=1e-2, v=None, stochastic=True)`: Find the optimal policy. 79 | 80 | ### mdp 81 | 82 | #### gridworld 83 | 84 | Implements the gridworld MDP. 85 | 86 | **Classes, instance attributes, methods:** 87 | 88 | - `Gridworld(grid_size, wind, discount)`: Gridworld MDP. 89 | - `actions`: Tuple of (dx, dy) actions. 90 | - `n_actions`: Number of actions. int. 91 | - `n_states`: Number of states. int. 92 | - `grid_size`: Size of grid. int. 93 | - `wind`: Chance of moving randomly. float. 94 | - `discount`: MDP discount factor. float. 95 | - `transition_probability`: NumPy array with shape (n_states, n_actions, n_states) where `transition_probability[si, a, sk]` is the probability of transitioning from state si to state sk under action a. 96 | - `feature_vector(i, feature_map="ident")`: Get the feature vector associated with a state integer. 97 | - `feature_matrix(feature_map="ident")`: Get the feature matrix for this gridworld. 98 | - `int_to_point(i)`: Convert a state int into the corresponding coordinate.
99 | - `point_to_int(p)`: Convert a coordinate into the corresponding state int. 100 | - `neighbouring(i, k)`: Get whether two points neighbour each other. Also returns true if they are the same point. 101 | - `reward(state_int)`: Reward for being in state state_int. 102 | - `average_reward(n_trajectories, trajectory_length, policy)`: Calculate the average total reward obtained by following a given policy over n_paths paths. 103 | - `optimal_policy(state_int)`: The optimal policy for this gridworld. 104 | - `optimal_policy_deterministic(state_int)`: Deterministic version of the optimal policy for this gridworld. 105 | - `generate_trajectories(n_trajectories, trajectory_length, policy, random_start=False)`: Generate n_trajectories trajectories with length trajectory_length, following the given policy. 106 | 107 | #### objectworld 108 | 109 | Implements the objectworld MDP described in Levine et al. 2011. 110 | 111 | **Classes, instance attributes, methods:** 112 | 113 | - `OWObject(inner_colour, outer_colour)`: Object in objectworld. 114 | - `inner_colour`: Inner colour of object. int. 115 | - `outer_colour`: Outer colour of object. int. 116 | 117 | - `Objectworld(grid_size, n_objects, n_colours, wind, discount)`: Objectworld MDP. 118 | - `actions`: Tuple of (dx, dy) actions. 119 | - `n_actions`: Number of actions. int. 120 | - `n_states`: Number of states. int. 121 | - `grid_size`: Size of grid. int. 122 | - `n_objects`: Number of objects in the world. int. 123 | - `n_colours`: Number of colours to colour objects with. int. 124 | - `wind`: Chance of moving randomly. float. 125 | - `discount`: MDP discount factor. float. 126 | - `objects`: Set of objects in the world. 127 | - `transition_probability`: NumPy array with shape (n_states, n_actions, n_states) where `transition_probability[si, a, sk]` is the probability of transitioning from state si to state sk under action a. 128 | - `feature_vector(i, discrete=True)`: Get the feature vector associated with a state integer. 129 | - `feature_matrix(discrete=True)`: Get the feature matrix for this gridworld. 130 | - `int_to_point(i)`: Convert a state int into the corresponding coordinate. 131 | - `point_to_int(p)`: Convert a coordinate into the corresponding state int. 132 | - `neighbouring(i, k)`: Get whether two points neighbour each other. Also returns true if they are the same point. 133 | - `reward(state_int)`: Reward for being in state state_int. 134 | - `average_reward(n_trajectories, trajectory_length, policy)`: Calculate the average total reward obtained by following a given policy over n_paths paths. 135 | - `generate_trajectories(n_trajectories, trajectory_length, policy)`: Generate n_trajectories trajectories with length trajectory_length, following the given policy. 136 | -------------------------------------------------------------------------------- /irl/linear_irl.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements LP IRL from Ng & Russell, 2000. 3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | import random 9 | 10 | import numpy as np 11 | from cvxopt import matrix, solvers 12 | 13 | def irl(n_states, n_actions, transition_probability, policy, discount, Rmax, 14 | l1): 15 | """ 16 | Find a reward function with inverse RL as described in Ng & Russell, 2000. 17 | 18 | n_states: Number of states. int. 19 | n_actions: Number of actions. int. 
20 | transition_probability: NumPy array mapping (state_i, action, state_k) to 21 | the probability of transitioning from state_i to state_k under action. 22 | Shape (N, A, N). 23 | policy: Vector mapping state ints to action ints. Shape (N,). 24 | discount: Discount factor. float. 25 | Rmax: Maximum reward. float. 26 | l1: l1 regularisation. float. 27 | -> Reward vector 28 | """ 29 | 30 | A = set(range(n_actions)) # Set of actions to help manage reordering 31 | # actions. 32 | # The transition policy convention is different here to the rest of the code 33 | # for legacy reasons; here, we reorder axes to fix this. We expect the 34 | # new probabilities to be of the shape (A, N, N). 35 | transition_probability = np.transpose(transition_probability, (1, 0, 2)) 36 | 37 | def T(a, s): 38 | """ 39 | Shorthand for a dot product used a lot in the LP formulation. 40 | """ 41 | 42 | return np.dot(transition_probability[policy[s], s] - 43 | transition_probability[a, s], 44 | np.linalg.inv(np.eye(n_states) - 45 | discount*transition_probability[policy[s]])) 46 | 47 | # This entire function just computes the block matrices used for the LP 48 | # formulation of IRL. 49 | 50 | # Minimise c . x. 51 | c = -np.hstack([np.zeros(n_states), np.ones(n_states), 52 | -l1*np.ones(n_states)]) 53 | zero_stack1 = np.zeros((n_states*(n_actions-1), n_states)) 54 | T_stack = np.vstack([ 55 | -T(a, s) 56 | for s in range(n_states) 57 | for a in A - {policy[s]} 58 | ]) 59 | I_stack1 = np.vstack([ 60 | np.eye(1, n_states, s) 61 | for s in range(n_states) 62 | for a in A - {policy[s]} 63 | ]) 64 | I_stack2 = np.eye(n_states) 65 | zero_stack2 = np.zeros((n_states, n_states)) 66 | 67 | D_left = np.vstack([T_stack, T_stack, -I_stack2, I_stack2]) 68 | D_middle = np.vstack([I_stack1, zero_stack1, zero_stack2, zero_stack2]) 69 | D_right = np.vstack([zero_stack1, zero_stack1, -I_stack2, -I_stack2]) 70 | 71 | D = np.hstack([D_left, D_middle, D_right]) 72 | b = np.zeros((n_states*(n_actions-1)*2 + 2*n_states, 1)) 73 | bounds = np.array([(None, None)]*2*n_states + [(-Rmax, Rmax)]*n_states) 74 | 75 | # We still need to bound R. To do this, we just add 76 | # -I R <= Rmax 1 77 | # I R <= Rmax 1 78 | # So to D we need to add -I and I, and to b we need to add Rmax 1 and Rmax 1 79 | D_bounds = np.hstack([ 80 | np.vstack([ 81 | -np.eye(n_states), 82 | np.eye(n_states)]), 83 | np.vstack([ 84 | np.zeros((n_states, n_states)), 85 | np.zeros((n_states, n_states))]), 86 | np.vstack([ 87 | np.zeros((n_states, n_states)), 88 | np.zeros((n_states, n_states))])]) 89 | b_bounds = np.vstack([Rmax*np.ones((n_states, 1))]*2) 90 | D = np.vstack((D, D_bounds)) 91 | b = np.vstack((b, b_bounds)) 92 | A_ub = matrix(D) 93 | b = matrix(b) 94 | c = matrix(c) 95 | results = solvers.lp(c, A_ub, b) 96 | r = np.asarray(results["x"][:n_states], dtype=np.double) 97 | 98 | return r.reshape((n_states,)) 99 | 100 | def v_tensor(value, transition_probability, feature_dimension, n_states, 101 | n_actions, policy): 102 | """ 103 | Finds the v tensor used in large linear IRL. 104 | 105 | value: NumPy matrix for the value function. The (i, j)th component 106 | represents the value of the jth state under the ith basis function. 107 | transition_probability: NumPy array mapping (state_i, action, state_k) to 108 | the probability of transitioning from state_i to state_k under action. 109 | Shape (N, A, N). 110 | feature_dimension: Dimension of the feature matrix. int. 111 | n_states: Number of states sampled. int. 112 | n_actions: Number of actions. int. 
113 | policy: NumPy array mapping state ints to action ints. 114 | -> v helper tensor. 115 | """ 116 | 117 | v = np.zeros((n_states, n_actions-1, feature_dimension)) 118 | for i in range(n_states): 119 | a1 = policy[i] 120 | exp_on_policy = np.dot(transition_probability[i, a1], value.T) 121 | seen_policy_action = False 122 | for j in range(n_actions): 123 | # Skip this if it's the on-policy action. 124 | if a1 == j: 125 | seen_policy_action = True 126 | continue 127 | 128 | exp_off_policy = np.dot(transition_probability[i, j], value.T) 129 | if seen_policy_action: 130 | v[i, j-1] = exp_on_policy - exp_off_policy 131 | else: 132 | v[i, j] = exp_on_policy - exp_off_policy 133 | return v 134 | 135 | def large_irl(value, transition_probability, feature_matrix, n_states, 136 | n_actions, policy): 137 | """ 138 | Find the reward in a large state space. 139 | 140 | value: NumPy matrix for the value function. The (i, j)th component 141 | represents the value of the jth state under the ith basis function. 142 | transition_probability: NumPy array mapping (state_i, action, state_k) to 143 | the probability of transitioning from state_i to state_k under action. 144 | Shape (N, A, N). 145 | feature_matrix: Matrix with the nth row representing the nth state. NumPy 146 | array with shape (N, D) where N is the number of states and D is the 147 | dimensionality of the state. 148 | n_states: Number of states sampled. int. 149 | n_actions: Number of actions. int. 150 | policy: NumPy array mapping state ints to action ints. 151 | -> Reward for each state in states. 152 | """ 153 | 154 | D = feature_matrix.shape[1] 155 | 156 | # First, calculate v, which is just a helper tensor. 157 | v = v_tensor(value, transition_probability, D, n_states, n_actions, policy) 158 | 159 | # Now we can calculate c, G, h, A, and b. 160 | 161 | # x = [z y_i^+ y_i^- a], which is a [N (K-1)*N (K-1)*N D] vector. 162 | x_size = n_states + (n_actions-1)*n_states*2 + D 163 | 164 | # c is a big stack of ones and zeros; there's N ones and the rest is zero. 165 | c = -np.hstack([np.ones(n_states), np.zeros(x_size - n_states)]) 166 | assert c.shape[0] == x_size 167 | 168 | # A is [0 I_j -I_j -v^T_{ij}] and j NOT EQUAL TO policy(i). 169 | # I believe this is accounted for by the structure of v. 170 | A = np.hstack([ 171 | np.zeros((n_states*(n_actions-1), n_states)), 172 | np.eye(n_states*(n_actions-1)), 173 | -np.eye(n_states*(n_actions-1)), 174 | np.vstack([v[i, j].T for i in range(n_states) 175 | for j in range(n_actions-1)])]) 176 | assert A.shape[1] == x_size 177 | 178 | # b is just zeros! 179 | b = np.zeros(A.shape[0]) 180 | 181 | # Break G up into the bottom row and other rows to construct it. 
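# In the x = [z y+ y- a] layout noted above, the blocks assembled below give
# G x <= h with: a <= 1 and -a <= 1 (bounding each feature weight), then
# -y+ <= 0 and -y- <= 0 (keeping both slack blocks non-negative), and finally
# bottom_row, which adds one row per (state, off-policy action) pair tying
# z_l to the corresponding y+ and y- entries.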
182 | bottom_row = np.vstack([ 183 | np.hstack([ 184 | np.ones((n_actions-1, 1)).dot(np.eye(1, n_states, l)), 185 | np.hstack([-np.eye(n_actions-1) if i == l 186 | else np.zeros((n_actions-1, n_actions-1)) 187 | for i in range(n_states)]), 188 | np.hstack([2*np.eye(n_actions-1) if i == l 189 | else np.zeros((n_actions-1, n_actions-1)) 190 | for i in range(n_states)]), 191 | np.zeros((n_actions-1, D))]) 192 | for l in range(n_states)]) 193 | assert bottom_row.shape[1] == x_size 194 | G = np.vstack([ 195 | np.hstack([ 196 | np.zeros((D, n_states)), 197 | np.zeros((D, n_states*(n_actions-1))), 198 | np.zeros((D, n_states*(n_actions-1))), 199 | np.eye(D)]), 200 | np.hstack([ 201 | np.zeros((D, n_states)), 202 | np.zeros((D, n_states*(n_actions-1))), 203 | np.zeros((D, n_states*(n_actions-1))), 204 | -np.eye(D)]), 205 | np.hstack([ 206 | np.zeros((n_states*(n_actions-1), n_states)), 207 | -np.eye(n_states*(n_actions-1)), 208 | np.zeros((n_states*(n_actions-1), n_states*(n_actions-1))), 209 | np.zeros((n_states*(n_actions-1), D))]), 210 | np.hstack([ 211 | np.zeros((n_states*(n_actions-1), n_states)), 212 | np.zeros((n_states*(n_actions-1), n_states*(n_actions-1))), 213 | -np.eye(n_states*(n_actions-1)), 214 | np.zeros((n_states*(n_actions-1), D))]), 215 | bottom_row]) 216 | assert G.shape[1] == x_size 217 | 218 | h = np.vstack([np.ones((D*2, 1)), 219 | np.zeros((n_states*(n_actions-1)*2+bottom_row.shape[0], 1))]) 220 | 221 | from cvxopt import matrix, solvers 222 | c = matrix(c) 223 | G = matrix(G) 224 | h = matrix(h) 225 | A = matrix(A) 226 | b = matrix(b) 227 | results = solvers.lp(c, G, h, A, b) 228 | alpha = np.asarray(results["x"][-D:], dtype=np.double) 229 | return np.dot(feature_matrix, -alpha) 230 | -------------------------------------------------------------------------------- /irl/maxent.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements maximum entropy inverse reinforcement learning (Ziebart et al., 2008) 3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | from itertools import product 9 | 10 | import numpy as np 11 | import numpy.random as rn 12 | 13 | from . import value_iteration 14 | 15 | def irl(feature_matrix, n_actions, discount, transition_probability, 16 | trajectories, epochs, learning_rate): 17 | """ 18 | Find the reward function for the given trajectories. 19 | 20 | feature_matrix: Matrix with the nth row representing the nth state. NumPy 21 | array with shape (N, D) where N is the number of states and D is the 22 | dimensionality of the state. 23 | n_actions: Number of actions A. int. 24 | discount: Discount factor of the MDP. float. 25 | transition_probability: NumPy array mapping (state_i, action, state_k) to 26 | the probability of transitioning from state_i to state_k under action. 27 | Shape (N, A, N). 28 | trajectories: 3D array of state/action pairs. States are ints, actions 29 | are ints. NumPy array with shape (T, L, 2) where T is the number of 30 | trajectories and L is the trajectory length. 31 | epochs: Number of gradient descent steps. int. 32 | learning_rate: Gradient descent learning rate. float. 33 | -> Reward vector with shape (N,). 34 | """ 35 | 36 | n_states, d_states = feature_matrix.shape 37 | 38 | # Initialise weights. 39 | alpha = rn.uniform(size=(d_states,)) 40 | 41 | # Calculate the feature expectations \tilde{phi}. 42 | feature_expectations = find_feature_expectations(feature_matrix, 43 | trajectories) 44 | 45 | # Gradient descent on alpha. 
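# Each epoch: compute the current reward estimate r = feature_matrix . alpha,
# find the expected state visitation frequencies under that reward, and step
# alpha along the maximum entropy log-likelihood gradient, which is the
# empirical feature expectations minus the expected feature counts
# feature_matrix.T . expected_svf (Ziebart et al., 2008).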
46 | for i in range(epochs): 47 | # print("i: {}".format(i)) 48 | r = feature_matrix.dot(alpha) 49 | expected_svf = find_expected_svf(n_states, r, n_actions, discount, 50 | transition_probability, trajectories) 51 | grad = feature_expectations - feature_matrix.T.dot(expected_svf) 52 | 53 | alpha += learning_rate * grad 54 | 55 | return feature_matrix.dot(alpha).reshape((n_states,)) 56 | 57 | def find_svf(n_states, trajectories): 58 | """ 59 | Find the state visitation frequency from trajectories. 60 | 61 | n_states: Number of states. int. 62 | trajectories: 3D array of state/action pairs. States are ints, actions 63 | are ints. NumPy array with shape (T, L, 2) where T is the number of 64 | trajectories and L is the trajectory length. 65 | -> State visitation frequencies vector with shape (N,). 66 | """ 67 | 68 | svf = np.zeros(n_states) 69 | 70 | for trajectory in trajectories: 71 | for state, _, _ in trajectory: 72 | svf[state] += 1 73 | 74 | svf /= trajectories.shape[0] 75 | 76 | return svf 77 | 78 | def find_feature_expectations(feature_matrix, trajectories): 79 | """ 80 | Find the feature expectations for the given trajectories. This is the 81 | average path feature vector. 82 | 83 | feature_matrix: Matrix with the nth row representing the nth state. NumPy 84 | array with shape (N, D) where N is the number of states and D is the 85 | dimensionality of the state. 86 | trajectories: 3D array of state/action pairs. States are ints, actions 87 | are ints. NumPy array with shape (T, L, 2) where T is the number of 88 | trajectories and L is the trajectory length. 89 | -> Feature expectations vector with shape (D,). 90 | """ 91 | 92 | feature_expectations = np.zeros(feature_matrix.shape[1]) 93 | 94 | for trajectory in trajectories: 95 | for state, _, _ in trajectory: 96 | feature_expectations += feature_matrix[state] 97 | 98 | feature_expectations /= trajectories.shape[0] 99 | 100 | return feature_expectations 101 | 102 | def find_expected_svf(n_states, r, n_actions, discount, 103 | transition_probability, trajectories): 104 | """ 105 | Find the expected state visitation frequencies using algorithm 1 from 106 | Ziebart et al. 2008. 107 | 108 | n_states: Number of states N. int. 109 | alpha: Reward. NumPy array with shape (N,). 110 | n_actions: Number of actions A. int. 111 | discount: Discount factor of the MDP. float. 112 | transition_probability: NumPy array mapping (state_i, action, state_k) to 113 | the probability of transitioning from state_i to state_k under action. 114 | Shape (N, A, N). 115 | trajectories: 3D array of state/action pairs. States are ints, actions 116 | are ints. NumPy array with shape (T, L, 2) where T is the number of 117 | trajectories and L is the trajectory length. 118 | -> Expected state visitation frequencies vector with shape (N,). 
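The computation below is a forward pass: the empirical start-state distribution from the trajectories is propagated through the stochastic policy for the current reward and through the transition model for trajectory_length steps, and the resulting per-timestep state distributions are summed.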
119 | """ 120 | 121 | n_trajectories = trajectories.shape[0] 122 | trajectory_length = trajectories.shape[1] 123 | 124 | # policy = find_policy(n_states, r, n_actions, discount, 125 | # transition_probability) 126 | policy = value_iteration.find_policy(n_states, n_actions, 127 | transition_probability, r, discount) 128 | 129 | start_state_count = np.zeros(n_states) 130 | for trajectory in trajectories: 131 | start_state_count[trajectory[0, 0]] += 1 132 | p_start_state = start_state_count/n_trajectories 133 | 134 | expected_svf = np.tile(p_start_state, (trajectory_length, 1)).T 135 | for t in range(1, trajectory_length): 136 | expected_svf[:, t] = 0 137 | for i, j, k in product(range(n_states), range(n_actions), range(n_states)): 138 | expected_svf[k, t] += (expected_svf[i, t-1] * 139 | policy[i, j] * # Stochastic policy 140 | transition_probability[i, j, k]) 141 | 142 | return expected_svf.sum(axis=1) 143 | 144 | def softmax(x1, x2): 145 | """ 146 | Soft-maximum calculation, from algorithm 9.2 in Ziebart's PhD thesis. 147 | 148 | x1: float. 149 | x2: float. 150 | -> softmax(x1, x2) 151 | """ 152 | 153 | max_x = max(x1, x2) 154 | min_x = min(x1, x2) 155 | return max_x + np.log(1 + np.exp(min_x - max_x)) 156 | 157 | def find_policy(n_states, r, n_actions, discount, 158 | transition_probability): 159 | """ 160 | Find a policy with linear value iteration. Based on the code accompanying 161 | the Levine et al. GPIRL paper and on Ziebart's PhD thesis (algorithm 9.1). 162 | 163 | n_states: Number of states N. int. 164 | r: Reward. NumPy array with shape (N,). 165 | n_actions: Number of actions A. int. 166 | discount: Discount factor of the MDP. float. 167 | transition_probability: NumPy array mapping (state_i, action, state_k) to 168 | the probability of transitioning from state_i to state_k under action. 169 | Shape (N, A, N). 170 | -> NumPy array of states and the probability of taking each action in that 171 | state, with shape (N, A). 172 | """ 173 | 174 | # V = value_iteration.value(n_states, transition_probability, r, discount) 175 | 176 | # NumPy's dot really dislikes using inf, so I'm making everything finite 177 | # using nan_to_num. 178 | V = np.nan_to_num(np.ones((n_states, 1)) * float("-inf")) 179 | 180 | diff = np.ones((n_states,)) 181 | while (diff > 1e-4).all(): # Iterate until convergence. 182 | new_V = r.copy() 183 | for j in range(n_actions): 184 | for i in range(n_states): 185 | new_V[i] = softmax(new_V[i], r[i] + discount* 186 | np.sum(transition_probability[i, j, k] * V[k] 187 | for k in range(n_states))) 188 | 189 | # # This seems to diverge, so we z-score it (engineering hack). 190 | new_V = (new_V - new_V.mean())/new_V.std() 191 | 192 | diff = abs(V - new_V) 193 | V = new_V 194 | 195 | # We really want Q, not V, so grab that using equation 9.2 from the thesis. 196 | Q = np.zeros((n_states, n_actions)) 197 | for i in range(n_states): 198 | for j in range(n_actions): 199 | p = np.array([transition_probability[i, j, k] 200 | for k in range(n_states)]) 201 | Q[i, j] = p.dot(r + discount*V) 202 | 203 | # Softmax by row to interpret these values as probabilities. 204 | Q -= Q.max(axis=1).reshape((n_states, 1)) # For numerical stability. 
205 | Q = np.exp(Q)/np.exp(Q).sum(axis=1).reshape((n_states, 1)) 206 | return Q 207 | 208 | def expected_value_difference(n_states, n_actions, transition_probability, 209 | reward, discount, p_start_state, optimal_value, true_reward): 210 | """ 211 | Calculate the expected value difference, which is a proxy to how good a 212 | recovered reward function is. 213 | 214 | n_states: Number of states. int. 215 | n_actions: Number of actions. int. 216 | transition_probability: NumPy array mapping (state_i, action, state_k) to 217 | the probability of transitioning from state_i to state_k under action. 218 | Shape (N, A, N). 219 | reward: Reward vector mapping state int to reward. Shape (N,). 220 | discount: Discount factor. float. 221 | p_start_state: Probability vector with the ith component as the probability 222 | that the ith state is the start state. Shape (N,). 223 | optimal_value: Value vector for the ground reward with optimal policy. 224 | The ith component is the value of the ith state. Shape (N,). 225 | true_reward: True reward vector. Shape (N,). 226 | -> Expected value difference. float. 227 | """ 228 | 229 | policy = value_iteration.find_policy(n_states, n_actions, 230 | transition_probability, reward, discount) 231 | value = value_iteration.value(policy.argmax(axis=1), n_states, 232 | transition_probability, true_reward, discount) 233 | 234 | evd = optimal_value.dot(p_start_state) - value.dot(p_start_state) 235 | return evd 236 | -------------------------------------------------------------------------------- /irl/mdp/gridworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements the gridworld MDP. 3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | import numpy as np 9 | import numpy.random as rn 10 | 11 | class Gridworld(object): 12 | """ 13 | Gridworld MDP. 14 | """ 15 | 16 | def __init__(self, grid_size, wind, discount): 17 | """ 18 | grid_size: Grid size. int. 19 | wind: Chance of moving randomly. float. 20 | discount: MDP discount. float. 21 | -> Gridworld 22 | """ 23 | 24 | self.actions = ((1, 0), (0, 1), (-1, 0), (0, -1)) 25 | self.n_actions = len(self.actions) 26 | self.n_states = grid_size**2 27 | self.grid_size = grid_size 28 | self.wind = wind 29 | self.discount = discount 30 | 31 | # Preconstruct the transition probability array. 32 | self.transition_probability = np.array( 33 | [[[self._transition_probability(i, j, k) 34 | for k in range(self.n_states)] 35 | for j in range(self.n_actions)] 36 | for i in range(self.n_states)]) 37 | 38 | def __str__(self): 39 | return "Gridworld({}, {}, {})".format(self.grid_size, self.wind, 40 | self.discount) 41 | 42 | def feature_vector(self, i, feature_map="ident"): 43 | """ 44 | Get the feature vector associated with a state integer. 45 | 46 | i: State int. 47 | feature_map: Which feature map to use (default ident). String in {ident, 48 | coord, proxi}. 49 | -> Feature vector. 50 | """ 51 | 52 | if feature_map == "coord": 53 | f = np.zeros(self.grid_size) 54 | x, y = i % self.grid_size, i // self.grid_size 55 | f[x] += 1 56 | f[y] += 1 57 | return f 58 | if feature_map == "proxi": 59 | f = np.zeros(self.n_states) 60 | x, y = i % self.grid_size, i // self.grid_size 61 | for b in range(self.grid_size): 62 | for a in range(self.grid_size): 63 | dist = abs(x - a) + abs(y - b) 64 | f[self.point_to_int((a, b))] = dist 65 | return f 66 | # Assume identity map. 
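# (A one-hot indicator over states: component i is 1 and the rest are 0.)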
67 | f = np.zeros(self.n_states) 68 | f[i] = 1 69 | return f 70 | 71 | def feature_matrix(self, feature_map="ident"): 72 | """ 73 | Get the feature matrix for this gridworld. 74 | 75 | feature_map: Which feature map to use (default ident). String in {ident, 76 | coord, proxi}. 77 | -> NumPy array with shape (n_states, d_states). 78 | """ 79 | 80 | features = [] 81 | for n in range(self.n_states): 82 | f = self.feature_vector(n, feature_map) 83 | features.append(f) 84 | return np.array(features) 85 | 86 | def int_to_point(self, i): 87 | """ 88 | Convert a state int into the corresponding coordinate. 89 | 90 | i: State int. 91 | -> (x, y) int tuple. 92 | """ 93 | 94 | return (i % self.grid_size, i // self.grid_size) 95 | 96 | def point_to_int(self, p): 97 | """ 98 | Convert a coordinate into the corresponding state int. 99 | 100 | p: (x, y) tuple. 101 | -> State int. 102 | """ 103 | 104 | return p[0] + p[1]*self.grid_size 105 | 106 | def neighbouring(self, i, k): 107 | """ 108 | Get whether two points neighbour each other. Also returns true if they 109 | are the same point. 110 | 111 | i: (x, y) int tuple. 112 | k: (x, y) int tuple. 113 | -> bool. 114 | """ 115 | 116 | return abs(i[0] - k[0]) + abs(i[1] - k[1]) <= 1 117 | 118 | def _transition_probability(self, i, j, k): 119 | """ 120 | Get the probability of transitioning from state i to state k given 121 | action j. 122 | 123 | i: State int. 124 | j: Action int. 125 | k: State int. 126 | -> p(s_k | s_i, a_j) 127 | """ 128 | 129 | xi, yi = self.int_to_point(i) 130 | xj, yj = self.actions[j] 131 | xk, yk = self.int_to_point(k) 132 | 133 | if not self.neighbouring((xi, yi), (xk, yk)): 134 | return 0.0 135 | 136 | # Is k the intended state to move to? 137 | if (xi + xj, yi + yj) == (xk, yk): 138 | return 1 - self.wind + self.wind/self.n_actions 139 | 140 | # If these are not the same point, then we can move there by wind. 141 | if (xi, yi) != (xk, yk): 142 | return self.wind/self.n_actions 143 | 144 | # If these are the same point, we can only move here by either moving 145 | # off the grid or being blown off the grid. Are we on a corner or not? 146 | if (xi, yi) in {(0, 0), (self.grid_size-1, self.grid_size-1), 147 | (0, self.grid_size-1), (self.grid_size-1, 0)}: 148 | # Corner. 149 | # Can move off the edge in two directions. 150 | # Did we intend to move off the grid? 151 | if not (0 <= xi + xj < self.grid_size and 152 | 0 <= yi + yj < self.grid_size): 153 | # We intended to move off the grid, so we have the regular 154 | # success chance of staying here plus an extra chance of blowing 155 | # onto the *other* off-grid square. 156 | return 1 - self.wind + 2*self.wind/self.n_actions 157 | else: 158 | # We can blow off the grid in either direction only by wind. 159 | return 2*self.wind/self.n_actions 160 | else: 161 | # Not a corner. Is it an edge? 162 | if (xi not in {0, self.grid_size-1} and 163 | yi not in {0, self.grid_size-1}): 164 | # Not an edge. 165 | return 0.0 166 | 167 | # Edge. 168 | # Can only move off the edge in one direction. 169 | # Did we intend to move off the grid? 170 | if not (0 <= xi + xj < self.grid_size and 171 | 0 <= yi + yj < self.grid_size): 172 | # We intended to move off the grid, so we have the regular 173 | # success chance of staying here. 174 | return 1 - self.wind + self.wind/self.n_actions 175 | else: 176 | # We can blow off the grid only by wind. 177 | return self.wind/self.n_actions 178 | 179 | def reward(self, state_int): 180 | """ 181 | Reward for being in state state_int. 
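The reward is 1 in the goal state, the corner cell (grid_size - 1, grid_size - 1) (state int n_states - 1), and 0 everywhere else.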
182 | 183 | state_int: State integer. int. 184 | -> Reward. 185 | """ 186 | 187 | if state_int == self.n_states - 1: 188 | return 1 189 | return 0 190 | 191 | def average_reward(self, n_trajectories, trajectory_length, policy): 192 | """ 193 | Calculate the average total reward obtained by following a given policy 194 | over n_paths paths. 195 | 196 | policy: Map from state integers to action integers. 197 | n_trajectories: Number of trajectories. int. 198 | trajectory_length: Length of an episode. int. 199 | -> Average reward, standard deviation. 200 | """ 201 | 202 | trajectories = self.generate_trajectories(n_trajectories, 203 | trajectory_length, policy) 204 | rewards = [[r for _, _, r in trajectory] for trajectory in trajectories] 205 | rewards = np.array(rewards) 206 | 207 | # Add up all the rewards to find the total reward. 208 | total_reward = rewards.sum(axis=1) 209 | 210 | # Return the average reward and standard deviation. 211 | return total_reward.mean(), total_reward.std() 212 | 213 | def optimal_policy(self, state_int): 214 | """ 215 | The optimal policy for this gridworld. 216 | 217 | state_int: What state we are in. int. 218 | -> Action int. 219 | """ 220 | 221 | sx, sy = self.int_to_point(state_int) 222 | 223 | if sx < self.grid_size and sy < self.grid_size: 224 | return rn.randint(0, 2) 225 | if sx < self.grid_size-1: 226 | return 0 227 | if sy < self.grid_size-1: 228 | return 1 229 | raise ValueError("Unexpected state.") 230 | 231 | def optimal_policy_deterministic(self, state_int): 232 | """ 233 | Deterministic version of the optimal policy for this gridworld. 234 | 235 | state_int: What state we are in. int. 236 | -> Action int. 237 | """ 238 | 239 | sx, sy = self.int_to_point(state_int) 240 | if sx < sy: 241 | return 0 242 | return 1 243 | 244 | def generate_trajectories(self, n_trajectories, trajectory_length, policy, 245 | random_start=False): 246 | """ 247 | Generate n_trajectories trajectories with length trajectory_length, 248 | following the given policy. 249 | 250 | n_trajectories: Number of trajectories. int. 251 | trajectory_length: Length of an episode. int. 252 | policy: Map from state integers to action integers. 253 | random_start: Whether to start randomly (default False). bool. 254 | -> [[(state int, action int, reward float)]] 255 | """ 256 | 257 | trajectories = [] 258 | for _ in range(n_trajectories): 259 | if random_start: 260 | sx, sy = rn.randint(self.grid_size), rn.randint(self.grid_size) 261 | else: 262 | sx, sy = 0, 0 263 | 264 | trajectory = [] 265 | for _ in range(trajectory_length): 266 | if rn.random() < self.wind: 267 | action = self.actions[rn.randint(0, 4)] 268 | else: 269 | # Follow the given policy. 
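# (Combined with the random branch above, the intended action is taken with
# probability 1 - wind + wind/4, which matches _transition_probability.)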
270 | action = self.actions[policy(self.point_to_int((sx, sy)))] 271 | 272 | if (0 <= sx + action[0] < self.grid_size and 273 | 0 <= sy + action[1] < self.grid_size): 274 | next_sx = sx + action[0] 275 | next_sy = sy + action[1] 276 | else: 277 | next_sx = sx 278 | next_sy = sy 279 | 280 | state_int = self.point_to_int((sx, sy)) 281 | action_int = self.actions.index(action) 282 | next_state_int = self.point_to_int((next_sx, next_sy)) 283 | reward = self.reward(next_state_int) 284 | trajectory.append((state_int, action_int, reward)) 285 | 286 | sx = next_sx 287 | sy = next_sy 288 | 289 | trajectories.append(trajectory) 290 | 291 | return np.array(trajectories) 292 | -------------------------------------------------------------------------------- /irl/deep_maxent.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements deep maximum entropy inverse reinforcement learning based on 3 | Ziebart et al., 2008 and Wulfmeier et al., 2015, using symbolic methods with 4 | Theano. 5 | 6 | Matthew Alger, 2015 7 | matthew.alger@anu.edu.au 8 | """ 9 | 10 | from itertools import product 11 | 12 | import numpy as np 13 | import numpy.random as rn 14 | import theano as th 15 | import theano.tensor as T 16 | 17 | from . import maxent 18 | 19 | FLOAT = th.config.floatX 20 | 21 | def find_svf(n_states, trajectories): 22 | """ 23 | Find the state vistiation frequency from trajectories. 24 | 25 | n_states: Number of states. int. 26 | trajectories: 3D array of state/action pairs. States are ints, actions 27 | are ints. NumPy array with shape (T, L, 2) where T is the number of 28 | trajectories and L is the trajectory length. 29 | -> State visitation frequencies vector with shape (N,). 30 | """ 31 | 32 | svf = np.zeros(n_states) 33 | 34 | for trajectory in trajectories: 35 | for state, _, _ in trajectory: 36 | svf[state] += 1 37 | 38 | svf /= trajectories.shape[0] 39 | 40 | return th.shared(svf, "svf", borrow=True) 41 | 42 | def optimal_value(n_states, n_actions, transition_probabilities, reward, 43 | discount, threshold=1e-2): 44 | """ 45 | Find the optimal value function. 46 | 47 | n_states: Number of states. int. 48 | n_actions: Number of actions. int. 49 | transition_probabilities: Function taking (state, action, state) to 50 | transition probabilities. 51 | reward: Vector of rewards for each state. 52 | discount: MDP discount factor. float. 53 | threshold: Convergence threshold, default 1e-2. float. 54 | -> Array of values for each state 55 | """ 56 | 57 | v = T.zeros(n_states, dtype=FLOAT) 58 | 59 | def update(s, prev_diff, v, reward, tps): 60 | max_v = float("-inf") 61 | v_template = T.zeros_like(v) 62 | for a in range(n_actions): 63 | tp = tps[s, a, :] 64 | max_v = T.largest(max_v, T.dot(tp, reward + discount*v)) 65 | new_diff = abs(v[s] - max_v) 66 | if T.lt(prev_diff, new_diff): 67 | diff = new_diff 68 | else: 69 | diff = prev_diff 70 | return (diff, T.set_subtensor(v_template[s], max_v)), {} 71 | 72 | def until_converged(diff, v): 73 | (diff, vs), _ = th.scan( 74 | fn=update, 75 | outputs_info=[{"initial": diff, "taps": [-1]}, 76 | None], 77 | sequences=[T.arange(n_states)], 78 | non_sequences=[v, reward, transition_probabilities]) 79 | return ((diff[-1], vs.sum(axis=0)), {}, 80 | th.scan_module.until(diff[-1] < threshold)) 81 | 82 | (_, vs), _ = th.scan(fn = until_converged, 83 | outputs_info=[ 84 | # Need to force an inf into the right Theano 85 | # data type and this seems to be the only way that 86 | # works. 
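                                     # The +inf initial "previous difference" guarantees the
                                     # first sweep runs; the outer scan stops once the largest
                                     # per-state change falls below threshold, or after 1000 sweeps.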
87 | {"initial": getattr(np, FLOAT)(float("inf")), 88 | "taps": [-1]}, 89 | {"initial": v, 90 | "taps": [-1]}], 91 | n_steps=1000) 92 | 93 | return vs[-1] 94 | 95 | def find_policy(n_states, n_actions, transition_probabilities, reward, discount, 96 | threshold=1e-2, v=None): 97 | """ 98 | Find the optimal policy. 99 | 100 | n_states: Number of states. int. 101 | n_actions: Number of actions. int. 102 | transition_probabilities: Function taking (state, action, state) to 103 | transition probabilities. 104 | reward: Vector of rewards for each state. 105 | discount: MDP discount factor. float. 106 | threshold: Convergence threshold, default 1e-2. float. 107 | v: Optimal value array (if known). Default None. 108 | -> Action probabilities for each state. 109 | """ 110 | 111 | if v is None: 112 | v = optimal_value(n_states, n_actions, transition_probabilities, reward, 113 | discount, threshold) 114 | 115 | # Get Q using equation 9.2 from Ziebart's thesis. 116 | Q = T.zeros((n_states, n_actions)) 117 | def make_Q(i, j, tps, Q, reward, v): 118 | Q_template = T.zeros_like(Q) 119 | tp = transition_probabilities[i, j, :] 120 | return T.set_subtensor(Q_template[i, j], tp.dot(reward + discount*v)),{} 121 | 122 | prod = np.array(list(product(range(n_states), range(n_actions)))) 123 | state_range = th.shared(prod[:, 0]) 124 | action_range = th.shared(prod[:, 1]) 125 | Qs, _ = th.scan(fn=make_Q, 126 | outputs_info=None, 127 | sequences=[state_range, action_range], 128 | non_sequences=[transition_probabilities, Q, reward, v]) 129 | Q = Qs.sum(axis=0) 130 | Q -= Q.max(axis=1).reshape((n_states, 1)) # For numerical stability. 131 | Q = T.exp(Q)/T.exp(Q).sum(axis=1).reshape((n_states, 1)) 132 | return Q 133 | 134 | def find_expected_svf(n_states, r, n_actions, discount, 135 | transition_probability, trajectories): 136 | """ 137 | Find the expected state visitation frequencies using algorithm 1 from 138 | Ziebart et al. 2008. 139 | 140 | n_states: Number of states N. int. 141 | alpha: Reward. NumPy array with shape (N,). 142 | n_actions: Number of actions A. int. 143 | discount: Discount factor of the MDP. float. 144 | transition_probability: NumPy array mapping (state_i, action, state_k) to 145 | the probability of transitioning from state_i to state_k under action. 146 | Shape (N, A, N). 147 | trajectories: 3D array of state/action pairs. States are ints, actions 148 | are ints. NumPy array with shape (T, L, 2) where T is the number of 149 | trajectories and L is the trajectory length. 150 | -> Expected state visitation frequencies vector with shape (N,). 151 | """ 152 | 153 | n_trajectories = trajectories.shape[0] 154 | trajectory_length = trajectories.shape[1] 155 | 156 | policy = find_policy(n_states, n_actions, 157 | transition_probability, r, discount) 158 | 159 | start_state_count = T.extra_ops.bincount(trajectories[:, 0, 0], 160 | minlength=n_states) 161 | p_start_state = start_state_count.astype(FLOAT)/n_trajectories 162 | 163 | def state_visitation_step(i, j, prev_svf, policy, tps): 164 | """ 165 | The sum of the outputs of a scan over this will be a row of the svf. 
166 | """ 167 | 168 | svf = prev_svf[i] * policy[i, j] * tps[i, j, :] 169 | return svf, {} 170 | 171 | prod = np.array(list(product(range(n_states), range(n_actions)))) 172 | state_range = th.shared(prod[:, 0]) 173 | action_range = th.shared(prod[:, 1]) 174 | def state_visitation_row(prev_svf, policy, tps, state_range, action_range): 175 | svf_t, _ = th.scan(fn=state_visitation_step, 176 | sequences=[state_range, action_range], 177 | non_sequences=[prev_svf, policy, tps]) 178 | svf_t = svf_t.sum(axis=0) 179 | return svf_t, {} 180 | 181 | svf, _ = th.scan(fn=state_visitation_row, 182 | outputs_info=[{"initial": p_start_state, "taps": [-1]}], 183 | n_steps=trajectories.shape[1]-1, 184 | non_sequences=[policy, transition_probability, state_range, 185 | action_range]) 186 | 187 | return svf.sum(axis=0) + p_start_state 188 | 189 | def irl(structure, feature_matrix, n_actions, discount, transition_probability, 190 | trajectories, epochs, learning_rate, initialisation="normal", l1=0.1, 191 | l2=0.1): 192 | """ 193 | Find the reward function for the given trajectories. 194 | 195 | structure: Neural network structure tuple, e.g. (10, 3, 3) would be a 196 | 3-layer neural network with 10 inputs. 197 | feature_matrix: Matrix with the nth row representing the nth state. NumPy 198 | array with shape (N, D) where N is the number of states and D is the 199 | dimensionality of the state. 200 | n_actions: Number of actions A. int. 201 | discount: Discount factor of the MDP. float. 202 | transition_probability: NumPy array mapping (state_i, action, state_k) to 203 | the probability of transitioning from state_i to state_k under action. 204 | Shape (N, A, N). 205 | trajectories: 3D array of state/action pairs. States are ints, actions 206 | are ints. NumPy array with shape (T, L, 2) where T is the number of 207 | trajectories and L is the trajectory length. 208 | epochs: Number of gradient descent steps. int. 209 | learning_rate: Gradient descent learning rate. float. 210 | initialisation: What distribution to use. str in {normal, uniform}. Default 211 | normal. 212 | l1: L1 regularisation. Default 0.1. float. 213 | l2: L2 regularisation. Default 0.1. float. 214 | -> Reward vector with shape (N,). 215 | """ 216 | 217 | n_states, d_states = feature_matrix.shape 218 | transition_probability = th.shared(transition_probability, borrow=True) 219 | trajectories = th.shared(trajectories, borrow=True) 220 | 221 | # Initialise W matrices; b biases. 222 | n_layers = len(structure)-1 223 | weights = [] 224 | hist_w_grads = [] # For AdaGrad. 225 | biases = [] 226 | hist_b_grads = [] # For AdaGrad. 227 | for i in range(n_layers): 228 | # W 229 | shape = (structure[i+1], structure[i]) 230 | if initialisation == "normal": 231 | matrix = th.shared(rn.normal(size=shape), name="W", borrow=True) 232 | else: 233 | matrix = th.shared(rn.uniform(size=shape), name="W", borrow=True) 234 | weights.append(matrix) 235 | hist_w_grads.append(th.shared(np.zeros(shape), name="hdW", borrow=True)) 236 | 237 | # b 238 | shape = (structure[i+1], 1) 239 | if initialisation == "normal": 240 | matrix = th.shared(rn.normal(size=shape), name="b", borrow=True) 241 | else: 242 | matrix = th.shared(rn.uniform(size=shape), name="b", borrow=True) 243 | biases.append(matrix) 244 | hist_b_grads.append(th.shared(np.zeros(shape), name="hdb", borrow=True)) 245 | 246 | # Initialise α weight, β bias. 
247 | if initialisation == "normal": 248 | α = th.shared(rn.normal(size=(1, structure[-1])), name="alpha", 249 | borrow=True) 250 | else: 251 | α = th.shared(rn.uniform(size=(1, structure[-1])), name="alpha", 252 | borrow=True) 253 | hist_α_grad = T.zeros(α.shape) # For AdaGrad. 254 | 255 | adagrad_epsilon = 1e-6 # AdaGrad numerical stability. 256 | 257 | #### Theano symbolic setup. #### 258 | 259 | # Symbolic input. 260 | s_feature_matrix = T.matrix("x") 261 | # Feature matrices. 262 | # All dimensions of the form (d_layer, n_states). 263 | φs = [s_feature_matrix.T] 264 | # Forward propagation. 265 | for W, b in zip(weights, biases): 266 | φ = T.nnet.sigmoid(th.compile.ops.Rebroadcast((0, False), (1, True))(b) 267 | + W.dot(φs[-1])) 268 | φs.append(φ) 269 | # φs[1] = φ1 etc. 270 | # Reward. 271 | r = α.dot(φs[-1]).reshape((n_states,)) 272 | # Engineering hack: z-score the reward. 273 | r = (r - r.mean())/r.std() 274 | # Associated feature expectations. 275 | expected_svf = find_expected_svf(n_states, r, 276 | n_actions, discount, 277 | transition_probability, 278 | trajectories) 279 | svf = maxent.find_svf(n_states, trajectories.get_value()) 280 | # Derivatives (backward propagation). 281 | updates = [] 282 | α_grad = φs[-1].dot(svf - expected_svf).T 283 | hist_α_grad += α_grad**2 284 | adj_α_grad = α_grad/(adagrad_epsilon + T.sqrt(hist_α_grad)) 285 | updates.append((α, α + adj_α_grad*learning_rate)) 286 | 287 | def grad_for_state(s, theta, svf_diff, r): 288 | """ 289 | Calculate the gradient with respect to theta for one state. 290 | """ 291 | 292 | regularisation = abs(theta).sum()*l1 + (theta**2).sum()*l2 293 | return svf_diff[s] * T.grad(r[s], theta) - regularisation, {} 294 | 295 | for i, W in enumerate(weights): 296 | w_grads, _ = th.scan(fn=grad_for_state, 297 | sequences=[T.arange(n_states)], 298 | non_sequences=[W, svf - expected_svf, r]) 299 | w_grad = w_grads.sum(axis=0) 300 | hist_w_grads[i] += w_grad**2 301 | adj_w_grad = w_grad/(adagrad_epsilon + T.sqrt(hist_w_grads[i])) 302 | updates.append((W, W + adj_w_grad*learning_rate)) 303 | for i, b in enumerate(biases): 304 | b_grads, _ = th.scan(fn=grad_for_state, 305 | sequences=[T.arange(n_states)], 306 | non_sequences=[b, svf - expected_svf, r]) 307 | b_grad = b_grads.sum(axis=0) 308 | hist_b_grads[i] += b_grad**2 309 | adj_b_grad = b_grad/(adagrad_epsilon + T.sqrt(hist_b_grads[i])) 310 | updates.append((b, b + adj_b_grad*learning_rate)) 311 | 312 | train = th.function([s_feature_matrix], updates=updates, outputs=r) 313 | run = th.function([s_feature_matrix], outputs=r) 314 | 315 | for e in range(epochs): 316 | reward = train(feature_matrix) 317 | 318 | return reward.reshape((n_states,)) 319 | -------------------------------------------------------------------------------- /examples/experiments.py: -------------------------------------------------------------------------------- 1 | """ 2 | Perform the experiments from the report. 3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | from time import time 9 | from sys import stdout 10 | 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | 14 | from irl import maxent 15 | from irl import deep_maxent 16 | from irl import value_iteration 17 | from irl.mdp.gridworld import Gridworld 18 | from irl.mdp.objectworld import Objectworld 19 | 20 | def test_gw_once(grid_size, feature_map, n_samples, epochs, structure): 21 | """ 22 | Test MaxEnt and DeepMaxEnt on a gw of size grid_size with the feature 23 | map feature_map with n_samples paths. 
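    Both methods are trained on the same sampled expert trajectories and
    compared via expected value difference (EVD) under the ground-truth reward.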
24 | 
25 |     grid_size: Grid size. int.
26 |     feature_map: Which feature map to use. String in {ident, coord, proxi}.
27 |     n_samples: Number of paths to sample.
28 |     epochs: Number of epochs to run MaxEnt with.
29 |     structure: Neural network structure tuple, e.g. (3, 3) would be a
30 |         3-layer neural network with assumed inputs.
31 |     -> Expected value difference for MaxEnt, DeepMaxEnt
32 |     """
33 | 
34 |     # Basic gist of what we're doing here: Get the reward function using our
35 |     # different IRL methods, use those to get a policy, evaluate that policy
36 |     # using the true reward, and then return the difference in expected values.
37 | 
38 |     # Setup parameters.
39 |     wind = 0.3
40 |     discount = 0.9
41 |     learning_rate = 0.01
42 |     trajectory_length = 3*grid_size
43 | 
44 |     # Make the gridworld and associated data.
45 |     gw = Gridworld(grid_size, wind, discount)
46 |     feature_matrix = gw.feature_matrix(feature_map)
47 |     ground_reward = np.array([gw.reward(i) for i in range(gw.n_states)])
48 |     optimal_policy = value_iteration.find_policy(gw.n_states,
49 |                                                  gw.n_actions,
50 |                                                  gw.transition_probability,
51 |                                                  ground_reward,
52 |                                                  discount).argmax(axis=1)
53 |     trajectories = gw.generate_trajectories(n_samples,
54 |                                             trajectory_length,
55 |                                             optimal_policy.take)
56 |     p_start_state = (np.bincount(trajectories[:, 0, 0], minlength=gw.n_states) /
57 |                      trajectories.shape[0])
58 | 
59 |     # True value.
60 |     optimal_V = value_iteration.optimal_value(gw.n_states,
61 |                                               gw.n_actions,
62 |                                               gw.transition_probability,
63 |                                               ground_reward, gw.discount)
64 | 
65 |     # MaxEnt reward; policy; value.
66 |     maxent_reward = deep_maxent.irl((feature_matrix.shape[1],),
67 |                                     feature_matrix,
68 |                                     gw.n_actions,
69 |                                     gw.discount,
70 |                                     gw.transition_probability,
71 |                                     trajectories, epochs, learning_rate)
72 | 
73 |     maxent_policy = value_iteration.find_policy(gw.n_states,
74 |                                                 gw.n_actions,
75 |                                                 gw.transition_probability,
76 |                                                 maxent_reward,
77 |                                                 discount).argmax(axis=1)
78 |     maxent_V = value_iteration.value(maxent_policy,
79 |                                      gw.n_states,
80 |                                      gw.transition_probability,
81 |                                      ground_reward,
82 |                                      gw.discount)
83 |     maxent_EVD = optimal_V.dot(p_start_state) - maxent_V.dot(p_start_state)
84 | 
85 |     # DeepMaxEnt reward; policy; value.
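    # Same IRL routine, but with the hidden-layer sizes from `structure`
    # appended to the input dimension.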
86 | deep_maxent_reward = deep_maxent.irl((feature_matrix.shape[1],)+structure, 87 | feature_matrix, 88 | gw.n_actions, 89 | gw.discount, 90 | gw.transition_probability, 91 | trajectories, epochs, learning_rate) 92 | deep_maxent_policy = value_iteration.find_policy(gw.n_states, 93 | gw.n_actions, 94 | gw.transition_probability, 95 | deep_maxent_reward, 96 | discount).argmax(axis=1) 97 | deep_maxent_V = value_iteration.value(deep_maxent_policy, 98 | gw.n_states, 99 | gw.transition_probability, 100 | ground_reward, 101 | gw.discount) 102 | deep_maxent_EVD = (optimal_V.dot(p_start_state) - 103 | deep_maxent_V.dot(p_start_state)) 104 | 105 | plt.subplot(3, 3, 1) 106 | plt.pcolor(ground_reward.reshape((grid_size, grid_size))) 107 | plt.title("Groundtruth reward") 108 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 109 | bottom=False, top=False, left=False, right=False, 110 | labelright=False) 111 | plt.subplot(3, 3, 2) 112 | plt.pcolor(maxent_reward.reshape((grid_size, grid_size))) 113 | plt.title("MaxEnt reward") 114 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 115 | bottom=False, top=False, left=False, right=False, 116 | labelright=False) 117 | plt.subplot(3, 3, 3) 118 | plt.pcolor(deep_maxent_reward.reshape((grid_size, grid_size))) 119 | plt.title("DeepMaxEnt reward") 120 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 121 | bottom=False, top=False, left=False, right=False, 122 | labelright=False) 123 | 124 | plt.subplot(3, 3, 4) 125 | plt.pcolor(optimal_policy.reshape((grid_size, grid_size)), vmin=0, vmax=3) 126 | plt.title("Optimal policy") 127 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 128 | bottom=False, top=False, left=False, right=False, 129 | labelright=False) 130 | plt.subplot(3, 3, 5) 131 | plt.pcolor(maxent_policy.reshape((grid_size, grid_size)), vmin=0, vmax=3) 132 | plt.title("MaxEnt policy") 133 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 134 | bottom=False, top=False, left=False, right=False, 135 | labelright=False) 136 | plt.subplot(3, 3, 6) 137 | plt.pcolor(deep_maxent_policy.reshape((grid_size, grid_size)), 138 | vmin=0, vmax=3) 139 | plt.title("DeepMaxEnt policy") 140 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 141 | bottom=False, top=False, left=False, right=False, 142 | labelright=False) 143 | 144 | plt.subplot(3, 3, 7) 145 | plt.pcolor(optimal_V.reshape((grid_size, grid_size))) 146 | plt.title("Optimal value") 147 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 148 | bottom=False, top=False, left=False, right=False, 149 | labelright=False) 150 | plt.subplot(3, 3, 8) 151 | plt.pcolor(maxent_V.reshape((grid_size, grid_size))) 152 | plt.title("MaxEnt value") 153 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 154 | bottom=False, top=False, left=False, right=False, 155 | labelright=False) 156 | plt.subplot(3, 3, 9) 157 | plt.pcolor(deep_maxent_V.reshape((grid_size, grid_size))) 158 | plt.title("DeepMaxEnt value") 159 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 160 | bottom=False, top=False, left=False, right=False, 161 | labelright=False) 162 | plt.savefig("{}_{}_{}_{}gridworld{}.png".format(grid_size, feature_map, 163 | n_samples, epochs, structure, np.random.randint(10000000))) 164 | 165 | 166 | return maxent_EVD, deep_maxent_EVD 167 | 168 | def test_ow_once(grid_size, n_objects, n_colours, discrete, l1, l2, n_samples, 169 | epochs, structure): 170 | """ 171 
| Test MaxEnt and DeepMaxEnt on a ow of size grid_size with the feature 172 | map feature_map with n_samples paths. 173 | 174 | grid_size: Grid size. int. 175 | n_objects: Number of objects. int. 176 | n_colours: Number of colours. int. 177 | discrete: Whether the features should be discrete. bool. 178 | l1: L1 regularisation. float. 179 | l2: L2 regularisation. float. 180 | n_samples: Number of paths to sample. 181 | epochs: Number of epochs to run MaxEnt with. 182 | structure: Neural network structure tuple, e.g. (3, 3) would be a 183 | 3-layer neural network with assumed inputs. 184 | -> Expected value difference for MaxEnt, DeepMaxEnt 185 | """ 186 | 187 | # Basic gist of what we're doing here: Get the reward function using our 188 | # different IRL methods, use those to get a policy, evaluate that policy 189 | # using the true reward, and then return the difference in expected values. 190 | 191 | # Setup parameters. 192 | wind = 0.3 193 | discount = 0.9 194 | learning_rate = 0.01 195 | trajectory_length = 3*grid_size 196 | 197 | # Make the objectworld and associated data. 198 | ow = Objectworld(grid_size, n_objects, n_colours, wind, discount) 199 | feature_matrix = ow.feature_matrix(discrete) 200 | ground_reward = np.array([ow.reward(i) for i in range(ow.n_states)]) 201 | optimal_policy = value_iteration.find_policy(ow.n_states, 202 | ow.n_actions, 203 | ow.transition_probability, 204 | ground_reward, 205 | discount).argmax(axis=1) 206 | trajectories = ow.generate_trajectories(n_samples, 207 | trajectory_length, 208 | optimal_policy.take) 209 | p_start_state = (np.bincount(trajectories[:, 0, 0], minlength=ow.n_states) / 210 | trajectories.shape[0]) 211 | 212 | # True value. 213 | optimal_V = value_iteration.optimal_value(ow.n_states, 214 | ow.n_actions, 215 | ow.transition_probability, 216 | ground_reward, ow.discount) 217 | 218 | # MaxEnt reward; policy; value. 219 | maxent_reward = deep_maxent.irl((feature_matrix.shape[1],), 220 | feature_matrix, 221 | ow.n_actions, 222 | ow.discount, 223 | ow.transition_probability, 224 | trajectories, epochs, learning_rate, 225 | l1=l1, l2=l2) 226 | 227 | maxent_policy = value_iteration.find_policy(ow.n_states, 228 | ow.n_actions, 229 | ow.transition_probability, 230 | maxent_reward, 231 | discount).argmax(axis=1) 232 | maxent_V = value_iteration.value(maxent_policy, 233 | ow.n_states, 234 | ow.transition_probability, 235 | ground_reward, 236 | ow.discount) 237 | maxent_EVD = optimal_V.dot(p_start_state) - maxent_V.dot(p_start_state) 238 | 239 | # DeepMaxEnt reward; policy; value. 240 | deep_learning_rate = 0.005 # For the 32 x 32 experiments. 
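    # This smaller step size is used for the DeepMaxEnt run below at every
    # grid size, not only 32 x 32.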
241 | deep_maxent_reward = deep_maxent.irl((feature_matrix.shape[1],)+structure, 242 | feature_matrix, 243 | ow.n_actions, 244 | ow.discount, 245 | ow.transition_probability, 246 | trajectories, epochs, 247 | deep_learning_rate, 248 | l1=l1, l2=l2) 249 | 250 | deep_maxent_policy = value_iteration.find_policy(ow.n_states, 251 | ow.n_actions, 252 | ow.transition_probability, 253 | deep_maxent_reward, 254 | discount).argmax(axis=1) 255 | deep_maxent_V = value_iteration.value(deep_maxent_policy, 256 | ow.n_states, 257 | ow.transition_probability, 258 | ground_reward, 259 | ow.discount) 260 | 261 | deep_maxent_EVD = (optimal_V.dot(p_start_state) - 262 | deep_maxent_V.dot(p_start_state)) 263 | 264 | plt.subplot(3, 3, 1) 265 | plt.pcolor(ground_reward.reshape((grid_size, grid_size))) 266 | plt.title("Groundtruth reward") 267 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 268 | bottom=False, top=False, left=False, right=False, labelright=False) 269 | plt.subplot(3, 3, 2) 270 | plt.pcolor(maxent_reward.reshape((grid_size, grid_size))) 271 | plt.title("MaxEnt reward") 272 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 273 | bottom=False, top=False, left=False, right=False, labelright=False) 274 | plt.subplot(3, 3, 3) 275 | plt.pcolor(deep_maxent_reward.reshape((grid_size, grid_size))) 276 | plt.title("DeepMaxEnt reward") 277 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 278 | bottom=False, top=False, left=False, right=False, labelright=False) 279 | 280 | plt.subplot(3, 3, 4) 281 | plt.pcolor(optimal_policy.reshape((grid_size, grid_size)), vmin=0, vmax=3) 282 | plt.title("Optimal policy") 283 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 284 | bottom=False, top=False, left=False, right=False, labelright=False) 285 | plt.subplot(3, 3, 5) 286 | plt.pcolor(maxent_policy.reshape((grid_size, grid_size)), vmin=0, vmax=3) 287 | plt.title("MaxEnt policy") 288 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 289 | bottom=False, top=False, left=False, right=False, labelright=False) 290 | plt.subplot(3, 3, 6) 291 | plt.pcolor(deep_maxent_policy.reshape((grid_size, grid_size)), 292 | vmin=0, vmax=3) 293 | plt.title("DeepMaxEnt policy") 294 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 295 | bottom=False, top=False, left=False, right=False, labelright=False) 296 | 297 | plt.subplot(3, 3, 7) 298 | plt.pcolor(optimal_V.reshape((grid_size, grid_size))) 299 | plt.title("Optimal value") 300 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 301 | bottom=False, top=False, left=False, right=False, labelright=False) 302 | plt.subplot(3, 3, 8) 303 | plt.pcolor(maxent_V.reshape((grid_size, grid_size))) 304 | plt.title("MaxEnt value") 305 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 306 | bottom=False, top=False, left=False, right=False, labelright=False) 307 | plt.subplot(3, 3, 9) 308 | plt.pcolor(deep_maxent_V.reshape((grid_size, grid_size))) 309 | plt.title("DeepMaxEnt value") 310 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 311 | bottom=False, top=False, left=False, right=False, labelright=False) 312 | plt.savefig("{}_{}_{}_{}_{}_{}_{}_{}_{}_objectworld_{}.png".format( 313 | grid_size, n_objects, n_colours, discrete, n_samples, epochs, structure, 314 | l1, l2, np.random.randint(10000000))) 315 | 316 | return maxent_EVD, deep_maxent_EVD 317 | 318 | def test_gw_over_samples(grid_size, feature_map, epochs, structure, 
n): 319 | """ 320 | Test MaxEnt and DeepMaxEnt on a gridworld of size grid_size with the feature 321 | map feature_map with different numbers of paths. 322 | 323 | grid_size: Grid size. int. 324 | feature_map: Which feature map to use. String in {ident, coord, proxi}. 325 | epochs: MaxEnt iterations. int. 326 | structure: Neural network structure tuple, e.g. (3, 3) would be a 327 | 3-layer neural network with assumed inputs. 328 | n: Iterations. int. 329 | -> (MaxEnt [(n_samples, mean expected value difference, stdev)], 330 | DeepMaxEnt [(n_samples, mean expected value difference, stdev)]), 331 | raw data (maxent_data, deep_maxent_data) 332 | """ 333 | 334 | maxent_data = [] 335 | deep_maxent_data = [] 336 | for n_samples in (32,): 337 | t = time() 338 | maxent_EVDs = [] 339 | deep_maxent_EVDs = [] 340 | for i in range(n): 341 | print("{}: {}/{}".format(n_samples, i+1, n)) 342 | maxent_EVD, deep_maxent_EVD = test_gw_once(grid_size, feature_map, 343 | n_samples, epochs, 344 | structure) 345 | maxent_EVDs.append(maxent_EVD) 346 | deep_maxent_EVDs.append(deep_maxent_EVD) 347 | print(maxent_EVD, deep_maxent_EVD) 348 | stdout.flush() 349 | maxent_data.append((n_samples, np.mean(maxent_EVDs), 350 | np.std(maxent_EVDs))) 351 | deep_maxent_data.append((n_samples, np.mean(deep_maxent_EVDs), 352 | np.std(deep_maxent_EVDs))) 353 | print("{} (took {:.02}s)".format(n_samples, time() - t)) 354 | print("MaxEnt:", maxent_data) 355 | print("DeepMaxEnt:", deep_maxent_data) 356 | return maxent_data, deep_maxent_data 357 | 358 | def test_ow_over_samples(grid_size, n_objects, n_colours, discrete, l1, l2, 359 | epochs, structure, n): 360 | """ 361 | Test MaxEnt and DeepMaxEnt on an objectworld with different numbers of paths. 362 | 363 | grid_size: Grid size. int. 364 | n_objects: Number of objects. int. 365 | n_colours: Number of colours. int. 366 | discrete: Whether the features should be discrete. bool. 367 | feature_map: Which feature map to use. String in {ident, coord, proxi}. 368 | l1: L1 regularisation. float. 369 | l2: L2 regularisation. float. 370 | epochs: MaxEnt iterations. int. 371 | structure: Neural network structure tuple, e.g. (3, 3) would be a 372 | 3-layer neural network with assumed inputs. 373 | n: Iterations. int. 374 | -> (MaxEnt [(n_samples, mean expected value difference, stdev)], 375 | DeepMaxEnt [(n_samples, mean expected value difference, stdev)]), 376 | raw data (maxent_data, deep_maxent_data) 377 | """ 378 | 379 | maxent_data = [] 380 | deep_maxent_data = [] 381 | for n_samples in (32, 16, 8, 4): 382 | t = time() 383 | maxent_EVDs = [] 384 | deep_maxent_EVDs = [] 385 | for i in range(n): 386 | print("{}: {}/{}".format(n_samples, i+1, n)) 387 | maxent_EVD, deep_maxent_EVD = test_ow_once(grid_size, n_objects, 388 | n_colours, discrete, l1, l2, n_samples, epochs, structure) 389 | maxent_EVDs.append(maxent_EVD) 390 | deep_maxent_EVDs.append(deep_maxent_EVD) 391 | print(maxent_EVD, deep_maxent_EVD) 392 | stdout.flush() 393 | maxent_data.append((n_samples, np.mean(maxent_EVDs), 394 | np.median(maxent_EVDs), np.std(maxent_EVDs))) 395 | deep_maxent_data.append((n_samples, np.mean(deep_maxent_EVDs), 396 | np.median(deep_maxent_EVDs), np.std(deep_maxent_EVDs))) 397 | print("{} (took {:.02}s)".format(n_samples, time() - t)) 398 | print("MaxEnt:", maxent_data) 399 | print("DeepMaxEnt:", deep_maxent_data) 400 | return maxent_data, deep_maxent_data 401 | 402 | if __name__ == '__main__': 403 | # Tests the 16 x 16 objectworld. 
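    # Positional arguments: grid_size, n_objects, n_colours, discrete, l1, l2,
    # epochs, structure, n.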
404 | print(test_ow_over_samples(16, 25, 2, False, 0, 0, 150, (3, 3), 10)) 405 | # Tests the 32 x 32 objectworld. 406 | print(test_ow_over_samples(32, 50, 2, False, 0, 0, 250, (3, 3), 5)) --------------------------------------------------------------------------------
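A minimal usage sketch, following the same pattern as examples/ and experiments.py; the hyperparameter values below are illustrative only, not the settings used in the report.

import numpy as np

from irl import deep_maxent
from irl import value_iteration
from irl.mdp.gridworld import Gridworld

# A small, cheap configuration: 5 x 5 gridworld with 30% wind.
grid_size, wind, discount = 5, 0.3, 0.9
gw = Gridworld(grid_size, wind, discount)

# Expert demonstrations from the optimal policy found by value iteration.
ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])
policy = value_iteration.find_policy(gw.n_states, gw.n_actions,
                                     gw.transition_probability,
                                     ground_r, discount).argmax(axis=1)
trajectories = gw.generate_trajectories(20, 3*grid_size, policy.take)

# DeepMaxEnt IRL with one hidden layer of 3 units on the identity features.
feature_matrix = gw.feature_matrix("ident")
structure = (feature_matrix.shape[1], 3)
r = deep_maxent.irl(structure, feature_matrix, gw.n_actions, discount,
                    gw.transition_probability, trajectories,
                    epochs=150, learning_rate=0.01)
print(r.reshape((grid_size, grid_size)))

Passing (feature_matrix.shape[1],) alone as the structure reduces this to plain MaxEnt, which is how experiments.py produces its MaxEnt baseline.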