├── irl ├── __init__.py ├── mdp │ ├── __init__.py │ ├── __init__.pyc │ ├── gridworld.pyc │ ├── gridworld_test.py │ ├── objectworld.py │ └── gridworld.py ├── maxent.pyc ├── __init__.pyc ├── value_iteration.pyc ├── value_iteration.py ├── linear_irl.py ├── maxent.py └── deep_maxent.py ├── hierarchicalrl ├── sdp_maxent.pyc ├── options_grid_test.pyc ├── options_grid_world.pyc ├── sdp_value_iteration.pyc ├── options_grid_test.py ├── options_maxent.py ├── optionsUsing-nopid.py ├── sdp_value_iteration.py ├── sdp_maxent.py └── options_grid_world.py ├── LICENSE ├── examples ├── lp_gridworld.py ├── maxent_gridworld.py ├── lp_large_gridworld.py ├── maxent_objectworld.py ├── deep_maxent_objectworld.py └── experiments.py ├── README.md └── options-using-q ├── basicOptions.py ├── qLearning.py ├── optionsUsing.py ├── optionsUsing-nopid.py ├── basicOption-tworooms.py └── options-temp.py /irl/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /irl/mdp/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /irl/maxent.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tdavchev/Hierarchical-Inverse-Reinforcement-Learning/HEAD/irl/maxent.pyc -------------------------------------------------------------------------------- /irl/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tdavchev/Hierarchical-Inverse-Reinforcement-Learning/HEAD/irl/__init__.pyc -------------------------------------------------------------------------------- /irl/mdp/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tdavchev/Hierarchical-Inverse-Reinforcement-Learning/HEAD/irl/mdp/__init__.pyc -------------------------------------------------------------------------------- /irl/mdp/gridworld.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tdavchev/Hierarchical-Inverse-Reinforcement-Learning/HEAD/irl/mdp/gridworld.pyc -------------------------------------------------------------------------------- /irl/value_iteration.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tdavchev/Hierarchical-Inverse-Reinforcement-Learning/HEAD/irl/value_iteration.pyc -------------------------------------------------------------------------------- /hierarchicalrl/sdp_maxent.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tdavchev/Hierarchical-Inverse-Reinforcement-Learning/HEAD/hierarchicalrl/sdp_maxent.pyc -------------------------------------------------------------------------------- /hierarchicalrl/options_grid_test.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tdavchev/Hierarchical-Inverse-Reinforcement-Learning/HEAD/hierarchicalrl/options_grid_test.pyc -------------------------------------------------------------------------------- /hierarchicalrl/options_grid_world.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tdavchev/Hierarchical-Inverse-Reinforcement-Learning/HEAD/hierarchicalrl/options_grid_world.pyc -------------------------------------------------------------------------------- /hierarchicalrl/sdp_value_iteration.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tdavchev/Hierarchical-Inverse-Reinforcement-Learning/HEAD/hierarchicalrl/sdp_value_iteration.pyc -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Matthew Alger 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /irl/mdp/gridworld_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unit tests for the gridworld MDP. 3 | 4 | Matthew Alger, 2016 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | import unittest 9 | 10 | import numpy as np 11 | import numpy.random as rn 12 | 13 | import gridworld 14 | 15 | 16 | def make_random_gridworld(): 17 | grid_size = rn.randint(2, 15) 18 | wind = rn.uniform(0.0, 1.0) 19 | discount = rn.uniform(0.0, 1.0) 20 | return gridworld.Gridworld(grid_size, wind, discount) 21 | 22 | 23 | class TestTransitionProbability(unittest.TestCase): 24 | """Tests for Gridworld.transition_probability.""" 25 | 26 | def test_sums_to_one(self): 27 | """Tests that the sum of transition probabilities is approximately 1.""" 28 | # This is a simple fuzz-test. 29 | for _ in range(40): 30 | gw = make_random_gridworld() 31 | self.assertTrue( 32 | np.isclose(gw.transition_probability.sum(axis=2), 1).all(), 33 | 'Probabilities don\'t sum to 1: {}'.format(gw)) 34 | 35 | def test_manual_sums_to_one(self): 36 | """Tests issue #1 on GitHub.""" 37 | gw = gridworld.Gridworld(5, 0.3, 0.2) 38 | self.assertTrue( 39 | np.isclose(gw.transition_probability.sum(axis=2), 1).all()) 40 | 41 | if __name__ == '__main__': 42 | unittest.main() -------------------------------------------------------------------------------- /examples/lp_gridworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run linear programming inverse reinforcement learning on the gridworld MDP. 
3 |
4 | Matthew Alger, 2015
5 | matthew.alger@anu.edu.au
6 | """
7 |
8 | import numpy as np
9 | import matplotlib.pyplot as plt
10 |
11 | import irl.linear_irl as linear_irl
12 | import irl.mdp.gridworld as gridworld
13 |
14 | def main(grid_size, discount):
15 |     """
16 |     Run linear programming inverse reinforcement learning on the gridworld MDP.
17 |
18 |     Plots the reward function.
19 |
20 |     grid_size: Grid size. int.
21 |     discount: MDP discount factor. float.
22 |     """
23 |
24 |     wind = 0.3
25 |     trajectory_length = 3*grid_size
26 |
27 |     gw = gridworld.Gridworld(grid_size, wind, discount)
28 |
29 |     ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])
30 |     policy = [gw.optimal_policy_deterministic(s) for s in range(gw.n_states)]
31 |     r = linear_irl.irl(gw.n_states, gw.n_actions, gw.transition_probability,
32 |                        policy, gw.discount, 1, 5)
33 |
34 |     plt.subplot(1, 2, 1)
35 |     plt.pcolor(ground_r.reshape((grid_size, grid_size)))
36 |     plt.colorbar()
37 |     plt.title("Groundtruth reward")
38 |     plt.subplot(1, 2, 2)
39 |     plt.pcolor(r.reshape((grid_size, grid_size)))
40 |     plt.colorbar()
41 |     plt.title("Recovered reward")
42 |     plt.show()
43 |
44 | if __name__ == '__main__':
45 |     main(5, 0.2)
46 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Hierarchical Inverse Reinforcement Learning
2 |
3 | Extends [M. Alger's](https://doi.org/10.5281/zenodo.555999) implementation of selected inverse reinforcement learning (IRL) algorithms. A summary report of my work is available [here](https://www.overleaf.com/read/mkkfqgpnbvnr). His final report is available [here](http://matthewja.com/pdfs/irl.pdf) and describes the implemented algorithms.
4 |
5 | If you use this code in your work, you can cite it as follows:
6 | ```bibtex
7 | @misc{davchev17,
8 |   author = {Todor Davchev},
9 |   title = {Hierarchical Inverse Reinforcement Learning},
10 |   year = 2017
11 | }
12 | ```
13 | If you are only interested in the IRL aspect of this project, you can find it at [Alger's repo](https://github.com/MatthewJA/Inverse-Reinforcement-Learning).
14 | ## Algorithms implemented
15 |
16 | - Linear programming IRL. From Ng & Russell, 2000. Small state space and large state space linear programming IRL.
17 | - Maximum entropy IRL. From Ziebart et al., 2008.
18 | - Deep maximum entropy IRL. From Wulfmeier et al., 2015; original derivation.
19 | - Hierarchical MaxEnt IRL.
20 |
21 | Additionally, the following MDP and semi-MDP domains are implemented:
22 | - Gridworld (Sutton, 1998)
23 | - Extended Gridworld with options (Sutton, 1998)
24 | - Objectworld (Levine et al., 2011)
25 |
26 | ## Requirements
27 | - NumPy
28 | - SciPy
29 | - CVXOPT
30 | - Theano
31 | - Matplotlib (for examples)
32 |
--------------------------------------------------------------------------------
/examples/maxent_gridworld.py:
--------------------------------------------------------------------------------
1 | """
2 | Run maximum entropy inverse reinforcement learning on the gridworld MDP.
3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | 11 | import sys 12 | sys.path.append("/Users/todordavchev/Documents/temp/") 13 | 14 | import irl.maxent as maxent 15 | import irl.mdp.gridworld as gridworld 16 | 17 | def main(grid_size, discount, n_trajectories, epochs, learning_rate): 18 | """ 19 | Run maximum entropy inverse reinforcement learning on the gridworld MDP. 20 | 21 | Plots the reward function. 22 | 23 | grid_size: Grid size. int. 24 | discount: MDP discount factor. float. 25 | n_trajectories: Number of sampled trajectories. int. 26 | epochs: Gradient descent iterations. int. 27 | learning_rate: Gradient descent learning rate. float. 28 | """ 29 | 30 | wind = 0.3 31 | trajectory_length = 3*grid_size 32 | 33 | gw = gridworld.Gridworld(grid_size, wind, discount) 34 | trajectories = gw.generate_trajectories(n_trajectories, 35 | trajectory_length, 36 | gw.optimal_policy) 37 | feature_matrix = gw.feature_matrix() 38 | ground_r = np.array([gw.reward(s) for s in range(gw.n_states)]) 39 | r = maxent.irl(feature_matrix, gw.n_actions, discount, 40 | gw.transition_probability, trajectories, epochs, learning_rate) 41 | 42 | plt.subplot(1, 2, 1) 43 | plt.pcolor(ground_r.reshape((grid_size, grid_size))) 44 | plt.colorbar() 45 | plt.title("Groundtruth reward") 46 | plt.subplot(1, 2, 2) 47 | plt.pcolor(r.reshape((grid_size, grid_size))) 48 | plt.colorbar() 49 | plt.title("Recovered reward") 50 | plt.show() 51 | 52 | if __name__ == '__main__': 53 | main(5, 0.01, 20, 200, 0.01) 54 | -------------------------------------------------------------------------------- /hierarchicalrl/options_grid_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unit tests for the gridworld MDP. 3 | 4 | Todor Davchev, 2017 5 | t.b.davchev@ed.ac.uk 6 | """ 7 | 8 | import unittest 9 | 10 | import numpy as np 11 | import numpy.random as rn 12 | 13 | import options_grid_world as gridworld 14 | 15 | 16 | def make_random_gridworld(): 17 | grid_size = rn.randint(2, 15) 18 | wind = rn.uniform(0.0, 1.0) 19 | discount = rn.uniform(0.0, 1.0) 20 | return gridworld.Gridworld(grid_size, wind, discount) 21 | 22 | 23 | class TestTransitionProbability(unittest.TestCase): 24 | """Tests for Gridworld.transition_probability.""" 25 | 26 | # def test_sums_to_one(self): 27 | # """Tests that the sum of transition probabilities is approximately 1.""" 28 | # # This is a simple fuzz-test. 
29 | # for _ in range(40): 30 | # gw = make_random_gridworld() 31 | # self.assertTrue( 32 | # np.isclose(gw.transition_probability.sum(axis=2), 1).all(), 33 | # 'Probabilities don\'t sum to 1: {}'.format(gw)) 34 | 35 | def test_manual_sums_to_one(self): 36 | """Tests issue #1 on GitHub.""" 37 | walls = [ 38 | (5, 0), (5, 1), (5, 3), (5, 4), (5, 5), (5, 6), (5, 7), (5, 8), (5, 10), 39 | (0, 5), (2, 5), (3, 5), (4, 5), 40 | (6, 6), (7, 6), (9, 6), (10, 6) 41 | ] 42 | gw = gridworld.Large_Gridworld(11, walls, 0.3, 0.2) 43 | self.assertTrue( 44 | np.isclose(gw.options_transition_probability.sum(axis=2), 1).all()) 45 | 46 | # take out all walls since their probabilities == 0 47 | bb = gw.improved_transition_probability.sum(axis=3) 48 | aa = gw.transition_probability.sum(axis=2) 49 | self.assertTrue( 50 | np.isclose([x for i, x in enumerate(aa) if x.all() != 0.], 1).all()) 51 | 52 | if __name__ == '__main__': 53 | unittest.main() -------------------------------------------------------------------------------- /examples/lp_large_gridworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run large state space linear programming inverse reinforcement learning on the 3 | gridworld MDP. 4 | 5 | Matthew Alger, 2015 6 | matthew.alger@anu.edu.au 7 | """ 8 | 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | 12 | import irl.linear_irl as linear_irl 13 | import irl.mdp.gridworld as gridworld 14 | from irl.value_iteration import value 15 | 16 | def main(grid_size, discount): 17 | """ 18 | Run large state space linear programming inverse reinforcement learning on 19 | the gridworld MDP. 20 | 21 | Plots the reward function. 22 | 23 | grid_size: Grid size. int. 24 | discount: MDP discount factor. float. 25 | """ 26 | 27 | wind = 0.3 28 | trajectory_length = 3*grid_size 29 | 30 | gw = gridworld.Gridworld(grid_size, wind, discount) 31 | 32 | ground_r = np.array([gw.reward(s) for s in range(gw.n_states)]) 33 | policy = [gw.optimal_policy_deterministic(s) for s in range(gw.n_states)] 34 | 35 | # Need a value function for each basis function. 36 | feature_matrix = gw.feature_matrix() 37 | values = [] 38 | for dim in range(feature_matrix.shape[1]): 39 | reward = feature_matrix[:, dim] 40 | values.append(value(policy, gw.n_states, gw.transition_probability, 41 | reward, gw.discount)) 42 | values = np.array(values) 43 | 44 | r = linear_irl.large_irl(values, gw.transition_probability, 45 | feature_matrix, gw.n_states, gw.n_actions, policy) 46 | 47 | plt.subplot(1, 2, 1) 48 | plt.pcolor(ground_r.reshape((grid_size, grid_size))) 49 | plt.colorbar() 50 | plt.title("Groundtruth reward") 51 | plt.subplot(1, 2, 2) 52 | plt.pcolor(r.reshape((grid_size, grid_size))) 53 | plt.colorbar() 54 | plt.title("Recovered reward") 55 | plt.show() 56 | 57 | if __name__ == '__main__': 58 | main(10, 0.9) 59 | -------------------------------------------------------------------------------- /examples/maxent_objectworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run maximum entropy inverse reinforcement learning on the objectworld MDP. 
3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | 11 | import sys 12 | sys.path.append("/home/todor/Documents/workspace/smdp") 13 | 14 | import irl.maxent as maxent 15 | import irl.mdp.objectworld as objectworld 16 | from irl.value_iteration import find_policy 17 | 18 | def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs, 19 | learning_rate): 20 | """ 21 | Run maximum entropy inverse reinforcement learning on the objectworld MDP. 22 | 23 | Plots the reward function. 24 | 25 | grid_size: Grid size. int. 26 | discount: MDP discount factor. float. 27 | n_objects: Number of objects. int. 28 | n_colours: Number of colours. int. 29 | n_trajectories: Number of sampled trajectories. int. 30 | epochs: Gradient descent iterations. int. 31 | learning_rate: Gradient descent learning rate. float. 32 | """ 33 | 34 | wind = 0.3 35 | trajectory_length = 8 36 | 37 | ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind, 38 | discount) 39 | ground_r = np.array([ow.reward(s) for s in range(ow.n_states)]) 40 | policy = find_policy(ow.n_states, ow.n_actions, ow.transition_probability, 41 | ground_r, ow.discount, stochastic=False) 42 | trajectories = ow.generate_trajectories(n_trajectories, 43 | trajectory_length, 44 | lambda s: policy[s]) 45 | feature_matrix = ow.feature_matrix(discrete=False) 46 | r = maxent.irl(feature_matrix, ow.n_actions, discount, 47 | ow.transition_probability, trajectories, epochs, learning_rate) 48 | 49 | plt.subplot(1, 2, 1) 50 | plt.pcolor(ground_r.reshape((grid_size, grid_size))) 51 | plt.colorbar() 52 | plt.title("Groundtruth reward") 53 | plt.subplot(1, 2, 2) 54 | plt.pcolor(r.reshape((grid_size, grid_size))) 55 | plt.colorbar() 56 | plt.title("Recovered reward") 57 | plt.show() 58 | 59 | if __name__ == '__main__': 60 | main(10, 0.9, 15, 2, 20, 50, 0.01) 61 | -------------------------------------------------------------------------------- /examples/deep_maxent_objectworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run maximum entropy inverse reinforcement learning on the objectworld MDP. 3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | 11 | import sys 12 | sys.path.append("/home/todor/Documents/workspace/smdp") 13 | 14 | import irl.deep_maxent as deep_maxent 15 | import irl.mdp.objectworld as objectworld 16 | from irl.value_iteration import find_policy 17 | 18 | def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs, 19 | learning_rate, structure): 20 | """ 21 | Run deep maximum entropy inverse reinforcement learning on the objectworld 22 | MDP. 23 | 24 | Plots the reward function. 25 | 26 | grid_size: Grid size. int. 27 | discount: MDP discount factor. float. 28 | n_objects: Number of objects. int. 29 | n_colours: Number of colours. int. 30 | n_trajectories: Number of sampled trajectories. int. 31 | epochs: Gradient descent iterations. int. 32 | learning_rate: Gradient descent learning rate. float. 33 | structure: Neural network structure. Tuple of hidden layer dimensions, e.g., 34 | () is no neural network (linear maximum entropy) and (3, 4) is two 35 | hidden layers with dimensions 3 and 4. 
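    For example, the call in the __main__ block at the bottom of this file,
        main(10, 0.9, 15, 2, 20, 50, 0.01, (3, 3)), uses two hidden layers of width 3.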
36 | """ 37 | 38 | wind = 0.3 39 | trajectory_length = 8 40 | l1 = l2 = 0 41 | 42 | ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind, 43 | discount) 44 | ground_r = np.array([ow.reward(s) for s in range(ow.n_states)]) 45 | policy = find_policy(ow.n_states, ow.n_actions, ow.transition_probability, 46 | ground_r, ow.discount, stochastic=False) 47 | trajectories = ow.generate_trajectories(n_trajectories, 48 | trajectory_length, 49 | lambda s: policy[s]) 50 | feature_matrix = ow.feature_matrix(discrete=False) 51 | r = deep_maxent.irl((feature_matrix.shape[1],) + structure, feature_matrix, 52 | ow.n_actions, discount, ow.transition_probability, trajectories, epochs, 53 | learning_rate, l1=l1, l2=l2) 54 | 55 | plt.subplot(1, 2, 1) 56 | plt.pcolor(ground_r.reshape((grid_size, grid_size))) 57 | plt.colorbar() 58 | plt.title("Groundtruth reward") 59 | plt.subplot(1, 2, 2) 60 | plt.pcolor(r.reshape((grid_size, grid_size))) 61 | plt.colorbar() 62 | plt.title("Recovered reward") 63 | plt.show() 64 | 65 | if __name__ == '__main__': 66 | main(10, 0.9, 15, 2, 20, 50, 0.01, (3, 3)) 67 | -------------------------------------------------------------------------------- /options-using-q/basicOptions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | 5 | Sx = 7 6 | Sy = 7 7 | S = Sx*Sy 8 | P = 5 # there is a state for being in the taxi 9 | G = 4 10 | R = 0 11 | O = 2 12 | maxR = -999999 13 | hallways = [44, 27] 14 | rooms = [[ 15 | 8, 9, 10, 11, 12, 16 | 15, 16 ,17, 18, 19, 17 | 22, 23, 24, 25, 26, 27, 18 | 36, 37, 38, 39, 40, 19 | 44 20 | ]] 21 | 22 | walls = [[ 23 | 0, 1, 2, 3, 4, 5, 6, 24 | 7, 14, 21, 28, 35, 42, 25 | 43, 45, 46, 47, 48, 26 | 13, 20, 34, 41, 48, 27 | 51 28 | ]] 29 | 30 | # init_states = [0, 0, 1, 1, 2, 2, 3, 3] 31 | pickUps = [44, 27] 32 | A = 6 33 | T = 3000 34 | stepNo = 0 35 | avg_reward = np.zeros([Sx, Sy, P, G]) 36 | reward = np.zeros([P, G]) 37 | avg = np.zeros([T,1]) 38 | time_course = np.zeros([T, 3]) 39 | Q = 0.1*np.random.rand(S, O, A, P, G) 40 | for i in xrange(49): 41 | for o in xrange(O): 42 | for a in xrange(A): 43 | for p in xrange(P): 44 | for g in xrange(G): 45 | if i not in rooms[0]: 46 | Q[i, o, a, p, g] = 0 47 | 48 | V = [np.max(Q[:, o, :], axis=1) for o in xrange(O)] 49 | eta = 0.1 50 | gamma = 0.9 51 | epsilon = 0.1 52 | reward_course = np.zeros([T, 1]) 53 | reward_mean = np.zeros([T, 1]) 54 | 55 | option = 0 56 | 57 | stepsToGoal = np.zeros([T, 1]) 58 | maxV = -9999 59 | for t in xrange(T): 60 | plocation = pickUps[1 - option] 61 | pID = [i for i, x in enumerate(pickUps) if x == plocation][0] 62 | Goal = pickUps[option] 63 | p0 = plocation 64 | gID = [i for i, x in enumerate(pickUps) if x == Goal][0] 65 | s0 = np.random.choice([state for state in xrange(S) if state not in walls[0]]) 66 | state = [s0, pID, gID] #[{1..25} {1..5} {1..4}] 67 | for u in xrange(S**2): 68 | if (stepNo > 30): 69 | stepNo = 0 70 | break 71 | 72 | r = 0 73 | [V[option][s0, pID, gID], a0] = [np.max(Q[s0, option, :, pID, gID]), np.argmax(Q[s0, option, :, pID, gID])] 74 | if (np.random.rand(1) < epsilon): 75 | a0 = np.random.choice(A) 76 | 77 | 78 | if a0 == 4: 79 | if pID != 4: 80 | if s0 == pickUps[pID]: 81 | r = 1 82 | pID = 4 83 | stepNo = 0 84 | else: 85 | r = -1 86 | else: 87 | r = -1 88 | 89 | 90 | if a0 == 5: 91 | if (s0 == pickUps[gID]) and pID==4: 92 | stepsToGoal[t] = stepNo 93 | r = 10/float(stepNo) 94 | if maxR < r: 95 | maxR = r 96 | 97 | stepNo = 0 98 | else: 99 | r = -1 100 | 101 | 
102 | if a0 == 0: 103 | s1 = s0 - Sx 104 | if s1 in walls[0]: 105 | s1 = s1 + Sx 106 | r = -1 107 | 108 | 109 | if a0 == 1: 110 | s1 = s0 + Sx 111 | if s1 in walls[0]: 112 | s1 = s1 - Sx 113 | r = -1 114 | 115 | 116 | if a0 == 2: 117 | s1 = s0 - 1 118 | if s1 in walls[0]: 119 | s1 = s1 + 1 120 | r = -1 121 | 122 | 123 | if a0 == 3: 124 | s1 = s0 + 1 125 | if s1 in walls[0]: 126 | s1 = s1 - 1 127 | r = -1 128 | 129 | if a0 == 4: 130 | s1 = s0 131 | 132 | 133 | if a0 == 5: 134 | s1 = s0 135 | 136 | 137 | # learning step 138 | if t > 1000: 139 | R += r 140 | 141 | 142 | # print r 143 | FullR = R + r 144 | reward_course[t] = r 145 | reward_mean[t] = R/float(t+1) 146 | 147 | 148 | V[option][s1, pID, gID] = np.max(Q[s1, option, :, pID, gID]) 149 | 150 | 151 | if maxV < V[option][s1, pID, gID]: 152 | maxV = V[option][s1, pID, gID] 153 | 154 | 155 | time_course[t, 0] = V[option][s1, pID, gID] 156 | time_course[t, 1] = eta*(r+gamma*V[option][s1, pID, gID]) 157 | time_course[t, 2] = (1-eta)*Q[s0, option, a0, pID, gID] 158 | Q[s0, option, a0, pID, gID] = (1-eta)*Q[s0, option, a0, pID, gID] + \ 159 | eta*(r + gamma*V[option][s1, pID, gID]) 160 | if pID == 4: 161 | stepNo += 1 162 | 163 | 164 | if (s0 == pickUps[gID]) and (a0 == 5): 165 | stepNo = 0 166 | break 167 | 168 | 169 | s0 = s1 170 | 171 | 172 | avg[t] = np.mean(np.mean(np.mean(np.mean(Q[:, option, :])))) 173 | 174 | 175 | meanR = R/float(T-1000) 176 | fullMR = FullR/float(T) 177 | print meanR 178 | print fullMR 179 | print maxV 180 | policy = [np.max(Q[i, option, :, pID, gID]) for i in xrange(S)] 181 | policy_actions = [np.argmax(Q[i, option, :, pID, gID]) for i in xrange(S)] 182 | print len(policy) 183 | policy_actions = np.reshape(policy_actions, [7, 7]) 184 | 185 | 186 | for i in xrange(7): 187 | for j in xrange(7): 188 | print "{0} ".format(policy_actions[i, j]), 189 | 190 | print " " -------------------------------------------------------------------------------- /irl/value_iteration.py: -------------------------------------------------------------------------------- 1 | """ 2 | Find the value function associated with a policy. Based on Sutton & Barto, 1998. 3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | import numpy as np 9 | 10 | def value(policy, n_states, transition_probabilities, reward, discount, 11 | threshold=1e-2): 12 | """ 13 | Find the value function associated with a policy. 14 | 15 | policy: List of action ints for each state. 16 | n_states: Number of states. int. 17 | transition_probabilities: Function taking (state, action, state) to 18 | transition probabilities. 19 | reward: Vector of rewards for each state. 20 | discount: MDP discount factor. float. 21 | threshold: Convergence threshold, default 1e-2. float. 22 | -> Array of values for each state 23 | """ 24 | v = np.zeros(n_states) 25 | 26 | diff = float("inf") 27 | while diff > threshold: 28 | diff = 0 29 | for s in range(n_states): 30 | vs = v[s] 31 | a = policy[s] 32 | v[s] = sum(transition_probabilities[s, a, k] * 33 | (reward[k] + discount * v[k]) 34 | for k in range(n_states)) 35 | diff = max(diff, abs(vs - v[s])) 36 | 37 | return v 38 | 39 | def optimal_value(n_states, n_actions, transition_probabilities, reward, 40 | discount, threshold=1e-2): 41 | """ 42 | Find the optimal value function. 43 | 44 | n_states: Number of states. int. 45 | n_actions: Number of actions. int. 46 | transition_probabilities: Function taking (state, action, state) to 47 | transition probabilities. 48 | reward: Vector of rewards for each state. 
49 | discount: MDP discount factor. float. 50 | threshold: Convergence threshold, default 1e-2. float. 51 | -> Array of values for each state 52 | """ 53 | 54 | v = np.zeros(n_states) 55 | 56 | diff = float("inf") 57 | while diff > threshold: 58 | diff = 0 59 | for s in range(n_states): 60 | max_v = float("-inf") 61 | for a in range(n_actions): 62 | tp = transition_probabilities[s, a, :] 63 | # max_v = max(max_v, sum(reward + np.dot(tp, discount*v))) 64 | max_v = max(max_v, np.dot(tp, reward + discount*v)) 65 | 66 | new_diff = abs(v[s] - max_v) 67 | if new_diff > diff: 68 | diff = new_diff 69 | v[s] = max_v 70 | 71 | return v 72 | 73 | def find_policy(n_states, n_actions, transition_probabilities, reward, discount, 74 | threshold=1e-2, v=None, stochastic=True): 75 | """ 76 | Find the optimal policy. 77 | 78 | n_states: Number of states. int. 79 | n_actions: Number of actions. int. 80 | transition_probabilities: Function taking (state, action, state) to 81 | transition probabilities. 82 | reward: Vector of rewards for each state. 83 | discount: MDP discount factor. float. 84 | threshold: Convergence threshold, default 1e-2. float. 85 | v: Value function (if known). Default None. 86 | stochastic: Whether the policy should be stochastic. Default True. 87 | -> Action probabilities for each state or action int for each state 88 | (depending on stochasticity). 89 | """ 90 | 91 | if v is None: 92 | v = optimal_value(n_states, n_actions, transition_probabilities, reward, 93 | discount, threshold) 94 | 95 | if stochastic: 96 | # Get Q using equation 9.2 from Ziebart's thesis. 97 | Q = np.zeros((n_states, n_actions)) 98 | for i in range(n_states): 99 | for j in range(n_actions): 100 | p = transition_probabilities[i, j, :] 101 | Q[i, j] = p.dot(reward + discount*v) 102 | Q -= Q.max(axis=1).reshape((n_states, 1)) # For numerical stability. 103 | Q = np.exp(Q)/np.exp(Q).sum(axis=1).reshape((n_states, 1)) 104 | return Q 105 | 106 | def _policy(s): 107 | return max(range(n_actions), 108 | key=lambda a: sum(transition_probabilities[s, a, k] * 109 | (reward[k] + discount * v[k]) 110 | for k in range(n_states))) 111 | policy = np.array([_policy(s) for s in range(n_states)]) 112 | return policy 113 | 114 | if __name__ == '__main__': 115 | # Quick unit test using gridworld. 
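    # Builds a 3x3 gridworld (wind 0.3, discount 0.9) and checks that value()
    # under the hand-coded optimal deterministic policy matches optimal_value().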
116 | import mdp.gridworld as gridworld 117 | gw = gridworld.Gridworld(3, 0.3, 0.9) 118 | v = value([gw.optimal_policy_deterministic(s) for s in range(gw.n_states)], 119 | gw.n_states, 120 | gw.transition_probability, 121 | [gw.reward(s) for s in range(gw.n_states)], 122 | gw.discount) 123 | assert np.isclose(v, 124 | [5.7194282, 6.46706692, 6.42589811, 125 | 6.46706692, 7.47058224, 7.96505174, 126 | 6.42589811, 7.96505174, 8.19268666], 1).all() 127 | opt_v = optimal_value(gw.n_states, 128 | gw.n_actions, 129 | gw.transition_probability, 130 | [gw.reward(s) for s in range(gw.n_states)], 131 | gw.discount) 132 | assert np.isclose(v, opt_v).all() 133 | -------------------------------------------------------------------------------- /options-using-q/qLearning.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | 5 | Sx = 5 6 | Sy = 5 7 | S = Sx*Sy 8 | P = 5 # there is a state for being in the taxi 9 | G = 4 10 | R = 0 11 | maxR = -999999 12 | pickUps = [0, Sx-1, S-Sx, S-2] 13 | A = 6 14 | T = 3000 15 | stepNo = 0 16 | avg_reward = np.zeros([Sx, Sy, P, G]) 17 | # reward = np.zeros([P, G]) 18 | rewards = np.ones((S,A,P,G)) 19 | rewards *= -2 20 | avg = np.zeros([T,1]) 21 | time_course = np.zeros([T, 3]) 22 | Q = 0.1*np.random.rand(S, A, P, G) 23 | V = np.max(Q, axis=1) 24 | eta = 0.1 25 | gamma = 0.9 26 | epsilon = 0.1 27 | reward_course = np.zeros([T, 1]) 28 | reward_mean = np.zeros([T, 1]) 29 | 30 | 31 | stepsToGoal = np.zeros([T, 1]) 32 | maxV = -9999 33 | for t in range(T): 34 | plocation = 20 35 | pID = [i for i, x in enumerate(pickUps) if x == plocation][0] 36 | Goal = 24 37 | p0 = plocation + 1 38 | gID = [i for i, x in enumerate(pickUps) if x == Goal-1][0] 39 | s0 = np.random.choice(S) 40 | state = [s0, pID, gID] #[{1..25} {1..5} {1..4}] 41 | for u in range(S**4): 42 | if (stepNo > 30): 43 | stepNo = 0 44 | break 45 | 46 | 47 | r = 0 48 | [V[s0, pID, gID], a0] = [np.max(Q[s0, :, pID, gID]), np.argmax(Q[s0, :, pID, gID])] 49 | rewards[s0, a0, pID, gID] = 0 50 | if (np.random.rand(1) < epsilon): 51 | a0 = np.random.choice(A) 52 | 53 | 54 | if a0 == 4: 55 | if pID != 4: 56 | if s0 == pickUps[pID]: 57 | r = 1 58 | rewards[s0, a0, pID, gID] = 1 59 | pID = 4 60 | stepNo = 0 61 | else: 62 | r = -1 63 | rewards[s0, a0, pID, gID] = -1 64 | else: 65 | r = -1 66 | rewards[s0, a0, pID, gID] = -1 67 | 68 | 69 | if a0 == 5: 70 | if (s0 == pickUps[gID]) and pID==4: 71 | stepsToGoal[t] = stepNo 72 | r = 10/float(stepNo) 73 | rewards[s0, a0, pID, gID] = 10/float(stepNo) 74 | if maxR < r: 75 | maxR = r 76 | 77 | 78 | stepNo = 0 79 | else: 80 | r = -1 81 | rewards[s0, a0, pID, gID] = -1 82 | 83 | 84 | if a0 == 0: #nagore 85 | s1 = s0 - Sx 86 | if s1 < 0: 87 | s1 = s1 + Sx 88 | r = -1 89 | rewards[s0, a0, pID, gID] = -1 90 | 91 | 92 | if a0 == 1: 93 | s1 = s0 + Sx #nadolo 94 | if s1 > 24: 95 | s1 = s1 - Sx 96 | r = -1 97 | rewards[s0, a0, pID, gID] = -1 98 | 99 | if a0 == 2: 100 | s1 = s0-1 #nalqvo 101 | if s1==-1 or s1==4 or s1==9 or s1==14 or s1==19: 102 | s1=s1+1 103 | r = -1 104 | rewards[s0, a0, pID, gID] = -1 105 | 106 | if s1==1 or s1==6 or s1==20 or s1==15 or s1==17 or s1==22: 107 | s1 = s1+1 108 | r = -1 109 | rewards[s0, a0, pID, gID] = -1 110 | 111 | if a0 == 3: 112 | s1 = s0 + 1 #nadqsno 113 | if s1 == 5 or s1 == 10 or s1 == 15 or s1 == 20 or s1==25: 114 | s1 = s1 - 1 115 | r = -1 116 | rewards[s0, a0, pID, gID] = -1 117 | 118 | if s1 == 2 or s1 == 7 or s1 == 21 or s1 == 16 or s1 == 18 or s1 == 23: 119 | s1 = s1 - 1 
120 | r = -1 121 | rewards[s0, a0, pID, gID] = -1 122 | 123 | 124 | if a0 == 4: 125 | s1 = s0 126 | 127 | 128 | if a0 == 5: #vzemi pacient 129 | s1 = s0 #na gol 130 | 131 | 132 | # learning step 133 | if t > 1000: 134 | R += r 135 | 136 | 137 | # print r 138 | FullR = R + r 139 | reward_course[t] = r 140 | reward_mean[t] = R/float(t+1) 141 | 142 | 143 | V[s1, pID, gID] = np.max(Q[s1, :, pID, gID]) 144 | 145 | 146 | if maxV < V[s1, pID, gID]: 147 | maxV = V[s1, pID, gID] 148 | 149 | 150 | time_course[t, 0] = V[s1, pID, gID] 151 | time_course[t, 1] = eta*(r+gamma*V[s1, pID, gID]) 152 | time_course[t, 2] = (1-eta)*Q[s0, a0, pID, gID] 153 | Q[s0, a0, pID, gID] = (1-eta)*Q[s0, a0, pID, gID] + eta*(r + gamma*V[s1,pID,gID]) 154 | if pID == 4: 155 | stepNo += 1 156 | 157 | 158 | if (s0 == pickUps[gID]) and (a0 == 5) and pID == 4: 159 | stepNo = 0 160 | break 161 | 162 | 163 | s0 = s1 164 | 165 | 166 | avg[t] = np.mean(np.mean(np.mean(np.mean(Q)))) 167 | 168 | 169 | meanR = R/float(T-1000) 170 | fullMR = FullR/float(T) 171 | print(meanR) 172 | print(fullMR) 173 | print(maxV) 174 | policy = [np.max(Q[i, :, pID, gID]) for i in range(S)] 175 | policy_actions = [np.argmax(Q[i, :, pID, gID]) for i in range(S)] 176 | print(len(policy)) 177 | policy_actions = np.reshape(policy_actions, [5, 5]) 178 | policy = np.reshape(policy, [5, 5]) 179 | 180 | for j in range(5): 181 | for i in range(5): 182 | print("{0} ".format(policy_actions[j, i]), end=' ') 183 | 184 | print(" ") 185 | print("REWARDS:") 186 | i = 0 187 | for j in range(25): 188 | # for i in range(6): 189 | print("{0} ".format(rewards[j, i, pID, gID]), end=' ') 190 | 191 | print(" ") -------------------------------------------------------------------------------- /hierarchicalrl/options_maxent.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run maximum entropy inverse reinforcement learning on the options gridworld MDP. 3 | 4 | Todor Davchev, 2017 5 | t.b.davchev@ed.ac.uk 6 | """ 7 | import matplotlib 8 | matplotlib.use('Agg') 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | import csv 12 | 13 | import sdp_maxent as maxent 14 | import options_grid_world as options_gridworld 15 | 16 | 17 | def main(grid_size, discount, n_trajectories, epochs, learning_rate): 18 | """ 19 | Run maximum entropy inverse reinforcement learning on the gridworld MDP. 20 | 21 | Plots the reward function. 22 | 23 | grid_size: Grid size. int. 24 | discount: MDP discount factor. float. 25 | n_trajectories: Number of sampled trajectories. int. 26 | epochs: Gradient descent iterations. int. 27 | learning_rate: Gradient descent learning rate. float. 
28 | """ 29 | 30 | wind = 0.3 31 | trajectory_length = 3*grid_size/2 32 | 33 | walls = [ 34 | (5, 0), (5, 1), (5, 3), (5, 4), (5, 5), (5, 6), (5, 7), (5, 8), (5, 10), 35 | (0, 5), (2, 5), (3, 5), (4, 5), 36 | (6, 6), (7, 6), (9, 6), (10, 6) 37 | ] 38 | 39 | options = [ 40 | {'init_set': (1, 5), 'termination': (5, 2), 'room': 0, 'id': 0, 41 | "min": (-1, -1), "max": (5, 5)}, 42 | {'init_set': (5, 2), 'termination': (1, 5), 'room': 0, 'id': 1, 43 | "min": (-1, -1), "max": (5, 5)}, 44 | {'init_set': (5, 2), 'termination': (8, 6), 'room': 1, 'id': 2, 45 | "min": (5, -1), "max": (11, 6)}, 46 | {'init_set': (8, 6), 'termination': (5, 2), 'room': 1, 'id': 3, 47 | "min": (5, -1), "max": (11, 6)}, 48 | {'init_set': (8, 6), 'termination': (5, 9), 'room': 2, 'id': 4, 49 | 'min': (5, 6), 'max': (11, 11)}, 50 | {'init_set': (5, 9), 'termination': (8, 6), 'room': 2, 'id': 5, 51 | 'min': (5, 6), 'max': (11, 11)}, 52 | {'init_set': (5, 9), 'termination': (1, 5), 'room': 3, 'id': 6, 53 | 'min': (-1, 5), "max": (5, 11)}, 54 | {'init_set': (1, 5), 'termination': (5, 9), 'room': 3, 'id': 7, 55 | 'min': (-1, 5), "max": (5, 11)} 56 | ] 57 | 58 | rooms = [ 59 | [ 60 | 0, 1, 2, 3, 4, 61 | 11, 12, 13, 14, 15, 62 | 22, 23, 24, 25, 26, 63 | 33, 34, 35, 36, 37, 64 | 44, 45, 46, 47, 48, 65 | 56, 27 66 | ], 67 | [ 68 | 6, 7, 8, 9, 10, 69 | 17, 18, 19, 20, 21, 70 | 28, 29, 30, 31, 32, 71 | 39, 40, 41, 42, 43, 72 | 50, 51, 52, 53, 54, 73 | 61, 62, 63, 64, 65, 74 | 74, 27 75 | ], 76 | [ 77 | 83, 84, 85, 86, 87, 78 | 94, 95, 96, 97, 98, 79 | 105, 106, 107, 108, 109, 80 | 116, 117, 118, 119, 120, 81 | 104, 74 82 | ], 83 | [ 84 | 66, 67, 68, 69, 70, 85 | 77, 78, 79, 80, 81, 86 | 88, 89, 90, 91, 92, 87 | 99, 100, 101, 102, 103, 88 | 110, 111, 112, 113, 114, 89 | 56, 104 90 | ] 91 | ] 92 | g_world = options_gridworld.Large_Gridworld(grid_size, walls, options, rooms, wind, discount) 93 | trajectories = [] 94 | for opt in options: 95 | trajectories.append( 96 | g_world.generate_intra_option_trajectories( 97 | n_trajectories, 98 | trajectory_length, 99 | g_world.intra_option_optimal_policy, 100 | opt)) 101 | 102 | global_trajectories = g_world.generate_option_option_trajectories( 103 | trajectories, n_trajectories, 104 | g_world.option_option_optimal_policy, 105 | g_world.intra_option_optimal_policy) 106 | feature_matrix = g_world.feature_matrix() 107 | option_feature_matrix = g_world.o_feature_matrix() 108 | #the reward needs to be changed not per room but per option.. 
109 | ground_r = np.array([g_world.reward(state) for state in range(grid_size**2)]) 110 | ground_opt_r = np.array([g_world.opt_reward(opt) for opt in range(len(options))]) 111 | options_states = [rooms[opts["room"]] for opts in options] 112 | print("Compute the reward.") 113 | reward, o_reward = maxent.irl( 114 | options_states, feature_matrix, 115 | option_feature_matrix, g_world.n_actions, 116 | g_world.n_options, discount, g_world.options_transition_probability, 117 | g_world.improved_transition_probability, trajectories, global_trajectories, 118 | epochs, learning_rate, g_world.int_to_point, options) 119 | result = np.zeros((len(options),grid_size**2)) 120 | option_result = np.zeros(8) 121 | writer = csv.writer(open("results/results.csv", 'w')) 122 | with open("results/opt_results.csv", 'wb') as csvfile: 123 | opt_writer = csv.writer(csvfile) 124 | opt_writer.writerow(o_reward) 125 | 126 | with open("results/results.csv", 'wb') as csvfile: 127 | writer = csv.writer(csvfile) 128 | for o in range(len(options)): 129 | for broi, value in enumerate(options_states[o]): 130 | result[o][value] = reward[o][broi] 131 | writer.writerow(result[o]) 132 | 133 | # plt.savefig('/tmp/test.png') 134 | # plt.subplot(1, 2, 1) 135 | # plt.pcolor(ground_r.reshape((grid_size, grid_size))) 136 | # plt.colorbar() 137 | # plt.title("Groundtruth reward") 138 | # plt.subplot(1, 2, 2) 139 | # plt.pcolor(result[o].reshape((grid_size, grid_size))) 140 | # plt.colorbar() 141 | # plt.title("Recovered reward") 142 | 143 | # with open('thefile.csv', 'rb') as f: 144 | # data = list(csv.reader(f)) 145 | 146 | 147 | plt.subplot(1, 2, 1) 148 | plt.pcolor(ground_opt_r.reshape((4, 2))) 149 | plt.colorbar() 150 | plt.title("Groundtruth reward") 151 | plt.subplot(1, 2, 2) 152 | plt.pcolor(o_reward.reshape((4, 2))) 153 | plt.colorbar() 154 | plt.title("Recovered reward") 155 | plt.show() 156 | 157 | if __name__ == '__main__': 158 | main(11, 0.01, 20, 200, 0.01) 159 | -------------------------------------------------------------------------------- /options-using-q/optionsUsing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | 5 | Sx = 13 6 | Sy = 8 7 | S = Sx*Sy 8 | P = 5 # there is a state for being in the taxi 9 | G = 4 10 | R = 0 11 | O = 4 12 | maxR = -999999 13 | hallways = [80, 45, 100] 14 | rooms = [[ 15 | 14, 15, 16, 17, 18, 16 | 27, 28, 29, 30, 31, 17 | 40, 41, 42, 43, 44, 45, 18 | 53, 54, 55, 56, 57, 19 | 66, 67, 68, 69, 70, 20 | 80 21 | ], 22 | [ 23 | 20, 21, 22, 23, 24, 24 | 33, 34, 35, 36, 37, 25 | 45, 46, 47, 48, 49, 50, 26 | 59, 60, 61, 62, 63, 27 | 72, 73, 74, 75, 76, 28 | 85, 86, 87, 88, 89, 29 | 100 30 | ]] 31 | 32 | walls = [[ 33 | 0, 1, 2, 3, 4, 5, 6, 34 | 13, 26, 39, 52, 65, 35 | 78, 79, 81, 82, 83, 36 | 84, 71, 58, 32, 19, 37 | 93 38 | ], 39 | [ 40 | 7, 8, 9, 10, 11, 12, 41 | 25, 38, 51, 64, 77, 90, 103, 42 | 98, 99, 101, 102, 103, 43 | 97, 84, 71, 58, 32, 19, 6, 44 | 113 45 | ]] 46 | 47 | # init_states = [0, 0, 1, 1, 2, 2, 3, 3] 48 | pickUps = [80, 45, 45, 100] 49 | A = 6 50 | T = 3000 51 | stepNo = 0 52 | avg_reward = np.zeros([Sx, Sy, P, G]) 53 | reward = np.zeros([P, G]) 54 | avg = np.zeros([T,1]) 55 | time_course = np.zeros([T, 3]) 56 | option = 2 57 | option_goal = [45, 80, 100, 45] 58 | room_no = [0, 0, 1, 1] 59 | Q = 0.1*np.random.rand(S, O, A, P, G) 60 | for i in xrange(S): 61 | for o in xrange(O): 62 | for a in xrange(A): 63 | for p in xrange(P): 64 | for g in xrange(G): 65 | if i not in rooms[room_no[o]]: 66 | Q[i, 
o, a, p, g] = 0 67 | 68 | V = [np.max(Q[:, o, :, :, :], axis=1) for o in xrange(O)] 69 | eta = 0.1 70 | gamma = 0.9 71 | epsilon = 0.1 72 | reward_course = np.zeros([T, 1]) 73 | reward_mean = np.zeros([T, 1]) 74 | 75 | stepsToGoal = np.zeros([T, 1]) 76 | maxV = -9999 77 | for option in [0, 2]: 78 | for t in xrange(T): 79 | plocation = pickUps[option] 80 | pID = [i for i, x in enumerate(pickUps) if x == plocation][0] 81 | if pID == 1: 82 | pID = 2 # fix logic issue 83 | Goal = option_goal[option] 84 | p0 = plocation 85 | gID = [i for i, x in enumerate(pickUps) if x == Goal][0] 86 | s0 = np.random.choice([state for state in xrange(S) if state in rooms[room_no[option]]]) 87 | state = [s0, pID, gID] #[{1..25} {1..5} {1..4}] 88 | for u in xrange(S**2): 89 | if (stepNo > 30): 90 | stepNo = 0 91 | break 92 | 93 | r = 0 94 | [V[option][s0, pID, gID], a0] = [np.max(Q[s0, option, :, pID, gID]), np.argmax(Q[s0, option, :, pID, gID])] 95 | if (np.random.rand(1) < epsilon): 96 | a0 = np.random.choice(A) 97 | 98 | 99 | if a0 == 4: 100 | if pID != 4: 101 | if s0 == pickUps[pID]: 102 | r = 1 103 | pID = 4 104 | stepNo = 0 105 | else: 106 | r = -1 107 | else: 108 | r = -1 109 | 110 | 111 | if a0 == 5: 112 | if (s0 == pickUps[gID]) and pID==4: 113 | stepsToGoal[t] = stepNo 114 | r = 10/float(stepNo) 115 | if maxR < r: 116 | maxR = r 117 | 118 | stepNo = 0 119 | else: 120 | r = -1 121 | 122 | 123 | if a0 == 0: 124 | s1 = s0 - Sx 125 | if s1 not in rooms[room_no[option]]: 126 | s1 = s1 + Sx 127 | r = -1 128 | 129 | 130 | if a0 == 1: 131 | s1 = s0 + Sx 132 | if s1 not in rooms[room_no[option]]: 133 | s1 = s1 - Sx 134 | r = -1 135 | 136 | 137 | if a0 == 2: 138 | s1 = s0 - 1 139 | if s1 not in rooms[room_no[option]]: 140 | s1 = s1 + 1 141 | r = -1 142 | 143 | 144 | if a0 == 3: 145 | s1 = s0 + 1 146 | if s1 not in rooms[room_no[option]]: 147 | s1 = s1 - 1 148 | r = -1 149 | 150 | if a0 == 4: 151 | s1 = s0 152 | 153 | 154 | if a0 == 5: 155 | s1 = s0 156 | 157 | 158 | # learning step 159 | if t > 1000: 160 | R += r 161 | 162 | 163 | # print r 164 | FullR = R + r 165 | reward_course[t] = r 166 | reward_mean[t] = R/float(t+1) 167 | 168 | 169 | V[option][s1, pID, gID] = np.max(Q[s1, option, :, pID, gID]) 170 | 171 | 172 | if maxV < V[option][s1, pID, gID]: 173 | maxV = V[option][s1, pID, gID] 174 | 175 | 176 | time_course[t, 0] = V[option][s1, pID, gID] 177 | time_course[t, 1] = eta*(r+gamma*V[option][s1, pID, gID]) 178 | time_course[t, 2] = (1-eta)*Q[s0, option, a0, pID, gID] 179 | Q[s0, option, a0, pID, gID] = (1-eta)*Q[s0, option, a0, pID, gID] + \ 180 | eta*(r + gamma*V[option][s1, pID, gID]) 181 | if pID == 4: 182 | stepNo += 1 183 | 184 | 185 | if (s0 == pickUps[gID]) and (a0 == 5): 186 | stepNo = 0 187 | break 188 | 189 | 190 | s0 = s1 191 | 192 | 193 | avg[t] = np.mean(np.mean(np.mean(np.mean(Q[:, option, :])))) 194 | 195 | 196 | meanR = R/float(T-1000) 197 | fullMR = FullR/float(T) 198 | print meanR 199 | print fullMR 200 | print maxV 201 | policy = [np.max(Q[i, option, :, gID]) for i in xrange(S)] 202 | # value_0 = [V[0][state, 1] for state in xrange(S)] 203 | # value_2 = [V[1][state, 3] for state in xrange(S)] 204 | # visited_states = np.reshape(visited_states, [8, 13]) 205 | policy_actions_0 = [np.argmax(Q[i, 0, :, 4, 1]) for i in xrange(S)] 206 | policy_actions_2 = [np.argmax(Q[i, 2, :, 4, 3]) for i in xrange(S)] 207 | print len(policy) 208 | policy_actions_0 = np.reshape(policy_actions_0, [8, 13]) 209 | policy_actions_2 = np.reshape(policy_actions_2, [8, 13]) 210 | # value_0 = np.reshape(value_0, [8, 
13]) 211 | # value_2 = np.reshape(value_2, [8, 13]) 212 | policy = np.reshape(policy, [8, 13]) 213 | 214 | for j in xrange(8): 215 | for i in xrange(13): 216 | print "%d " % int(policy_actions_0[j, i]), 217 | 218 | print " " 219 | print "-------------------------------------" 220 | for j in xrange(8): 221 | for i in xrange(13): 222 | print "%d " % int(policy_actions_2[j, i]), 223 | 224 | print " " 225 | -------------------------------------------------------------------------------- /options-using-q/optionsUsing-nopid.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import math 4 | 5 | 6 | Sx = 13 7 | Sy = 8 8 | S = Sx*Sy 9 | # P = 5 # there is a state for being in the taxi 10 | G = 4 11 | R = 0 12 | O = 4 13 | maxR = -999999 14 | hallways = [80, 45, 100] 15 | rooms = [[ 16 | 14, 15, 16, 17, 18, 17 | 27, 28, 29, 30, 31, 18 | 40, 41, 42, 43, 44, 45, 19 | 53, 54, 55, 56, 57, 20 | 66, 67, 68, 69, 70, 21 | 80 22 | ], 23 | [ 24 | 20, 21, 22, 23, 24, 25 | 33, 34, 35, 36, 37, 26 | 45, 46, 47, 48, 49, 50, 27 | 59, 60, 61, 62, 63, 28 | 72, 73, 74, 75, 76, 29 | 85, 86, 87, 88, 89, 30 | 100 31 | ]] 32 | 33 | walls = [[ 34 | 0, 1, 2, 3, 4, 5, 6, 35 | 13, 26, 39, 52, 65, 36 | 78, 79, 81, 82, 83, 37 | 84, 71, 58, 32, 19, 38 | 93 39 | ], 40 | [ 41 | 7, 8, 9, 10, 11, 12, 42 | 25, 38, 51, 64, 77, 90, 103, 43 | 98, 99, 101, 102, 103, 44 | 97, 84, 71, 58, 32, 19, 6, 45 | 113 46 | ]] 47 | visited_states = ['r' for _ in xrange(S)] 48 | # init_states = [0, 0, 1, 1, 2, 2, 3, 3] 49 | pickUps = [80, 45, 45, 100] 50 | A = 4#6 51 | T = 30000 52 | stepNo = 0 53 | avg = np.zeros([T,1]) 54 | time_course = np.zeros([T, 3]) 55 | options_used = [] 56 | option = 0 57 | options_used.append(option) 58 | option_goal = [45, 80, 100, 45] 59 | endGoal = option_goal[2] 60 | room_no = [0, 0, 1, 1] 61 | Q = 0.1*np.random.rand(S, O, A, G)#0.1*np.random.rand(S, O, A, P, G) 62 | goalReached = False 63 | for i in xrange(S): 64 | for o in xrange(O): 65 | for a in xrange(A): 66 | # for p in xrange(P): 67 | for g in xrange(G): 68 | if i not in rooms[room_no[o]]: 69 | # Q[i, o, a, p, g] = 0 70 | Q[i, o, a, g] = 0 71 | 72 | V = [np.max(Q[:, o, :], axis=2) for o in xrange(O)] 73 | eta = 0.1 74 | gamma = 0.9 75 | epsilon = 0.1 76 | reward_course = np.zeros([T, 1]) 77 | reward_mean = np.zeros([T, 1]) 78 | 79 | stepsToGoal = np.zeros([T, 1]) 80 | maxV = -9999 81 | switched = False 82 | u=0 83 | for t in xrange(T): 84 | Goal = option_goal[option] 85 | gID = [i for i, x in enumerate(pickUps) if x == Goal][0] 86 | s0 = np.random.choice([state for state in xrange(S) if state in rooms[room_no[option]] and state != option_goal[option]]) 87 | 88 | state = [s0, gID] #[s0, pID, gID] #[{1..25} {1..5} {1..4}] 89 | for u in xrange(S**2): 90 | # print s0 91 | visited_states[s0] = 'g' 92 | 93 | r = 0 94 | [V[option][s0, gID], a0] = [np.max(Q[s0, option, :, gID]), np.argmax(Q[s0, option, :, gID])] 95 | if (np.random.rand(1) < epsilon): 96 | a0 = np.random.choice(A) 97 | 98 | if (s0 == pickUps[gID]): 99 | stepsToGoal[t] = stepNo 100 | if s0 == endGoal: 101 | r = 1 102 | goalReached = True 103 | else: 104 | r = 1 105 | goalReached = True 106 | 107 | if maxR < r: 108 | maxR = r 109 | 110 | stepNo = 0 111 | 112 | if a0 == 0 and not goalReached: 113 | s1 = s0 - Sx 114 | if s1 not in rooms[room_no[option]]: 115 | s1 = s1 + Sx 116 | # r = -1 117 | 118 | 119 | if a0 == 1 and not goalReached: 120 | s1 = s0 + Sx 121 | if s1 not in rooms[room_no[option]]: 122 | s1 = s1 - Sx 123 | # r = -1 
124 | 125 | 126 | if a0 == 2 and not goalReached: 127 | s1 = s0 - 1 128 | if s1 not in rooms[room_no[option]]: 129 | s1 = s1 + 1 130 | # r = -1 131 | 132 | 133 | if a0 == 3 and not goalReached: 134 | s1 = s0 + 1 135 | if s1 not in rooms[room_no[option]]: 136 | s1 = s1 - 1 137 | # r = -1 138 | 139 | # learning step 140 | if t > 100: 141 | R += r 142 | 143 | # print r 144 | FullR = R + r 145 | reward_course[t] = r 146 | reward_mean[t] = R/float(t+1) 147 | 148 | V[option][s1, gID] = np.max(Q[s1, option, :, gID]) 149 | 150 | if maxV < V[option][s1, gID]: 151 | maxV = V[option][s1, gID] 152 | 153 | time_course[t, 0] = V[option][s1, gID] 154 | time_course[t, 1] = eta*(r+gamma*V[option][s1, gID]) 155 | time_course[t, 2] = (1-eta)*Q[s0, option, a0, gID] 156 | Q[s0, option, a0, gID] = (1-eta)*Q[s0, option, a0, gID] + \ 157 | eta*(r + gamma*V[option][s1, gID]) 158 | 159 | stepNo += 1 160 | if (s0 == endGoal): 161 | stepNo = 0 162 | option = 0 163 | switched = False 164 | goalReached = False 165 | break 166 | 167 | if (s0 == pickUps[gID]) and s0 != endGoal: 168 | stepNo = 0 169 | option = 2 170 | switched = True 171 | goalReached = False 172 | options_used.append(option) 173 | break 174 | 175 | s0 = s1 176 | 177 | avg[t] = np.mean(np.mean(np.mean(np.mean(Q[:, option, :])))) 178 | 179 | 180 | meanR = R/float(T-1000) 181 | fullMR = FullR/float(T) 182 | print meanR 183 | print fullMR 184 | print maxV 185 | # policy = [np.max(Q[i, option, :, pID, gID]) for i in xrange(S)] 186 | # policy_actions_0 = [np.argmax(Q[i, 0, :, pID, gID]) for i in xrange(S)] 187 | # policy_actions_2 = [np.argmax(Q[i, 2, :, pID, gID]) for i in xrange(S)] 188 | policy = [np.max(Q[i, option, :, gID]) for i in xrange(S)] 189 | value_0 = [V[0][state, 1] for state in xrange(S)] 190 | value_2 = [V[1][state, 3] for state in xrange(S)] 191 | visited_states = np.reshape(visited_states, [8, 13]) 192 | policy_actions_0 = [np.argmax(Q[i, 0, :, 1]) for i in xrange(S)] 193 | policy_actions_2 = [np.argmax(Q[i, 2, :, 3]) for i in xrange(S)] 194 | print len(policy) 195 | policy_actions_0 = np.reshape(policy_actions_0, [8, 13]) 196 | policy_actions_2 = np.reshape(policy_actions_2, [8, 13]) 197 | value_0 = np.reshape(value_0, [8, 13]) 198 | value_2 = np.reshape(value_2, [8, 13]) 199 | policy = np.reshape(policy, [8, 13]) 200 | 201 | for j in xrange(8): 202 | for i in xrange(13): 203 | print "%d " % int(policy_actions_0[j, i]), 204 | 205 | print " " 206 | print "-------------------------------------" 207 | for j in xrange(8): 208 | for i in xrange(13): 209 | print "%d " % int(policy_actions_2[j, i]), 210 | 211 | print " " 212 | print "-------------------------------------" 213 | for j in xrange(8): 214 | for i in xrange(13): 215 | print "%s " % visited_states[j, i], 216 | 217 | print " " 218 | # for j in xrange(8): 219 | # for i in xrange(13): 220 | # print "{0} ".format(int(policy[j, i])), 221 | 222 | # print " " -------------------------------------------------------------------------------- /options-using-q/basicOption-tworooms.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | 5 | Sx = 13 6 | Sy = 8 7 | S = Sx*Sy 8 | P = 5 # there is a state for being in the taxi 9 | G = 4 10 | R = 0 11 | O = 4 12 | maxR = -999999 13 | hallways = [80, 45, 100] 14 | rooms = [[ 15 | 14, 15, 16, 17, 18, 16 | 27, 28, 29, 30, 31, 17 | 40, 41, 42, 43, 44, 45, 18 | 53, 54, 55, 56, 57, 19 | 66, 67, 68, 69, 70, 20 | 80 21 | ], 22 | [ 23 | 20, 21, 22, 23, 24, 24 | 33, 34, 35, 36, 37, 25 | 
45, 46, 47, 48, 49, 50, 26 | 59, 60, 61, 62, 63, 27 | 72, 73, 74, 75, 76, 28 | 85, 86, 87, 88, 89, 29 | 100 30 | ]] 31 | 32 | walls = [[ 33 | 0, 1, 2, 3, 4, 5, 6, 34 | 13, 26, 39, 52, 65, 35 | 78, 79, 81, 82, 83, 36 | 84, 71, 58, 32, 19, 37 | 93 38 | ], 39 | [ 40 | 7, 8, 9, 10, 11, 12, 41 | 25, 38, 51, 64, 77, 90, 103, 42 | 98, 99, 101, 102, 103, 43 | 97, 84, 71, 58, 32, 19, 6, 44 | 113 45 | ]] 46 | 47 | # init_states = [0, 0, 1, 1, 2, 2, 3, 3] 48 | pickUps = [80, 45, 45, 100] 49 | A = 6 50 | T = 30000 51 | stepNo = 0 52 | avg_reward = np.zeros([Sx, Sy, P, G]) 53 | reward = np.zeros([P, G]) 54 | avg = np.zeros([T,1]) 55 | time_course = np.zeros([T, 3]) 56 | options_used = [] 57 | option = 0 58 | options_used.append(option) 59 | option_goal = [45, 80, 100, 45] 60 | endGoal = option_goal[2] 61 | room_no = [0, 0, 1, 1] 62 | Q = 0.1*np.random.rand(S, O, A, P, G) 63 | for i in xrange(S): 64 | for o in xrange(O): 65 | for a in xrange(A): 66 | for p in xrange(P): 67 | for g in xrange(G): 68 | if i not in rooms[room_no[o]]: 69 | Q[i, o, a, p, g] = 0 70 | 71 | V = [np.max(Q[:, o, :], axis=2) for o in xrange(O)] 72 | eta = 0.1 73 | gamma = 0.9 74 | epsilon = 0.1 75 | reward_course = np.zeros([T, 1]) 76 | reward_mean = np.zeros([T, 1]) 77 | 78 | stepsToGoal = np.zeros([T, 1]) 79 | maxV = -9999 80 | switched = False 81 | for t in xrange(T): 82 | Goal = option_goal[option] 83 | gID = [i for i, x in enumerate(pickUps) if x == Goal][0] 84 | if not switched: 85 | plocation = pickUps[option] 86 | p0 = plocation 87 | pID = [i for i, x in enumerate(pickUps) if x == plocation][0] 88 | s0 = np.random.choice([state for state in xrange(S) if state in rooms[room_no[option]]]) 89 | 90 | state = [s0, pID, gID] #[{1..25} {1..5} {1..4}] 91 | for u in xrange(S**2): 92 | 93 | if (stepNo > 30): 94 | stepNo = 0 95 | break 96 | 97 | r = 0 98 | [V[option][s0, pID, gID], a0] = [np.max(Q[s0, option, :, pID, gID]), np.argmax(Q[s0, option, :, pID, gID])] 99 | if (np.random.rand(1) < epsilon): 100 | a0 = np.random.choice(A) 101 | 102 | if a0 == 4: 103 | if pID != 4: 104 | if s0 == pickUps[pID]: 105 | r = 1 106 | pID = 4 107 | stepNo = 0 108 | else: 109 | r = -1 110 | else: 111 | r = -1 112 | 113 | 114 | if a0 == 5: 115 | if (s0 == pickUps[gID]) and pID==4: 116 | stepsToGoal[t] = stepNo 117 | if stepNo > 0: 118 | r = 10 119 | else: 120 | r = 10 121 | if maxR < r: 122 | maxR = r 123 | 124 | stepNo = 0 125 | else: 126 | r = -1 127 | 128 | 129 | if a0 == 0: 130 | s1 = s0 - Sx 131 | if s1 not in rooms[room_no[option]]: 132 | s1 = s1 + Sx 133 | r = -1 134 | 135 | 136 | if a0 == 1: 137 | s1 = s0 + Sx 138 | if s1 not in rooms[room_no[option]]: 139 | s1 = s1 - Sx 140 | r = -1 141 | 142 | 143 | if a0 == 2: 144 | s1 = s0 - 1 145 | if s1 not in rooms[room_no[option]]: 146 | s1 = s1 + 1 147 | r = -1 148 | 149 | 150 | if a0 == 3: 151 | s1 = s0 + 1 152 | if s1 not in rooms[room_no[option]]: 153 | s1 = s1 - 1 154 | r = -1 155 | 156 | if a0 == 4: 157 | s1 = s0 158 | 159 | 160 | if a0 == 5: 161 | s1 = s0 162 | 163 | 164 | # learning step 165 | if t > 1000: 166 | R += r 167 | 168 | 169 | # print r 170 | FullR = R + r 171 | reward_course[t] = r 172 | reward_mean[t] = R/float(t+1) 173 | 174 | 175 | V[option][s1, pID, gID] = np.max(Q[s1, option, :, pID, gID]) 176 | 177 | 178 | if maxV < V[option][s1, pID, gID]: 179 | maxV = V[option][s1, pID, gID] 180 | 181 | 182 | time_course[t, 0] = V[option][s1, pID, gID] 183 | time_course[t, 1] = eta*(r+gamma*V[option][s1, pID, gID]) 184 | time_course[t, 2] = (1-eta)*Q[s0, option, a0, pID, gID] 185 | Q[s0, 
option, a0, pID, gID] = (1-eta)*Q[s0, option, a0, pID, gID] + \ 186 | eta*(r + gamma*V[option][s1, pID, gID]) 187 | if pID == 4: 188 | stepNo += 1 189 | 190 | if (s0 == endGoal) and (a0 == 5) and (pID == 4): 191 | stepNo = 0 192 | option = 0 193 | switched = False 194 | # print "Final Goal achieved!!" 195 | break 196 | 197 | if (s0 == pickUps[gID]) and (a0 == 5) and (pID == 4) and s0 != endGoal: 198 | stepNo = 0 199 | # print "---> ", 200 | # print option 201 | option = 2 202 | switched = True 203 | options_used.append(option) 204 | break 205 | 206 | s0 = s1 207 | 208 | 209 | avg[t] = np.mean(np.mean(np.mean(np.mean(Q[:, option, :])))) 210 | 211 | 212 | meanR = R/float(T-1000) 213 | fullMR = FullR/float(T) 214 | print meanR 215 | print fullMR 216 | print maxV 217 | policy = [np.max(Q[i, option, :, pID, gID]) for i in xrange(S)] 218 | policy_actions_0 = [np.argmax(Q[i, 0, :, pID, gID]) for i in xrange(S)] 219 | policy_actions_2 = [np.argmax(Q[i, 2, :, pID, gID]) for i in xrange(S)] 220 | print len(policy) 221 | policy_actions_0 = np.reshape(policy_actions_0, [8, 13]) 222 | policy_actions_2 = np.reshape(policy_actions_2, [8, 13]) 223 | policy = np.reshape(policy, [8, 13]) 224 | 225 | for j in xrange(8): 226 | for i in xrange(13): 227 | print "{0} ".format(policy_actions_0[j, i]), 228 | 229 | print " " 230 | print "-------------------------------------" 231 | for j in xrange(8): 232 | for i in xrange(13): 233 | print "{0} ".format(policy_actions_2[j, i]), 234 | 235 | print " " 236 | # for j in xrange(8): 237 | # for i in xrange(13): 238 | # print "{0} ".format(int(policy[j, i])), 239 | 240 | # print " " -------------------------------------------------------------------------------- /hierarchicalrl/optionsUsing-nopid.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import math 4 | 5 | 6 | Sx = 13 7 | Sy = 8 8 | S = Sx*Sy 9 | # P = 5 # there is a state for being in the taxi 10 | G = 4 11 | R = 0 12 | O = 8 13 | maxR = -999999 14 | hallways = [80, 45, 100, 136] 15 | rooms = [[ 16 | 14, 15, 16, 17, 18, 17 | 27, 28, 29, 30, 31, 18 | 40, 41, 42, 43, 44, 45, 19 | 53, 54, 55, 56, 57, 20 | 66, 67, 68, 69, 70, 21 | 80 22 | ], 23 | [ 24 | 20, 21, 22, 23, 24, 25 | 33, 34, 35, 36, 37, 26 | 45, 46, 47, 48, 49, 50, 27 | 59, 60, 61, 62, 63, 28 | 72, 73, 74, 75, 76, 29 | 85, 86, 87, 88, 89, 30 | 100 31 | ], 32 | [ 33 | 111, 112, 113, 114, 115, 34 | 124, 125, 126, 127, 128, 35 | 137, 138, 139, 140, 141, 36 | 150, 151, 152, 153, 154, 37 | 100, 136 38 | ], 39 | [ 40 | 92, 93, 94, 95, 96, 41 | 105, 106, 107, 108, 109, 42 | 118, 119, 120, 121, 122, 43 | 131, 132, 133, 134, 135, 44 | 144, 145, 146, 147, 148, 45 | 80, 136 46 | ]] 47 | 48 | walls = [[ 49 | 0, 1, 2, 3, 4, 5, 6, 50 | 13, 26, 39, 52, 65, 51 | 78, 79, 81, 82, 83, 52 | 84, 71, 58, 32, 19, 53 | 93 54 | ], 55 | [ 56 | 7, 8, 9, 10, 11, 12, 57 | 25, 38, 51, 64, 77, 90, 103, 58 | 98, 99, 101, 102, 103, 59 | 97, 84, 71, 58, 32, 19, 6, 60 | 113 61 | ], 62 | [ 63 | 103, 116, 129, 142, 155, 168, 97, 64 | 110, 123, 149, 65 | 162, 98, 99, 101, 102, 66 | 163, 164, 165, 166, 167 67 | ], 68 | [ 69 | 78, 70 | 91, 71 | 104, 72 | 117, 73 | 130, 74 | 143, 75 | 156, 76 | 84, 77 | 97, 78 | 110, 79 | 123, 80 | 149, 157, 158, 159, 160, 161, 81 | 162, 79, 81, 82, 83 82 | ]] 83 | visited_states = ['r' for _ in xrange(S)] 84 | # init_states = [0, 0, 1, 1, 2, 2, 3, 3] 85 | pickUps = [80, 45, 100, 136] 86 | A = 4#6 87 | T = 30000 88 | stepNo = 0 89 | avg = np.zeros([T,1]) 90 | time_course = 
np.zeros([T, 3]) 91 | options_used = [] 92 | option = 3 93 | options_used.append(option) 94 | option_goal = [45, 80, 100, 136] 95 | endGoal = option_goal[option]#[2] 96 | room_no = [0, 0, 1, 1, 2, 2, 3, 3] 97 | Q = 0.1*np.random.rand(S, O, A, G)#0.1*np.random.rand(S, O, A, P, G) 98 | goalReached = False 99 | for i in xrange(S): 100 | for o in xrange(O): 101 | for a in xrange(A): 102 | # for p in xrange(P): 103 | for g in xrange(G): 104 | if i not in rooms[room_no[o]]: 105 | # Q[i, o, a, p, g] = 0 106 | Q[i, o, a, g] = 0 107 | 108 | V = [np.max(Q[:, o, :], axis=2) for o in xrange(O)] 109 | eta = 0.1 110 | gamma = 0.9 111 | epsilon = 0.1 112 | reward_course = np.zeros([T, 1]) 113 | reward_mean = np.zeros([T, 1]) 114 | 115 | stepsToGoal = np.zeros([T, 1]) 116 | maxV = -9999 117 | switched = False 118 | u=0 119 | for t in xrange(T): 120 | Goal = option_goal[option] 121 | gID = [i for i, x in enumerate(pickUps) if x == Goal][0] 122 | s0 = np.random.choice([state for state in xrange(S) if state in rooms[room_no[option]] and state != option_goal[option]]) 123 | 124 | state = [s0, gID] #[s0, pID, gID] #[{1..25} {1..5} {1..4}] 125 | for u in xrange(S**2): 126 | # print s0 127 | visited_states[s0] = 'g' 128 | 129 | r = 0 130 | [V[option][s0, gID], a0] = [np.max(Q[s0, option, :, gID]), np.argmax(Q[s0, option, :, gID])] 131 | if (np.random.rand(1) < epsilon): 132 | a0 = np.random.choice(A) 133 | 134 | if (s0 == pickUps[gID]): 135 | stepsToGoal[t] = stepNo 136 | if s0 == endGoal: 137 | r = 1 138 | goalReached = True 139 | else: 140 | r = 1 141 | goalReached = True 142 | 143 | if maxR < r: 144 | maxR = r 145 | 146 | stepNo = 0 147 | 148 | if a0 == 0 and not goalReached: 149 | s1 = s0 - Sx 150 | if s1 not in rooms[room_no[option]]: 151 | s1 = s1 + Sx 152 | # r = -1 153 | 154 | 155 | if a0 == 1 and not goalReached: 156 | s1 = s0 + Sx 157 | if s1 not in rooms[room_no[option]]: 158 | s1 = s1 - Sx 159 | # r = -1 160 | 161 | 162 | if a0 == 2 and not goalReached: 163 | s1 = s0 - 1 164 | if s1 not in rooms[room_no[option]]: 165 | s1 = s1 + 1 166 | # r = -1 167 | 168 | 169 | if a0 == 3 and not goalReached: 170 | s1 = s0 + 1 171 | if s1 not in rooms[room_no[option]]: 172 | s1 = s1 - 1 173 | # r = -1 174 | 175 | # learning step 176 | if t > 100: 177 | R += r 178 | 179 | # print r 180 | FullR = R + r 181 | reward_course[t] = r 182 | reward_mean[t] = R/float(t+1) 183 | 184 | V[option][s1, gID] = np.max(Q[s1, option, :, gID]) 185 | 186 | if maxV < V[option][s1, gID]: 187 | maxV = V[option][s1, gID] 188 | 189 | time_course[t, 0] = V[option][s1, gID] 190 | time_course[t, 1] = eta*(r+gamma*V[option][s1, gID]) 191 | time_course[t, 2] = (1-eta)*Q[s0, option, a0, gID] 192 | Q[s0, option, a0, gID] = (1-eta)*Q[s0, option, a0, gID] + \ 193 | eta*(r + gamma*V[option][s1, gID]) 194 | 195 | stepNo += 1 196 | if (s0 == endGoal): 197 | # print "tuk sam" 198 | stepNo = 0 199 | # option = 2 200 | switched = False 201 | goalReached = False 202 | break 203 | 204 | # if (s0 == pickUps[gID]) and s0 != endGoal: 205 | # stepNo = 0 206 | # option = 0#2 207 | # switched = True 208 | # goalReached = False 209 | # options_used.append(option) 210 | # break 211 | 212 | s0 = s1 213 | 214 | avg[t] = np.mean(np.mean(np.mean(np.mean(Q[:, option, :])))) 215 | 216 | 217 | meanR = R/float(T-1000) 218 | fullMR = FullR/float(T) 219 | print meanR 220 | print fullMR 221 | print maxV 222 | # policy = [np.max(Q[i, option, :, pID, gID]) for i in xrange(S)] 223 | # policy_actions_0 = [np.argmax(Q[i, 0, :, pID, gID]) for i in xrange(S)] 224 | # 
policy_actions_2 = [np.argmax(Q[i, 2, :, pID, gID]) for i in xrange(S)] 225 | policy = [np.max(Q[i, option, :, gID]) for i in xrange(S)] 226 | value_0 = [V[0][state, 1] for state in xrange(S)] 227 | value_2 = [V[1][state, 3] for state in xrange(S)] 228 | visited_states = np.reshape(visited_states, [8, 13]) 229 | policy_actions_0 = [np.argmax(Q[i, 2, :, 0]) for i in xrange(S)] 230 | policy_actions_2 = [np.argmax(Q[i, option, :, gID]) for i in xrange(S)] 231 | print len(policy) 232 | policy_actions_0 = np.reshape(policy_actions_0, [8, 13]) 233 | policy_actions_2 = np.reshape(policy_actions_2, [8, 13]) 234 | value_0 = np.reshape(value_0, [8, 13]) 235 | value_2 = np.reshape(value_2, [8, 13]) 236 | policy = np.reshape(policy, [8, 13]) 237 | 238 | for j in xrange(8): 239 | for i in xrange(13): 240 | print "%d " % int(policy_actions_0[j, i]), 241 | 242 | print " " 243 | print "-------------------------------------" 244 | for j in xrange(8): 245 | for i in xrange(13): 246 | print "%d " % int(policy_actions_2[j, i]), 247 | 248 | print " " 249 | print "-------------------------------------" 250 | for j in xrange(8): 251 | for i in xrange(13): 252 | print "%s " % visited_states[j, i], 253 | 254 | print " " 255 | # for j in xrange(8): 256 | # for i in xrange(13): 257 | # print "{0} ".format(int(policy[j, i])), 258 | 259 | # print " " -------------------------------------------------------------------------------- /irl/mdp/objectworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements the objectworld MDP described in Levine et al. 2011. 3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | import math 9 | from itertools import product 10 | 11 | import numpy as np 12 | import numpy.random as rn 13 | 14 | from irl.mdp.gridworld import Gridworld 15 | 16 | class OWObject(object): 17 | """ 18 | Object in objectworld. 19 | """ 20 | 21 | def __init__(self, inner_colour, outer_colour): 22 | """ 23 | inner_colour: Inner colour of object. int. 24 | outer_colour: Outer colour of object. int. 25 | -> OWObject 26 | """ 27 | 28 | self.inner_colour = inner_colour 29 | self.outer_colour = outer_colour 30 | 31 | def __str__(self): 32 | """ 33 | A string representation of this object. 34 | 35 | -> __str__ 36 | """ 37 | 38 | return "".format(self.inner_colour, 39 | self.outer_colour) 40 | 41 | class Objectworld(Gridworld): 42 | """ 43 | Objectworld MDP. 44 | """ 45 | 46 | def __init__(self, grid_size, n_objects, n_colours, wind, discount): 47 | """ 48 | grid_size: Grid size. int. 49 | n_objects: Number of objects in the world. int. 50 | n_colours: Number of colours to colour objects with. int. 51 | wind: Chance of moving randomly. float. 52 | discount: MDP discount. float. 53 | -> Objectworld 54 | """ 55 | 56 | super(Objectworld, self).__init__(grid_size, wind, discount) 57 | 58 | self.actions = ((1, 0), (0, 1), (-1, 0), (0, -1), (0, 0)) 59 | self.n_actions = len(self.actions) 60 | self.n_objects = n_objects 61 | self.n_colours = n_colours 62 | 63 | # Generate objects. 64 | self.objects = {} 65 | for _ in range(self.n_objects): 66 | obj = OWObject(rn.randint(self.n_colours), 67 | rn.randint(self.n_colours)) 68 | 69 | while True: 70 | x = rn.randint(self.grid_size) 71 | y = rn.randint(self.grid_size) 72 | 73 | if (x, y) not in self.objects: 74 | break 75 | 76 | self.objects[x, y] = obj 77 | 78 | # Preconstruct the transition probability array. 
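# The array built below has shape (n_states, n_actions, n_states); entry
# [i, j, k] is p(s_k | s_i, a_j), computed with the _transition_probability
# method inherited from Gridworld, here over the extended five-action set
# that includes the "stay" action (0, 0).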
79 | self.transition_probability = np.array( 80 | [[[self._transition_probability(i, j, k) 81 | for k in range(self.n_states)] 82 | for j in range(self.n_actions)] 83 | for i in range(self.n_states)]) 84 | 85 | def feature_vector(self, i, discrete=True): 86 | """ 87 | Get the feature vector associated with a state integer. 88 | 89 | i: State int. 90 | discrete: Whether the feature vectors should be discrete (default True). 91 | bool. 92 | -> Feature vector. 93 | """ 94 | 95 | sx, sy = self.int_to_point(i) 96 | 97 | nearest_inner = {} # colour: distance 98 | nearest_outer = {} # colour: distance 99 | 100 | for y in range(self.grid_size): 101 | for x in range(self.grid_size): 102 | if (x, y) in self.objects: 103 | dist = math.hypot((x - sx), (y - sy)) 104 | obj = self.objects[x, y] 105 | if obj.inner_colour in nearest_inner: 106 | if dist < nearest_inner[obj.inner_colour]: 107 | nearest_inner[obj.inner_colour] = dist 108 | else: 109 | nearest_inner[obj.inner_colour] = dist 110 | if obj.outer_colour in nearest_outer: 111 | if dist < nearest_outer[obj.outer_colour]: 112 | nearest_outer[obj.outer_colour] = dist 113 | else: 114 | nearest_outer[obj.outer_colour] = dist 115 | 116 | # Need to ensure that all colours are represented. 117 | for c in range(self.n_colours): 118 | if c not in nearest_inner: 119 | nearest_inner[c] = 0 120 | if c not in nearest_outer: 121 | nearest_outer[c] = 0 122 | 123 | if discrete: 124 | state = np.zeros((2*self.n_colours*self.grid_size,)) 125 | i = 0 126 | for c in range(self.n_colours): 127 | for d in range(1, self.grid_size+1): 128 | if nearest_inner[c] < d: 129 | state[i] = 1 130 | i += 1 131 | if nearest_outer[c] < d: 132 | state[i] = 1 133 | i += 1 134 | assert i == 2*self.n_colours*self.grid_size 135 | assert (state >= 0).all() 136 | else: 137 | # Continuous features. 138 | state = np.zeros((2*self.n_colours)) 139 | i = 0 140 | for c in range(self.n_colours): 141 | state[i] = nearest_inner[c] 142 | i += 1 143 | state[i] = nearest_outer[c] 144 | i += 1 145 | 146 | return state 147 | 148 | def feature_matrix(self, discrete=True): 149 | """ 150 | Get the feature matrix for this objectworld. 151 | 152 | discrete: Whether the feature vectors should be discrete (default True). 153 | bool. 154 | -> NumPy array with shape (n_states, n_states). 155 | """ 156 | 157 | return np.array([self.feature_vector(i, discrete) 158 | for i in range(self.n_states)]) 159 | 160 | def reward(self, state_int): 161 | """ 162 | Get the reward for a state int. 163 | 164 | state_int: State int. 165 | -> reward float 166 | """ 167 | 168 | x, y = self.int_to_point(state_int) 169 | 170 | near_c0 = False 171 | near_c1 = False 172 | for (dx, dy) in product(range(-3, 4), range(-3, 4)): 173 | if 0 <= x + dx < self.grid_size and 0 <= y + dy < self.grid_size: 174 | if (abs(dx) + abs(dy) <= 3 and 175 | (x+dx, y+dy) in self.objects and 176 | self.objects[x+dx, y+dy].outer_colour == 0): 177 | near_c0 = True 178 | if (abs(dx) + abs(dy) <= 2 and 179 | (x+dx, y+dy) in self.objects and 180 | self.objects[x+dx, y+dy].outer_colour == 1): 181 | near_c1 = True 182 | 183 | if near_c0 and near_c1: 184 | return 1 185 | if near_c0: 186 | return -1 187 | return 0 188 | 189 | def generate_trajectories(self, n_trajectories, trajectory_length, policy): 190 | """ 191 | Generate n_trajectories trajectories with length trajectory_length. 192 | 193 | n_trajectories: Number of trajectories. int. 194 | trajectory_length: Length of an episode. int. 195 | policy: Map from state integers to action integers. 
196 | -> [[(state int, action int, reward float)]] 197 | """ 198 | 199 | return super(Objectworld, self).generate_trajectories(n_trajectories, trajectory_length, 200 | policy, 201 | True) 202 | 203 | def optimal_policy(self, state_int): 204 | raise NotImplementedError( 205 | "Optimal policy is not implemented for Objectworld.") 206 | def optimal_policy_deterministic(self, state_int): 207 | raise NotImplementedError( 208 | "Optimal policy is not implemented for Objectworld.") 209 | -------------------------------------------------------------------------------- /options-using-q/options-temp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | 5 | Sx = 5 6 | Sy = 5 7 | S = Sx*Sy 8 | P = 5 # there is a state for being in the taxi 9 | G = 4 10 | R = 0 11 | maxR = -999999 12 | maxRo = -999999 13 | pickUps = [0, Sx-1, S-Sx, S-2] 14 | O = 7 15 | A = 6 16 | T = 100000 17 | k_step = 0 18 | stepNo = 0 19 | stepNo_o = 0 20 | avg_reward = np.zeros([Sx, Sy, P, G]) 21 | reward = np.zeros([P, G]) 22 | avg = np.zeros([T,1]) 23 | time_course = np.zeros([T, 3]) 24 | Q = 0.1*np.random.rand(S, O, P, G) 25 | opt_one_policy = 0.1*np.random.rand(S, A, P, G) 26 | V = np.max(Q, axis=1) 27 | V_o = np.max(opt_one_policy, axis=1) 28 | eta = 0.1 29 | gamma = 0.9 30 | epsilon = 0.3 31 | reward_course = np.zeros([T, 1]) 32 | reward_mean = np.zeros([T, 1]) 33 | shouldBreak = False 34 | 35 | stop = True 36 | stepsToGoal = np.zeros([T, 1]) 37 | maxV = -9999 38 | maxV_o = -9999 39 | for t in xrange(T): 40 | plocation = 20 41 | pID = [i for i, x in enumerate(pickUps) if x == plocation][0] 42 | Goal = 24 43 | p0 = plocation + 1 44 | gID = [i for i, x in enumerate(pickUps) if x == Goal-1][0] 45 | s0 = np.random.choice(S) 46 | state = [s0, pID, gID] #[{1..25} {1..5} {1..4}] 47 | for u in xrange(S**4): 48 | if (stepNo > 30): 49 | stepNo = 0 50 | stepNo_o = 0 51 | break 52 | 53 | r = 0 54 | if stop: #V[s1, pID, gID] = np.max(Q[s1, :, pID, gID]) 55 | # if o0 == 6: # if previous time had the option and now time's gone 56 | # V[s1, pID, gID] = np.max(Q[s1, :, pID, gID]) 57 | [V[s0, pID, gID], o0] = [np.max(Q[s0, :, pID, gID]), np.argmax(Q[s0, :, pID, gID])] 58 | 59 | if (np.random.rand(1) < epsilon): 60 | o0 = np.random.choice(O) 61 | 62 | if o0 == 6: 63 | r_o = 0 64 | k_step = 1 65 | stop = False 66 | [V_o[s0, pID, gID], a0] = [np.max(opt_one_policy[s0, :, pID, gID]), 67 | np.argmax(opt_one_policy[s0, :, pID, gID])] 68 | else: 69 | k_step += 1 70 | [V_o[s0, pID, gID], a0] = [ 71 | np.max(opt_one_policy[s0, :, pID, gID]), np.argmax(opt_one_policy[s0, :, pID, gID])] 72 | 73 | if o0 == 4: 74 | if pID != 4: 75 | if s0 == pickUps[pID]: 76 | r = 1 77 | pID = 4 78 | stepNo = 0 79 | else: 80 | r = -1 81 | else: 82 | r = -1 83 | 84 | if o0 == 5: 85 | if (s0 == pickUps[gID]) and pID==4: 86 | stepsToGoal[t] = stepNo 87 | r = 10/float(stepNo) 88 | if maxR < r: 89 | maxR = r 90 | 91 | stepNo = 0 92 | else: 93 | r = -1 94 | 95 | if o0 == 0: 96 | s1 = s0 - Sx 97 | if s1 < 0: 98 | s1 = s1 + Sx 99 | r = -1 100 | 101 | if o0 == 1: 102 | s1 = s0 + Sx 103 | if s1 > 24: 104 | s1 = s1 - Sx 105 | r = -1 106 | 107 | if o0 == 2: 108 | s1=s0-1 109 | if s1==-1 or s1==4 or s1==9 or s1==14 or s1==19: 110 | s1=s1+1 111 | r = -1 112 | 113 | if s1==1 or s1==6 or s1==20 or s1==15 or s1==17 or s1==22: 114 | s1 = s1+1 115 | r = -1 116 | 117 | if o0 == 3: 118 | s1 = s0 + 1 119 | if s1 == 5 or s1 == 10 or s1 == 15 or s1 == 20 or s1==25: 120 | s1 = s1 - 1 121 | r = -1 122 | 123 | if s1 == 
2 or s1 == 7 or s1 == 21 or s1 == 16 or s1 == 18 or s1 == 23: 124 | s1 = s1 - 1 125 | r = -1 126 | 127 | if o0 == 6: 128 | if pID != 4: 129 | s1 = s0 130 | stop = True 131 | else: 132 | stepNo_o += 1 133 | if a0 == 4: 134 | r_o = -1 135 | 136 | if a0 == 5: 137 | if (s0 == 16): # 19 is a random state I would like my option to get to 138 | # stepsToGoal[t] = stepNo 139 | r_o = 1 140 | if maxRo < r_o: 141 | maxRo = r_o 142 | 143 | stop = True 144 | stepNo_o = 0 145 | # elif (s0 == pickUps[gID]): 146 | # # stepsToGoal[t] = stepNo 147 | # r_o = 10/float(stepNo) 148 | # if maxR_o < r_o: 149 | # maxR_o = r_o 150 | 151 | # stop = True 152 | # stepNo = 0 153 | else: 154 | r_o = -1 155 | 156 | if a0 == 0: 157 | s1 = s0 - Sx 158 | if s1 < 0: 159 | s1 = s1 + Sx 160 | r_o = -1 161 | 162 | if a0 == 1: 163 | s1 = s0 + Sx 164 | if s1 > 24: 165 | s1 = s1 - Sx 166 | r_o = -1 167 | 168 | if a0 == 2: 169 | s1=s0-1 170 | if s1==-1 or s1==4 or s1==9 or s1==14 or s1==19: 171 | s1=s1+1 172 | r_o = -1 173 | 174 | if s1==1 or s1==6 or s1==20 or s1==15 or s1==17 or s1==22: 175 | s1 = s1+1 176 | r_o = -1 177 | 178 | if a0 == 3: 179 | s1 = s0 + 1 180 | if s1 == 5 or s1 == 10 or s1 == 15 or s1 == 20 or s1==25: 181 | s1 = s1 - 1 182 | r_o = -1 183 | 184 | if s1 == 2 or s1 == 7 or s1 == 21 or s1 == 16 or s1 == 18 or s1 == 23: 185 | s1 = s1 - 1 186 | r_o = -1 187 | 188 | if a0 == 4: 189 | s1 = s0 190 | 191 | if a0 == 5: 192 | s1 = s0 193 | 194 | if (s0 == pickUps[gID]) and (a0 == 5): 195 | shouldBreak = True 196 | 197 | if o0 == 4: 198 | s1 = s0 199 | 200 | if o0 == 5: 201 | s1 = s0 202 | 203 | if stop: 204 | # print "tuk sam" 205 | if o0 == 6: 206 | r = r_o 207 | # print r 208 | # learning step 209 | if t > 1000: 210 | R += r 211 | 212 | # print r 213 | FullR = R + r 214 | reward_course[t] = r 215 | reward_mean[t] = R/float(t+1) 216 | 217 | V[s1, pID, gID] = np.max(Q[s1, :, pID, gID]) 218 | 219 | if maxV < V[s1, pID, gID]: 220 | maxV = V[s1, pID, gID] 221 | 222 | time_course[t, 0] = V[s1, pID, gID] 223 | time_course[t, 1] = eta*(r+gamma*V[s1, pID, gID]) 224 | time_course[t, 2] = (1-eta)*Q[s0, o0, pID, gID] 225 | 226 | Q[s0, o0, pID, gID] = (1-eta)*Q[s0, o0, pID, gID] + eta*(r + gamma*V[s1, pID, gID]) 227 | else: 228 | V_o[s1, pID, gID] = np.max(opt_one_policy[s1, :, pID, gID]) 229 | opt_one_policy[s0, a0, pID, gID] = (1-eta)*opt_one_policy[s0, a0, pID, gID] +\ 230 | eta*(r + gamma*V_o[s1, pID, gID]) 231 | if maxV_o < V_o[s1, pID, gID]: 232 | maxV_o = V_o[s1, pID, gID] 233 | 234 | if pID == 4: 235 | stepNo += 1 236 | 237 | if (s0 == pickUps[gID]) and (o0 == 5): 238 | stepNo = 0 239 | break 240 | 241 | 242 | s0 = s1 243 | 244 | 245 | avg[t] = np.mean(np.mean(np.mean(np.mean(Q)))) 246 | 247 | 248 | meanR = R/float(T-1000) 249 | fullMR = FullR/float(T) 250 | print meanR 251 | print fullMR 252 | print maxV 253 | print "Policy" 254 | policy = [np.max(Q[i, :, pID, gID]) for i in xrange(S)] 255 | policy_actions = [np.argmax(Q[i, :, pID, gID]) for i in xrange(S)] 256 | policy_inoption = [np.argmax(opt_one_policy[i, :, pID, gID]) for i in xrange(S)] 257 | print len(policy) 258 | policy_actions = np.reshape(policy_actions, [5, 5]) 259 | policy_inoption = np.reshape(policy_inoption, [5, 5]) 260 | 261 | 262 | for i in xrange(5): 263 | for j in xrange(5): 264 | print "{0} ".format(policy_actions[i, j]), 265 | 266 | 267 | print " " 268 | 269 | print "In option actions" 270 | for i in xrange(5): 271 | for j in xrange(5): 272 | print "{0} ".format(policy_inoption[i, j]), 273 | 274 | print " " 275 | 
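# The loop above applies the same tabular update at both levels: the
# option-level table Q and the intra-option table opt_one_policy are each
# moved towards r + gamma * max over the successor state's entries. Below is
# a minimal sketch of that update as a hypothetical standalone helper; it is
# not called by this script, and it assumes numpy 1-D rows like the slices
# used above (numpy is already imported at the top of the file).
def q_update(q_row_s, q_row_s_next, a, r, eta=0.1, gamma=0.9):
    """One tabular Q-learning step (illustrative only).

    q_row_s: 1-D array of values for the current state, updated in place.
    q_row_s_next: 1-D array of values for the successor state.
    a: index of the action (or option) that was taken.
    r: observed reward.
    """
    v_next = q_row_s_next.max()  # greedy bootstrap value of the next state
    q_row_s[a] = (1 - eta) * q_row_s[a] + eta * (r + gamma * v_next)
    return q_row_s
# Illustrative call matching the shapes used in this script:
# q_update(Q[s0, :, pID, gID], Q[s1, :, pID, gID], o0, r, eta, gamma)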
-------------------------------------------------------------------------------- /irl/linear_irl.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements LP IRL from Ng & Russell, 2000. 3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | import random 9 | 10 | import numpy as np 11 | from cvxopt import matrix, solvers 12 | 13 | def irl(n_states, n_actions, transition_probability, policy, discount, Rmax, 14 | l1): 15 | """ 16 | Find a reward function with inverse RL as described in Ng & Russell, 2000. 17 | 18 | n_states: Number of states. int. 19 | n_actions: Number of actions. int. 20 | transition_probability: NumPy array mapping (state_i, action, state_k) to 21 | the probability of transitioning from state_i to state_k under action. 22 | Shape (N, A, N). 23 | policy: Vector mapping state ints to action ints. Shape (N,). 24 | discount: Discount factor. float. 25 | Rmax: Maximum reward. float. 26 | l1: l1 regularisation. float. 27 | -> Reward vector 28 | """ 29 | 30 | A = set(range(n_actions)) # Set of actions to help manage reordering 31 | # actions. 32 | # The transition policy convention is different here to the rest of the code 33 | # for legacy reasons; here, we reorder axes to fix this. We expect the 34 | # new probabilities to be of the shape (A, N, N). 35 | transition_probability = np.transpose(transition_probability, (1, 0, 2)) 36 | 37 | def T(a, s): 38 | """ 39 | Shorthand for a dot product used a lot in the LP formulation. 40 | """ 41 | 42 | return np.dot(transition_probability[policy[s], s] - 43 | transition_probability[a, s], 44 | np.linalg.inv(np.eye(n_states) - 45 | discount*transition_probability[policy[s]])) 46 | 47 | # This entire function just computes the block matrices used for the LP 48 | # formulation of IRL. 49 | 50 | # Minimise c . x. 51 | c = -np.hstack([np.zeros(n_states), np.ones(n_states), 52 | -l1*np.ones(n_states)]) 53 | zero_stack1 = np.zeros((n_states*(n_actions-1), n_states)) 54 | T_stack = np.vstack([ 55 | -T(a, s) 56 | for s in range(n_states) 57 | for a in A - {policy[s]} 58 | ]) 59 | I_stack1 = np.vstack([ 60 | np.eye(1, n_states, s) 61 | for s in range(n_states) 62 | for a in A - {policy[s]} 63 | ]) 64 | I_stack2 = np.eye(n_states) 65 | zero_stack2 = np.zeros((n_states, n_states)) 66 | 67 | D_left = np.vstack([T_stack, T_stack, -I_stack2, I_stack2]) 68 | D_middle = np.vstack([I_stack1, zero_stack1, zero_stack2, zero_stack2]) 69 | D_right = np.vstack([zero_stack1, zero_stack1, -I_stack2, -I_stack2]) 70 | 71 | D = np.hstack([D_left, D_middle, D_right]) 72 | b = np.zeros((n_states*(n_actions-1)*2 + 2*n_states, 1)) 73 | bounds = np.array([(None, None)]*2*n_states + [(-Rmax, Rmax)]*n_states) 74 | 75 | # We still need to bound R. 
To do this, we just add 76 | # -I R <= Rmax 1 77 | # I R <= Rmax 1 78 | # So to D we need to add -I and I, and to b we need to add Rmax 1 and Rmax 1 79 | D_bounds = np.hstack([ 80 | np.vstack([ 81 | -np.eye(n_states), 82 | np.eye(n_states)]), 83 | np.vstack([ 84 | np.zeros((n_states, n_states)), 85 | np.zeros((n_states, n_states))]), 86 | np.vstack([ 87 | np.zeros((n_states, n_states)), 88 | np.zeros((n_states, n_states))])]) 89 | b_bounds = np.vstack([Rmax*np.ones((n_states, 1))]*2) 90 | D = np.vstack((D, D_bounds)) 91 | b = np.vstack((b, b_bounds)) 92 | A_ub = matrix(D) 93 | b = matrix(b) 94 | c = matrix(c) 95 | results = solvers.lp(c, A_ub, b) 96 | r = np.asarray(results["x"][:n_states], dtype=np.double) 97 | 98 | return r.reshape((n_states,)) 99 | 100 | def v_tensor(value, transition_probability, feature_dimension, n_states, 101 | n_actions, policy): 102 | """ 103 | Finds the v tensor used in large linear IRL. 104 | 105 | value: NumPy matrix for the value function. The (i, j)th component 106 | represents the value of the jth state under the ith basis function. 107 | transition_probability: NumPy array mapping (state_i, action, state_k) to 108 | the probability of transitioning from state_i to state_k under action. 109 | Shape (N, A, N). 110 | feature_dimension: Dimension of the feature matrix. int. 111 | n_states: Number of states sampled. int. 112 | n_actions: Number of actions. int. 113 | policy: NumPy array mapping state ints to action ints. 114 | -> v helper tensor. 115 | """ 116 | 117 | v = np.zeros((n_states, n_actions-1, feature_dimension)) 118 | for i in range(n_states): 119 | a1 = policy[i] 120 | exp_on_policy = np.dot(transition_probability[i, a1], value.T) 121 | seen_policy_action = False 122 | for j in range(n_actions): 123 | # Skip this if it's the on-policy action. 124 | if a1 == j: 125 | seen_policy_action = True 126 | continue 127 | 128 | exp_off_policy = np.dot(transition_probability[i, j], value.T) 129 | if seen_policy_action: 130 | v[i, j-1] = exp_on_policy - exp_off_policy 131 | else: 132 | v[i, j] = exp_on_policy - exp_off_policy 133 | return v 134 | 135 | def large_irl(value, transition_probability, feature_matrix, n_states, 136 | n_actions, policy): 137 | """ 138 | Find the reward in a large state space. 139 | 140 | value: NumPy matrix for the value function. The (i, j)th component 141 | represents the value of the jth state under the ith basis function. 142 | transition_probability: NumPy array mapping (state_i, action, state_k) to 143 | the probability of transitioning from state_i to state_k under action. 144 | Shape (N, A, N). 145 | feature_matrix: Matrix with the nth row representing the nth state. NumPy 146 | array with shape (N, D) where N is the number of states and D is the 147 | dimensionality of the state. 148 | n_states: Number of states sampled. int. 149 | n_actions: Number of actions. int. 150 | policy: NumPy array mapping state ints to action ints. 151 | -> Reward for each state in states. 152 | """ 153 | 154 | D = feature_matrix.shape[1] 155 | 156 | # First, calculate v, which is just a helper tensor. 157 | v = v_tensor(value, transition_probability, D, n_states, n_actions, policy) 158 | 159 | # Now we can calculate c, G, h, A, and b. 160 | 161 | # x = [z y_i^+ y_i^- a], which is a [N (K-1)*N (K-1)*N D] vector. 162 | x_size = n_states + (n_actions-1)*n_states*2 + D 163 | 164 | # c is a big stack of ones and zeros; there's N ones and the rest is zero. 
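# The decision vector is laid out as x = [z, y^+, y^-, alpha]: N entries of z,
# (A-1)*N entries each for y^+ and y^-, and D entries for alpha. cvxopt
# minimises c . x, so c is negated to maximise the sum of the z_i.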
165 | c = -np.hstack([np.ones(n_states), np.zeros(x_size - n_states)]) 166 | assert c.shape[0] == x_size 167 | 168 | # A is [0 I_j -I_j -v^T_{ij}] and j NOT EQUAL TO policy(i). 169 | # I believe this is accounted for by the structure of v. 170 | A = np.hstack([ 171 | np.zeros((n_states*(n_actions-1), n_states)), 172 | np.eye(n_states*(n_actions-1)), 173 | -np.eye(n_states*(n_actions-1)), 174 | np.vstack([v[i, j].T for i in range(n_states) 175 | for j in range(n_actions-1)])]) 176 | assert A.shape[1] == x_size 177 | 178 | # b is just zeros! 179 | b = np.zeros(A.shape[0]) 180 | 181 | # Break G up into the bottom row and other rows to construct it. 182 | bottom_row = np.vstack([ 183 | np.hstack([ 184 | np.ones((n_actions-1, 1)).dot(np.eye(1, n_states, l)), 185 | np.hstack([-np.eye(n_actions-1) if i == l 186 | else np.zeros((n_actions-1, n_actions-1)) 187 | for i in range(n_states)]), 188 | np.hstack([2*np.eye(n_actions-1) if i == l 189 | else np.zeros((n_actions-1, n_actions-1)) 190 | for i in range(n_states)]), 191 | np.zeros((n_actions-1, D))]) 192 | for l in range(n_states)]) 193 | assert bottom_row.shape[1] == x_size 194 | G = np.vstack([ 195 | np.hstack([ 196 | np.zeros((D, n_states)), 197 | np.zeros((D, n_states*(n_actions-1))), 198 | np.zeros((D, n_states*(n_actions-1))), 199 | np.eye(D)]), 200 | np.hstack([ 201 | np.zeros((D, n_states)), 202 | np.zeros((D, n_states*(n_actions-1))), 203 | np.zeros((D, n_states*(n_actions-1))), 204 | -np.eye(D)]), 205 | np.hstack([ 206 | np.zeros((n_states*(n_actions-1), n_states)), 207 | -np.eye(n_states*(n_actions-1)), 208 | np.zeros((n_states*(n_actions-1), n_states*(n_actions-1))), 209 | np.zeros((n_states*(n_actions-1), D))]), 210 | np.hstack([ 211 | np.zeros((n_states*(n_actions-1), n_states)), 212 | np.zeros((n_states*(n_actions-1), n_states*(n_actions-1))), 213 | -np.eye(n_states*(n_actions-1)), 214 | np.zeros((n_states*(n_actions-1), D))]), 215 | bottom_row]) 216 | assert G.shape[1] == x_size 217 | 218 | h = np.vstack([np.ones((D*2, 1)), 219 | np.zeros((n_states*(n_actions-1)*2+bottom_row.shape[0], 1))]) 220 | 221 | from cvxopt import matrix, solvers 222 | c = matrix(c) 223 | G = matrix(G) 224 | h = matrix(h) 225 | A = matrix(A) 226 | b = matrix(b) 227 | results = solvers.lp(c, G, h, A, b) 228 | alpha = np.asarray(results["x"][-D:], dtype=np.double) 229 | return np.dot(feature_matrix, -alpha) 230 | -------------------------------------------------------------------------------- /irl/maxent.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements maximum entropy inverse reinforcement learning (Ziebart et al., 2008) 3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | from itertools import product 9 | 10 | import numpy as np 11 | import numpy.random as rn 12 | 13 | from . import value_iteration 14 | 15 | def irl(feature_matrix, n_actions, discount, transition_probability, 16 | trajectories, epochs, learning_rate): 17 | """ 18 | Find the reward function for the given trajectories. 19 | 20 | feature_matrix: Matrix with the nth row representing the nth state. NumPy 21 | array with shape (N, D) where N is the number of states and D is the 22 | dimensionality of the state. 23 | n_actions: Number of actions A. int. 24 | discount: Discount factor of the MDP. float. 25 | transition_probability: NumPy array mapping (state_i, action, state_k) to 26 | the probability of transitioning from state_i to state_k under action. 27 | Shape (N, A, N). 
28 | trajectories: 3D array of state/action pairs. States are ints, actions 29 | are ints. NumPy array with shape (T, L, 2) where T is the number of 30 | trajectories and L is the trajectory length. 31 | epochs: Number of gradient descent steps. int. 32 | learning_rate: Gradient descent learning rate. float. 33 | -> Reward vector with shape (N,). 34 | """ 35 | 36 | n_states, d_states = feature_matrix.shape 37 | 38 | # Initialise weights. 39 | alpha = rn.uniform(size=(d_states,)) 40 | 41 | # Calculate the feature expectations \tilde{phi}. 42 | feature_expectations = find_feature_expectations(feature_matrix, 43 | trajectories) 44 | 45 | # Gradient descent on alpha. 46 | for i in range(epochs): 47 | # print("i: {}".format(i)) 48 | r = feature_matrix.dot(alpha) 49 | expected_svf = find_expected_svf(n_states, r, n_actions, discount, 50 | transition_probability, trajectories) 51 | grad = feature_expectations - feature_matrix.T.dot(expected_svf) 52 | 53 | alpha += learning_rate * grad 54 | 55 | return feature_matrix.dot(alpha).reshape((n_states,)) 56 | 57 | def find_svf(n_states, trajectories): 58 | """ 59 | Find the state visitation frequency from trajectories. 60 | 61 | n_states: Number of states. int. 62 | trajectories: 3D array of state/action pairs. States are ints, actions 63 | are ints. NumPy array with shape (T, L, 2) where T is the number of 64 | trajectories and L is the trajectory length. 65 | -> State visitation frequencies vector with shape (N,). 66 | """ 67 | 68 | svf = np.zeros(n_states) 69 | 70 | for trajectory in trajectories: 71 | for state, _, _ in trajectory: 72 | svf[state] += 1 73 | 74 | svf /= trajectories.shape[0] 75 | 76 | return svf 77 | 78 | def find_feature_expectations(feature_matrix, trajectories): 79 | """ 80 | Find the feature expectations for the given trajectories. This is the 81 | average path feature vector. 82 | 83 | feature_matrix: Matrix with the nth row representing the nth state. NumPy 84 | array with shape (N, D) where N is the number of states and D is the 85 | dimensionality of the state. 86 | trajectories: 3D array of state/action pairs. States are ints, actions 87 | are ints. NumPy array with shape (T, L, 2) where T is the number of 88 | trajectories and L is the trajectory length. 89 | -> Feature expectations vector with shape (D,). 90 | """ 91 | 92 | feature_expectations = np.zeros(feature_matrix.shape[1]) 93 | 94 | for trajectory in trajectories: 95 | for state, _, _ in trajectory: 96 | feature_expectations += feature_matrix[state] 97 | 98 | feature_expectations /= trajectories.shape[0] 99 | 100 | return feature_expectations 101 | 102 | def find_expected_svf(n_states, r, n_actions, discount, 103 | transition_probability, trajectories): 104 | """ 105 | Find the expected state visitation frequencies using algorithm 1 from 106 | Ziebart et al. 2008. 107 | 108 | n_states: Number of states N. int. 109 | alpha: Reward. NumPy array with shape (N,). 110 | n_actions: Number of actions A. int. 111 | discount: Discount factor of the MDP. float. 112 | transition_probability: NumPy array mapping (state_i, action, state_k) to 113 | the probability of transitioning from state_i to state_k under action. 114 | Shape (N, A, N). 115 | trajectories: 3D array of state/action pairs. States are ints, actions 116 | are ints. NumPy array with shape (T, L, 2) where T is the number of 117 | trajectories and L is the trajectory length. 118 | -> Expected state visitation frequencies vector with shape (N,). 
119 | """ 120 | 121 | n_trajectories = trajectories.shape[0] 122 | trajectory_length = trajectories.shape[1] 123 | 124 | # policy = find_policy(n_states, r, n_actions, discount, 125 | # transition_probability) 126 | policy = value_iteration.find_policy(n_states, n_actions, 127 | transition_probability, r, discount) 128 | 129 | start_state_count = np.zeros(n_states) 130 | for trajectory in trajectories: 131 | start_state_count[trajectory[0, 0]] += 1 132 | p_start_state = start_state_count/n_trajectories 133 | 134 | expected_svf = np.tile(p_start_state, (trajectory_length, 1)).T 135 | for t in range(1, trajectory_length): 136 | expected_svf[:, t] = 0 137 | for i, j, k in product(range(n_states), range(n_actions), range(n_states)): 138 | expected_svf[k, t] += (expected_svf[i, t-1] * 139 | policy[i, j] * # Stochastic policy 140 | transition_probability[i, j, k]) 141 | 142 | return expected_svf.sum(axis=1) 143 | 144 | def softmax(x1, x2): 145 | """ 146 | Soft-maximum calculation, from algorithm 9.2 in Ziebart's PhD thesis. 147 | 148 | x1: float. 149 | x2: float. 150 | -> softmax(x1, x2) 151 | """ 152 | 153 | max_x = max(x1, x2) 154 | min_x = min(x1, x2) 155 | return max_x + np.log(1 + np.exp(min_x - max_x)) 156 | 157 | def find_policy(n_states, r, n_actions, discount, 158 | transition_probability): 159 | """ 160 | Find a policy with linear value iteration. Based on the code accompanying 161 | the Levine et al. GPIRL paper and on Ziebart's PhD thesis (algorithm 9.1). 162 | 163 | n_states: Number of states N. int. 164 | r: Reward. NumPy array with shape (N,). 165 | n_actions: Number of actions A. int. 166 | discount: Discount factor of the MDP. float. 167 | transition_probability: NumPy array mapping (state_i, action, state_k) to 168 | the probability of transitioning from state_i to state_k under action. 169 | Shape (N, A, N). 170 | -> NumPy array of states and the probability of taking each action in that 171 | state, with shape (N, A). 172 | """ 173 | 174 | # V = value_iteration.value(n_states, transition_probability, r, discount) 175 | 176 | # NumPy's dot really dislikes using inf, so I'm making everything finite 177 | # using nan_to_num. 178 | V = np.nan_to_num(np.ones((n_states, 1)) * float("-inf")) 179 | 180 | diff = np.ones((n_states,)) 181 | while (diff > 1e-4).all(): # Iterate until convergence. 182 | new_V = r.copy() 183 | for j in range(n_actions): 184 | for i in range(n_states): 185 | new_V[i] = softmax(new_V[i], r[i] + discount* 186 | np.sum(transition_probability[i, j, k] * V[k] 187 | for k in range(n_states))) 188 | 189 | # # This seems to diverge, so we z-score it (engineering hack). 190 | new_V = (new_V - new_V.mean())/new_V.std() 191 | 192 | diff = abs(V - new_V) 193 | V = new_V 194 | 195 | # We really want Q, not V, so grab that using equation 9.2 from the thesis. 196 | Q = np.zeros((n_states, n_actions)) 197 | for i in range(n_states): 198 | for j in range(n_actions): 199 | p = np.array([transition_probability[i, j, k] 200 | for k in range(n_states)]) 201 | Q[i, j] = p.dot(r + discount*V) 202 | 203 | # Softmax by row to interpret these values as probabilities. 204 | Q -= Q.max(axis=1).reshape((n_states, 1)) # For numerical stability. 
205 | Q = np.exp(Q)/np.exp(Q).sum(axis=1).reshape((n_states, 1)) 206 | return Q 207 | 208 | def expected_value_difference(n_states, n_actions, transition_probability, 209 | reward, discount, p_start_state, optimal_value, true_reward): 210 | """ 211 | Calculate the expected value difference, which is a proxy to how good a 212 | recovered reward function is. 213 | 214 | n_states: Number of states. int. 215 | n_actions: Number of actions. int. 216 | transition_probability: NumPy array mapping (state_i, action, state_k) to 217 | the probability of transitioning from state_i to state_k under action. 218 | Shape (N, A, N). 219 | reward: Reward vector mapping state int to reward. Shape (N,). 220 | discount: Discount factor. float. 221 | p_start_state: Probability vector with the ith component as the probability 222 | that the ith state is the start state. Shape (N,). 223 | optimal_value: Value vector for the ground reward with optimal policy. 224 | The ith component is the value of the ith state. Shape (N,). 225 | true_reward: True reward vector. Shape (N,). 226 | -> Expected value difference. float. 227 | """ 228 | 229 | policy = value_iteration.find_policy(n_states, n_actions, 230 | transition_probability, reward, discount) 231 | value = value_iteration.value(policy.argmax(axis=1), n_states, 232 | transition_probability, true_reward, discount) 233 | 234 | evd = optimal_value.dot(p_start_state) - value.dot(p_start_state) 235 | return evd 236 | -------------------------------------------------------------------------------- /irl/mdp/gridworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements the gridworld MDP. 3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | import numpy as np 9 | import numpy.random as rn 10 | 11 | class Gridworld(object): 12 | """ 13 | Gridworld MDP. 14 | """ 15 | 16 | def __init__(self, grid_size, wind, discount): 17 | """ 18 | grid_size: Grid size. int. 19 | wind: Chance of moving randomly. float. 20 | discount: MDP discount. float. 21 | -> Gridworld 22 | """ 23 | 24 | self.actions = ((1, 0), (0, 1), (-1, 0), (0, -1)) 25 | self.n_actions = len(self.actions) 26 | self.n_states = grid_size**2 27 | self.grid_size = grid_size 28 | self.wind = wind 29 | self.discount = discount 30 | 31 | # Preconstruct the transition probability array. 32 | self.transition_probability = np.array( 33 | [[[self._transition_probability(i, j, k) 34 | for k in range(self.n_states)] 35 | for j in range(self.n_actions)] 36 | for i in range(self.n_states)]) 37 | 38 | def __str__(self): 39 | return "Gridworld({}, {}, {})".format(self.grid_size, self.wind, 40 | self.discount) 41 | 42 | def feature_vector(self, i, feature_map="ident"): 43 | """ 44 | Get the feature vector associated with a state integer. 45 | 46 | i: State int. 47 | feature_map: Which feature map to use (default ident). String in {ident, 48 | coord, proxi}. 49 | -> Feature vector. 50 | """ 51 | 52 | if feature_map == "coord": 53 | f = np.zeros(self.grid_size) 54 | x, y = i % self.grid_size, i // self.grid_size 55 | f[x] += 1 56 | f[y] += 1 57 | return f 58 | if feature_map == "proxi": 59 | f = np.zeros(self.n_states) 60 | x, y = i % self.grid_size, i // self.grid_size 61 | for b in range(self.grid_size): 62 | for a in range(self.grid_size): 63 | dist = abs(x - a) + abs(y - b) 64 | f[self.point_to_int((a, b))] = dist 65 | return f 66 | # Assume identity map. 
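# The identity map is a one-hot indicator of the state, so the full feature
# matrix is the n_states x n_states identity.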
67 | f = np.zeros(self.n_states) 68 | f[i] = 1 69 | return f 70 | 71 | def feature_matrix(self, feature_map="ident"): 72 | """ 73 | Get the feature matrix for this gridworld. 74 | 75 | feature_map: Which feature map to use (default ident). String in {ident, 76 | coord, proxi}. 77 | -> NumPy array with shape (n_states, d_states). 78 | """ 79 | 80 | features = [] 81 | for n in range(self.n_states): 82 | f = self.feature_vector(n, feature_map) 83 | features.append(f) 84 | return np.array(features) 85 | 86 | def int_to_point(self, i): 87 | """ 88 | Convert a state int into the corresponding coordinate. 89 | 90 | i: State int. 91 | -> (x, y) int tuple. 92 | """ 93 | 94 | return (i % self.grid_size, i // self.grid_size) 95 | 96 | def point_to_int(self, p): 97 | """ 98 | Convert a coordinate into the corresponding state int. 99 | 100 | p: (x, y) tuple. 101 | -> State int. 102 | """ 103 | 104 | return p[0] + p[1]*self.grid_size 105 | 106 | def neighbouring(self, i, k): 107 | """ 108 | Get whether two points neighbour each other. Also returns true if they 109 | are the same point. 110 | 111 | i: (x, y) int tuple. 112 | k: (x, y) int tuple. 113 | -> bool. 114 | """ 115 | 116 | return abs(i[0] - k[0]) + abs(i[1] - k[1]) <= 1 117 | 118 | def _transition_probability(self, i, j, k): 119 | """ 120 | Get the probability of transitioning from state i to state k given 121 | action j. 122 | 123 | i: State int. 124 | j: Action int. 125 | k: State int. 126 | -> p(s_k | s_i, a_j) 127 | """ 128 | 129 | xi, yi = self.int_to_point(i) 130 | xj, yj = self.actions[j] 131 | xk, yk = self.int_to_point(k) 132 | 133 | if not self.neighbouring((xi, yi), (xk, yk)): 134 | return 0.0 135 | 136 | # Is k the intended state to move to? 137 | if (xi + xj, yi + yj) == (xk, yk): 138 | return 1 - self.wind + self.wind/self.n_actions 139 | 140 | # If these are not the same point, then we can move there by wind. 141 | if (xi, yi) != (xk, yk): 142 | return self.wind/self.n_actions 143 | 144 | # If these are the same point, we can only move here by either moving 145 | # off the grid or being blown off the grid. Are we on a corner or not? 146 | if (xi, yi) in {(0, 0), (self.grid_size-1, self.grid_size-1), 147 | (0, self.grid_size-1), (self.grid_size-1, 0)}: 148 | # Corner. 149 | # Can move off the edge in two directions. 150 | # Did we intend to move off the grid? 151 | if not (0 <= xi + xj < self.grid_size and 152 | 0 <= yi + yj < self.grid_size): 153 | # We intended to move off the grid, so we have the regular 154 | # success chance of staying here plus an extra chance of blowing 155 | # onto the *other* off-grid square. 156 | return 1 - self.wind + 2*self.wind/self.n_actions 157 | else: 158 | # We can blow off the grid in either direction only by wind. 159 | return 2*self.wind/self.n_actions 160 | else: 161 | # Not a corner. Is it an edge? 162 | if (xi not in {0, self.grid_size-1} and 163 | yi not in {0, self.grid_size-1}): 164 | # Not an edge. 165 | return 0.0 166 | 167 | # Edge. 168 | # Can only move off the edge in one direction. 169 | # Did we intend to move off the grid? 170 | if not (0 <= xi + xj < self.grid_size and 171 | 0 <= yi + yj < self.grid_size): 172 | # We intended to move off the grid, so we have the regular 173 | # success chance of staying here. 174 | return 1 - self.wind + self.wind/self.n_actions 175 | else: 176 | # We can blow off the grid only by wind. 177 | return self.wind/self.n_actions 178 | 179 | def reward(self, state_int): 180 | """ 181 | Reward for being in state state_int. 
182 | 183 | state_int: State integer. int. 184 | -> Reward. 185 | """ 186 | 187 | if state_int == self.n_states - 1: 188 | return 1 189 | return 0 190 | 191 | def average_reward(self, n_trajectories, trajectory_length, policy): 192 | """ 193 | Calculate the average total reward obtained by following a given policy 194 | over n_paths paths. 195 | 196 | policy: Map from state integers to action integers. 197 | n_trajectories: Number of trajectories. int. 198 | trajectory_length: Length of an episode. int. 199 | -> Average reward, standard deviation. 200 | """ 201 | 202 | trajectories = self.generate_trajectories(n_trajectories, 203 | trajectory_length, policy) 204 | rewards = [[r for _, _, r in trajectory] for trajectory in trajectories] 205 | rewards = np.array(rewards) 206 | 207 | # Add up all the rewards to find the total reward. 208 | total_reward = rewards.sum(axis=1) 209 | 210 | # Return the average reward and standard deviation. 211 | return total_reward.mean(), total_reward.std() 212 | 213 | def optimal_policy(self, state_int): 214 | """ 215 | The optimal policy for this gridworld. 216 | 217 | state_int: What state we are in. int. 218 | -> Action int. 219 | """ 220 | 221 | sx, sy = self.int_to_point(state_int) 222 | 223 | if sx < self.grid_size and sy < self.grid_size: 224 | return rn.randint(0, 2) 225 | if sx < self.grid_size-1: 226 | return 0 227 | if sy < self.grid_size-1: 228 | return 1 229 | raise ValueError("Unexpected state.") 230 | 231 | def optimal_policy_deterministic(self, state_int): 232 | """ 233 | Deterministic version of the optimal policy for this gridworld. 234 | 235 | state_int: What state we are in. int. 236 | -> Action int. 237 | """ 238 | 239 | sx, sy = self.int_to_point(state_int) 240 | if sx < sy: 241 | return 0 242 | return 1 243 | 244 | def generate_trajectories(self, n_trajectories, trajectory_length, policy, 245 | random_start=False): 246 | """ 247 | Generate n_trajectories trajectories with length trajectory_length, 248 | following the given policy. 249 | 250 | n_trajectories: Number of trajectories. int. 251 | trajectory_length: Length of an episode. int. 252 | policy: Map from state integers to action integers. 253 | random_start: Whether to start randomly (default False). bool. 254 | -> [[(state int, action int, reward float)]] 255 | """ 256 | keep = [] 257 | trajectories = [] 258 | for _ in range(n_trajectories): 259 | if random_start: 260 | sx, sy = rn.randint(self.grid_size), rn.randint(self.grid_size) 261 | else: 262 | sx, sy = 0, 0 263 | 264 | trajectory = [] 265 | for _ in range(trajectory_length): 266 | if rn.random() < self.wind: 267 | action = self.actions[rn.randint(0, 4)] 268 | else: 269 | # Follow the given policy. 
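# keep only logs the on-policy action indices; it is never returned and has
# no effect on the generated trajectories.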
270 | keep.append(policy(self.point_to_int((sx, sy)))) 271 | action = self.actions[policy(self.point_to_int((sx, sy)))] 272 | 273 | if (0 <= sx + action[0] < self.grid_size and 274 | 0 <= sy + action[1] < self.grid_size): 275 | next_sx = sx + action[0] 276 | next_sy = sy + action[1] 277 | else: 278 | next_sx = sx 279 | next_sy = sy 280 | 281 | state_int = self.point_to_int((sx, sy)) 282 | action_int = self.actions.index(action) 283 | next_state_int = self.point_to_int((next_sx, next_sy)) 284 | reward = self.reward(next_state_int) 285 | trajectory.append((state_int, action_int, reward)) 286 | 287 | sx = next_sx 288 | sy = next_sy 289 | 290 | trajectories.append(trajectory) 291 | 292 | return np.array(trajectories) 293 | -------------------------------------------------------------------------------- /hierarchicalrl/sdp_value_iteration.py: -------------------------------------------------------------------------------- 1 | """ 2 | Find the value function associated with a policy. Based on Sutton & Barto, 1998. 3 | 4 | Todor Davchev, 2017 5 | t.b.davchev@ed.ac.uk 6 | """ 7 | 8 | import numpy as np 9 | 10 | 11 | def value(policy, n_states, transition_probabilities, reward, discount, 12 | threshold=1e-2): 13 | """ 14 | Find the value function associated with a policy. 15 | 16 | policy: List of action ints for each state. 17 | n_states: Number of states. int. 18 | transition_probabilities: Function taking (state, action, state) to 19 | transition probabilities. 20 | reward: Vector of rewards for each state. 21 | discount: MDP discount factor. float. 22 | threshold: Convergence threshold, default 1e-2. float. 23 | -> Array of values for each state 24 | """ 25 | v = np.zeros(n_states) 26 | 27 | diff = float("inf") 28 | while diff > threshold: 29 | diff = 0 30 | for s in range(n_states): 31 | vs = v[s] 32 | a = policy[s] 33 | v[s] = sum(transition_probabilities[s, a, k] * 34 | (reward[k] + discount * v[k]) 35 | for k in range(n_states)) 36 | diff = max(diff, abs(vs - v[s])) 37 | 38 | return v 39 | 40 | 41 | def optimal_value(option_states, n_actions, transition_probabilities, reward, 42 | discount, threshold=1e-2): 43 | """ 44 | Find the optimal value function. 45 | 46 | n_states: Number of states. int. 47 | n_actions: Number of actions. int. 48 | transition_probabilities: Function taking (state, action, state) to 49 | transition probabilities. 50 | reward: Vector of rewards for each state. 51 | discount: MDP discount factor. float. 52 | threshold: Convergence threshold, default 1e-2. float. 
53 | -> Array of values for each state 54 | """ 55 | 56 | value = np.zeros(len(option_states)) 57 | 58 | diff = float("inf") 59 | while diff > threshold: 60 | diff = 0 61 | for idx, state in enumerate(option_states): 62 | max_v = float("-inf") 63 | for action in range(n_actions): 64 | transition_p = transition_probabilities[state, action, :] 65 | transition_p = [ 66 | [ 67 | x for br, x in enumerate(transition_p) if br == opt_state] 68 | for opt_state in option_states] 69 | transition_p = np.asarray(transition_p) 70 | transition_p = np.reshape(transition_p, transition_p.shape[0]) 71 | max_v = max(max_v, sum( 72 | reward + np.dot(transition_p, (discount * value)))) 73 | # max_v = max(max_v, np.dot(tp, reward + discount*v)) 74 | 75 | new_diff = abs(value[idx] - max_v) 76 | if new_diff > diff: 77 | diff = new_diff 78 | value[idx] = max_v 79 | 80 | 81 | # diff = 0 82 | # for s in range(n_states): 83 | # max_v = float("-inf") 84 | # for a in range(n_actions): 85 | # tp = transition_probabilities[s, a, :] 86 | # max_v = max(max_v, np.dot(tp, reward + discount*v)) 87 | 88 | # new_diff = abs(v[s] - max_v) 89 | # if new_diff > diff: 90 | # diff = new_diff 91 | # v[s] = max_v 92 | 93 | 94 | return value 95 | 96 | 97 | def optimal_value_option(options_states, n_options, options_transition_probabilities, 98 | reward_o, discount, threshold=1e-2): 99 | value_o = np.zeros(121) 100 | diff_o = float("inf") 101 | while diff_o > threshold: 102 | diff_o = 0 103 | for state in range(121): 104 | max_vo = float("-inf") 105 | for option in range(8): 106 | transition_po = options_transition_probabilities[state, option, :] 107 | transition_po = np.asarray(transition_po) 108 | transition_po = np.reshape(transition_po, transition_po.shape[0]) 109 | # [filter(lambda x: x in c1, sublist) 110 | # for sublist in c2] 111 | max_vo = max(max_vo, 112 | reward_o[option] + np.dot(transition_po, (discount * value_o))) 113 | # max_v = max(max_v, np.dot(tp, reward + discount*v)) 114 | 115 | new_diff_o = abs(value_o[state] - max_vo) 116 | if new_diff_o > diff_o: 117 | diff_o = new_diff_o 118 | value_o[state] = max_vo 119 | 120 | return value_o 121 | 122 | def optimal_option_value(option_states, n_actions, transition_probabilities, reward, 123 | discount, threshold=1e-2): 124 | """ 125 | Find the optimal value function. 126 | 127 | n_states: Number of states. int. 128 | n_actions: Number of actions. int. 129 | transition_probabilities: Function taking (state, action, state) to 130 | transition probabilities. 131 | reward: Vector of rewards for each state. 132 | discount: MDP discount factor. float. 133 | threshold: Convergence threshold, default 1e-2. float. 
134 | -> Array of values for each state 135 | """ 136 | 137 | value = np.zeros(len(option_states)) 138 | 139 | diff = float("inf") 140 | while diff > threshold: 141 | diff = 0 142 | for idx, state in enumerate(option_states): 143 | max_v = float("-inf") 144 | for action in range(n_actions): 145 | transition_p = transition_probabilities[state, action, :] 146 | transition_p = [[x for br, x in enumerate(transition_p) if br == state] for state in option_states] 147 | transition_p = np.asarray(transition_p) 148 | transition_p = np.reshape(transition_p, transition_p.shape[0]) 149 | # [filter(lambda x: x in c1, sublist) 150 | # for sublist in c2] 151 | max_v = max(max_v, sum( 152 | reward + np.dot(transition_p[0], (discount * value)))) 153 | # max_v = max(max_v, np.dot(tp, reward + discount*v)) 154 | 155 | new_diff = abs(value[idx] - max_v) 156 | if new_diff > diff: 157 | diff = new_diff 158 | value[idx] = max_v 159 | 160 | return value 161 | 162 | # def optimal_value(n_states, n_actions, transition_probabilities, reward, 163 | # discount, threshold=1e-2): 164 | # """ 165 | # Find the optimal value function. 166 | 167 | # n_states: Number of states. int. 168 | # n_actions: Number of actions. int. 169 | # transition_probabilities: Function taking (state, action, state) to 170 | # transition probabilities. 171 | # reward: Vector of rewards for each state. 172 | # discount: MDP discount factor. float. 173 | # threshold: Convergence threshold, default 1e-2. float. 174 | # -> Array of values for each state 175 | # """ 176 | 177 | # v = np.zeros(n_states) 178 | 179 | # diff = float("inf") 180 | # while diff > threshold: 181 | # diff = 0 182 | # for s in range(n_states): 183 | # max_v = float("-inf") 184 | # for a in range(n_actions): 185 | # tp = transition_probabilities[s, a, :] 186 | # # max_v = max(max_v, sum(reward + np.dot(tp, discount*v))) 187 | # max_v = max(max_v, np.dot(tp, reward + discount*v)) 188 | 189 | # new_diff = abs(v[s] - max_v) 190 | # if new_diff > diff: 191 | # diff = new_diff 192 | # v[s] = max_v 193 | 194 | # return v 195 | 196 | 197 | def find_option_policy(options_states, n_states, n_actions, n_options, options_transition_probabilities, 198 | transition_probabilities, reward_o, reward, discount, 199 | threshold=1e-2, value=None, stochastic=True): 200 | q_values = []#np.zeros((len(options_states), n_states, n_actions)) 201 | if value is None: 202 | option_value = optimal_value_option(options_states, n_options, 203 | options_transition_probabilities, 204 | reward_o, discount, threshold) 205 | 206 | if stochastic: 207 | options_Q = np.zeros((121, n_options)) 208 | for i in range(121): 209 | for j in range(n_options): 210 | p = options_transition_probabilities[i, j, :] 211 | options_Q[i, j] = reward_o[j] + p.dot(discount*option_value) 212 | options_Q -= options_Q.max(axis=1).reshape((121, 1)) # For numerical stability. 213 | options_Q = np.exp(options_Q)/np.exp(options_Q).sum(axis=1).reshape((121, 1)) 214 | return options_Q 215 | 216 | def find_policy(options_states, n_states, n_actions, n_options, options_transition_probabilities, 217 | transition_probabilities, reward_o, reward, discount, 218 | threshold=1e-2, value=None, stochastic=True): 219 | """ 220 | Find the optimal policy. 221 | 222 | n_states: Number of states. int. 223 | n_actions: Number of actions. int. 224 | transition_probabilities: Function taking (state, action, state) to 225 | transition probabilities. 226 | reward: Vector of rewards for each state. 227 | discount: MDP discount factor. float. 
228 | threshold: Convergence threshold, default 1e-2. float. 229 | v: Value function (if known). Default None. 230 | stochastic: Whether the policy should be stochastic. Default True. 231 | -> Action probabilities for each state or action int for each state 232 | (depending on stochasticity). 233 | """ 234 | 235 | q_values = []#np.zeros((len(options_states), n_states, n_actions)) 236 | if value is None: 237 | value = [] 238 | option_value = [] 239 | for option, option_states in enumerate(options_states): 240 | value.append( 241 | optimal_value( 242 | option_states, n_actions, transition_probabilities[option], 243 | reward[option], discount, threshold)) 244 | 245 | if stochastic: 246 | for option, option_states in enumerate(options_states): 247 | q_values.append(np.zeros((len(option_states), n_actions))) 248 | # Get Q using equation 9.2 from Ziebart's thesis. 249 | for idx, i_state in enumerate(option_states): 250 | for j_action in range(n_actions): 251 | transition_p = transition_probabilities[option, i_state, j_action, :] 252 | transition_p = [[ 253 | x for br, x in enumerate(transition_p) if br == opt_state] 254 | for opt_state in option_states] 255 | transition_p = np.asarray(transition_p) 256 | transition_p = np.reshape(transition_p, transition_p.shape[0]) 257 | q_values[option][idx, j_action] = sum(reward[option] + transition_p.dot( 258 | discount * value[option])) 259 | q_values[option] -= q_values[option].max(axis=1).reshape((n_states[option], 1)) 260 | # For numerical stability. 261 | q_values[option] = np.exp( 262 | q_values[option]) / np.exp(q_values[option]).sum(axis=1).reshape( 263 | (n_states[option], 1)) 264 | 265 | # Q = np.zeros((n_states, n_actions)) 266 | # for i in range(n_states): 267 | # for j in range(n_actions): 268 | # p = transition_probabilities[i, j, :] 269 | # Q[i, j] = p.dot(reward + discount*v) 270 | # Q -= Q.max(axis=1).reshape((n_states, 1)) # For numerical stability. 271 | # Q = np.exp(Q)/np.exp(Q).sum(axis=1).reshape((n_states, 1)) 272 | # return Q 273 | 274 | return q_values 275 | 276 | # def _policy(s): 277 | # return max(range(n_actions), 278 | # key=lambda a: sum(transition_probabilities[s, a, k] * 279 | # (reward[k] + discount * v[k]) 280 | # for k in range(n_states))) 281 | # policy = np.array([_policy(s) for s in range(n_states)]) 282 | # return policy 283 | 284 | 285 | if __name__ == '__main__': 286 | # Quick unit test using gridworld. 287 | import mdp.gridworld as gridworld 288 | gw = gridworld.Gridworld(3, 0.3, 0.9) 289 | v = value([gw.optimal_policy_deterministic(s) for s in range(gw.n_states)], 290 | gw.n_states, 291 | gw.transition_probability, 292 | [gw.reward(s) for s in range(gw.n_states)], 293 | gw.discount) 294 | assert np.isclose(v, 295 | [5.7194282, 6.46706692, 6.42589811, 296 | 6.46706692, 7.47058224, 7.96505174, 297 | 6.42589811, 7.96505174, 8.19268666], 1).all() 298 | opt_v = optimal_value(gw.n_states, 299 | gw.n_actions, 300 | gw.transition_probability, 301 | [gw.reward(s) for s in range(gw.n_states)], 302 | gw.discount) 303 | assert np.isclose(v, opt_v).all() 304 | -------------------------------------------------------------------------------- /irl/deep_maxent.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements deep maximum entropy inverse reinforcement learning based on 3 | Ziebart et al., 2008 and Wulfmeier et al., 2015, using symbolic methods with 4 | Theano. 
5 | 6 | Matthew Alger, 2015 7 | matthew.alger@anu.edu.au 8 | """ 9 | 10 | from itertools import product 11 | 12 | import numpy as np 13 | import numpy.random as rn 14 | import theano as th 15 | import theano.tensor as T 16 | 17 | from . import maxent 18 | 19 | FLOAT = th.config.floatX 20 | 21 | def find_svf(n_states, trajectories): 22 | """ 23 | Find the state vistiation frequency from trajectories. 24 | 25 | n_states: Number of states. int. 26 | trajectories: 3D array of state/action pairs. States are ints, actions 27 | are ints. NumPy array with shape (T, L, 2) where T is the number of 28 | trajectories and L is the trajectory length. 29 | -> State visitation frequencies vector with shape (N,). 30 | """ 31 | 32 | svf = np.zeros(n_states) 33 | 34 | for trajectory in trajectories: 35 | for state, _, _ in trajectory: 36 | svf[state] += 1 37 | 38 | svf /= trajectories.shape[0] 39 | 40 | return th.shared(svf, "svf", borrow=True) 41 | 42 | def optimal_value(n_states, n_actions, transition_probabilities, reward, 43 | discount, threshold=1e-2): 44 | """ 45 | Find the optimal value function. 46 | 47 | n_states: Number of states. int. 48 | n_actions: Number of actions. int. 49 | transition_probabilities: Function taking (state, action, state) to 50 | transition probabilities. 51 | reward: Vector of rewards for each state. 52 | discount: MDP discount factor. float. 53 | threshold: Convergence threshold, default 1e-2. float. 54 | -> Array of values for each state 55 | """ 56 | 57 | v = T.zeros(n_states, dtype=FLOAT) 58 | 59 | def update(s, prev_diff, v, reward, tps): 60 | max_v = float("-inf") 61 | v_template = T.zeros_like(v) 62 | for a in range(n_actions): 63 | tp = tps[s, a, :] 64 | max_v = T.largest(max_v, T.dot(tp, reward + discount*v)) 65 | new_diff = abs(v[s] - max_v) 66 | if T.lt(prev_diff, new_diff): 67 | diff = new_diff 68 | else: 69 | diff = prev_diff 70 | return (diff, T.set_subtensor(v_template[s], max_v)), {} 71 | 72 | def until_converged(diff, v): 73 | (diff, vs), _ = th.scan( 74 | fn=update, 75 | outputs_info=[{"initial": diff, "taps": [-1]}, 76 | None], 77 | sequences=[T.arange(n_states)], 78 | non_sequences=[v, reward, transition_probabilities]) 79 | return ((diff[-1], vs.sum(axis=0)), {}, 80 | th.scan_module.until(diff[-1] < threshold)) 81 | 82 | (_, vs), _ = th.scan(fn = until_converged, 83 | outputs_info=[ 84 | # Need to force an inf into the right Theano 85 | # data type and this seems to be the only way that 86 | # works. 87 | {"initial": getattr(np, FLOAT)(float("inf")), 88 | "taps": [-1]}, 89 | {"initial": v, 90 | "taps": [-1]}], 91 | n_steps=1000) 92 | 93 | return vs[-1] 94 | 95 | def find_policy(n_states, n_actions, transition_probabilities, reward, discount, 96 | threshold=1e-2, v=None): 97 | """ 98 | Find the optimal policy. 99 | 100 | n_states: Number of states. int. 101 | n_actions: Number of actions. int. 102 | transition_probabilities: Function taking (state, action, state) to 103 | transition probabilities. 104 | reward: Vector of rewards for each state. 105 | discount: MDP discount factor. float. 106 | threshold: Convergence threshold, default 1e-2. float. 107 | v: Optimal value array (if known). Default None. 108 | -> Action probabilities for each state. 109 | """ 110 | 111 | if v is None: 112 | v = optimal_value(n_states, n_actions, transition_probabilities, reward, 113 | discount, threshold) 114 | 115 | # Get Q using equation 9.2 from Ziebart's thesis. 
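# Q(s, a) = sum_k P(k | s, a) * (r_k + discount * V_k), computed symbolically
# below with a Theano scan over all (state, action) pairs and then normalised
# row-wise with a softmax to give a stochastic policy.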
116 | Q = T.zeros((n_states, n_actions)) 117 | def make_Q(i, j, tps, Q, reward, v): 118 | Q_template = T.zeros_like(Q) 119 | tp = transition_probabilities[i, j, :] 120 | return T.set_subtensor(Q_template[i, j], tp.dot(reward + discount*v)),{} 121 | 122 | prod = np.array(list(product(range(n_states), range(n_actions)))) 123 | state_range = th.shared(prod[:, 0]) 124 | action_range = th.shared(prod[:, 1]) 125 | Qs, _ = th.scan(fn=make_Q, 126 | outputs_info=None, 127 | sequences=[state_range, action_range], 128 | non_sequences=[transition_probabilities, Q, reward, v]) 129 | Q = Qs.sum(axis=0) 130 | Q -= Q.max(axis=1).reshape((n_states, 1)) # For numerical stability. 131 | Q = T.exp(Q)/T.exp(Q).sum(axis=1).reshape((n_states, 1)) 132 | return Q 133 | 134 | def find_expected_svf(n_states, r, n_actions, discount, 135 | transition_probability, trajectories): 136 | """ 137 | Find the expected state visitation frequencies using algorithm 1 from 138 | Ziebart et al. 2008. 139 | 140 | n_states: Number of states N. int. 141 | alpha: Reward. NumPy array with shape (N,). 142 | n_actions: Number of actions A. int. 143 | discount: Discount factor of the MDP. float. 144 | transition_probability: NumPy array mapping (state_i, action, state_k) to 145 | the probability of transitioning from state_i to state_k under action. 146 | Shape (N, A, N). 147 | trajectories: 3D array of state/action pairs. States are ints, actions 148 | are ints. NumPy array with shape (T, L, 2) where T is the number of 149 | trajectories and L is the trajectory length. 150 | -> Expected state visitation frequencies vector with shape (N,). 151 | """ 152 | 153 | n_trajectories = trajectories.shape[0] 154 | trajectory_length = trajectories.shape[1] 155 | 156 | policy = find_policy(n_states, n_actions, 157 | transition_probability, r, discount) 158 | 159 | start_state_count = T.extra_ops.bincount(trajectories[:, 0, 0], 160 | minlength=n_states) 161 | p_start_state = start_state_count.astype(FLOAT)/n_trajectories 162 | 163 | def state_visitation_step(i, j, prev_svf, policy, tps): 164 | """ 165 | The sum of the outputs of a scan over this will be a row of the svf. 166 | """ 167 | 168 | svf = prev_svf[i] * policy[i, j] * tps[i, j, :] 169 | return svf, {} 170 | 171 | prod = np.array(list(product(range(n_states), range(n_actions)))) 172 | state_range = th.shared(prod[:, 0]) 173 | action_range = th.shared(prod[:, 1]) 174 | def state_visitation_row(prev_svf, policy, tps, state_range, action_range): 175 | svf_t, _ = th.scan(fn=state_visitation_step, 176 | sequences=[state_range, action_range], 177 | non_sequences=[prev_svf, policy, tps]) 178 | svf_t = svf_t.sum(axis=0) 179 | return svf_t, {} 180 | 181 | svf, _ = th.scan(fn=state_visitation_row, 182 | outputs_info=[{"initial": p_start_state, "taps": [-1]}], 183 | n_steps=trajectories.shape[1]-1, 184 | non_sequences=[policy, transition_probability, state_range, 185 | action_range]) 186 | 187 | return svf.sum(axis=0) + p_start_state 188 | 189 | def irl(structure, feature_matrix, n_actions, discount, transition_probability, 190 | trajectories, epochs, learning_rate, initialisation="normal", l1=0.1, 191 | l2=0.1): 192 | """ 193 | Find the reward function for the given trajectories. 194 | 195 | structure: Neural network structure tuple, e.g. (10, 3, 3) would be a 196 | 3-layer neural network with 10 inputs. 197 | feature_matrix: Matrix with the nth row representing the nth state. NumPy 198 | array with shape (N, D) where N is the number of states and D is the 199 | dimensionality of the state. 
200 | n_actions: Number of actions A. int. 201 | discount: Discount factor of the MDP. float. 202 | transition_probability: NumPy array mapping (state_i, action, state_k) to 203 | the probability of transitioning from state_i to state_k under action. 204 | Shape (N, A, N). 205 | trajectories: 3D array of state/action pairs. States are ints, actions 206 | are ints. NumPy array with shape (T, L, 2) where T is the number of 207 | trajectories and L is the trajectory length. 208 | epochs: Number of gradient descent steps. int. 209 | learning_rate: Gradient descent learning rate. float. 210 | initialisation: What distribution to use. str in {normal, uniform}. Default 211 | normal. 212 | l1: L1 regularisation. Default 0.1. float. 213 | l2: L2 regularisation. Default 0.1. float. 214 | -> Reward vector with shape (N,). 215 | """ 216 | 217 | n_states, d_states = feature_matrix.shape 218 | transition_probability = th.shared(transition_probability, borrow=True) 219 | trajectories = th.shared(trajectories, borrow=True) 220 | 221 | # Initialise W matrices; b biases. 222 | n_layers = len(structure)-1 223 | weights = [] 224 | hist_w_grads = [] # For AdaGrad. 225 | biases = [] 226 | hist_b_grads = [] # For AdaGrad. 227 | for i in range(n_layers): 228 | # W 229 | shape = (structure[i+1], structure[i]) 230 | if initialisation == "normal": 231 | matrix = th.shared(rn.normal(size=shape), name="W", borrow=True) 232 | else: 233 | matrix = th.shared(rn.uniform(size=shape), name="W", borrow=True) 234 | weights.append(matrix) 235 | hist_w_grads.append(th.shared(np.zeros(shape), name="hdW", borrow=True)) 236 | 237 | # b 238 | shape = (structure[i+1], 1) 239 | if initialisation == "normal": 240 | matrix = th.shared(rn.normal(size=shape), name="b", borrow=True) 241 | else: 242 | matrix = th.shared(rn.uniform(size=shape), name="b", borrow=True) 243 | biases.append(matrix) 244 | hist_b_grads.append(th.shared(np.zeros(shape), name="hdb", borrow=True)) 245 | 246 | # Initialise α weight, β bias. 247 | if initialisation == "normal": 248 | α = th.shared(rn.normal(size=(1, structure[-1])), name="alpha", 249 | borrow=True) 250 | else: 251 | α = th.shared(rn.uniform(size=(1, structure[-1])), name="alpha", 252 | borrow=True) 253 | hist_α_grad = T.zeros(α.shape) # For AdaGrad. 254 | 255 | adagrad_epsilon = 1e-6 # AdaGrad numerical stability. 256 | 257 | #### Theano symbolic setup. #### 258 | 259 | # Symbolic input. 260 | s_feature_matrix = T.matrix("x") 261 | # Feature matrices. 262 | # All dimensions of the form (d_layer, n_states). 263 | φs = [s_feature_matrix.T] 264 | # Forward propagation. 265 | for W, b in zip(weights, biases): 266 | φ = T.nnet.sigmoid(th.compile.ops.Rebroadcast((0, False), (1, True))(b) 267 | + W.dot(φs[-1])) 268 | φs.append(φ) 269 | # φs[1] = φ1 etc. 270 | # Reward. 271 | r = α.dot(φs[-1]).reshape((n_states,)) 272 | # Engineering hack: z-score the reward. 273 | r = (r - r.mean())/r.std() 274 | # Associated feature expectations. 275 | expected_svf = find_expected_svf(n_states, r, 276 | n_actions, discount, 277 | transition_probability, 278 | trajectories) 279 | svf = maxent.find_svf(n_states, trajectories.get_value()) 280 | # Derivatives (backward propagation). 281 | updates = [] 282 | α_grad = φs[-1].dot(svf - expected_svf).T 283 | hist_α_grad += α_grad**2 284 | adj_α_grad = α_grad/(adagrad_epsilon + T.sqrt(hist_α_grad)) 285 | updates.append((α, α + adj_α_grad*learning_rate)) 286 | 287 | def grad_for_state(s, theta, svf_diff, r): 288 | """ 289 | Calculate the gradient with respect to theta for one state. 
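
        In symbols, for a single state s this evaluates to

            (svf[s] - expected_svf[s]) * d r[s] / d theta
                - (l1 * sum(|theta|) + l2 * sum(theta**2)),

        and the enclosing scan sums the result over states before the
        AdaGrad-scaled update is applied.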
290 | """ 291 | 292 | regularisation = abs(theta).sum()*l1 + (theta**2).sum()*l2 293 | return svf_diff[s] * T.grad(r[s], theta) - regularisation, {} 294 | 295 | for i, W in enumerate(weights): 296 | w_grads, _ = th.scan(fn=grad_for_state, 297 | sequences=[T.arange(n_states)], 298 | non_sequences=[W, svf - expected_svf, r]) 299 | w_grad = w_grads.sum(axis=0) 300 | hist_w_grads[i] += w_grad**2 301 | adj_w_grad = w_grad/(adagrad_epsilon + T.sqrt(hist_w_grads[i])) 302 | updates.append((W, W + adj_w_grad*learning_rate)) 303 | for i, b in enumerate(biases): 304 | b_grads, _ = th.scan(fn=grad_for_state, 305 | sequences=[T.arange(n_states)], 306 | non_sequences=[b, svf - expected_svf, r]) 307 | b_grad = b_grads.sum(axis=0) 308 | hist_b_grads[i] += b_grad**2 309 | adj_b_grad = b_grad/(adagrad_epsilon + T.sqrt(hist_b_grads[i])) 310 | updates.append((b, b + adj_b_grad*learning_rate)) 311 | 312 | train = th.function([s_feature_matrix], updates=updates, outputs=r) 313 | run = th.function([s_feature_matrix], outputs=r) 314 | 315 | for e in range(epochs): 316 | reward = train(feature_matrix) 317 | 318 | return reward.reshape((n_states,)) 319 | -------------------------------------------------------------------------------- /hierarchicalrl/sdp_maxent.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements Semi-MDP maximum entropy inverse reinforcement learning (Ziebart et al., 2008) 3 | 4 | Todor Davchev, 2017 5 | t.b.davchev@ed.ac.uk 6 | """ 7 | 8 | from itertools import product 9 | 10 | import numpy as np 11 | import numpy.random as rn 12 | 13 | import sdp_value_iteration as value_iteration 14 | 15 | def irl(options_states, features_matrix, o_feature_matrix, n_actions, n_options, discount, 16 | options_transition_probability, transition_probability, 17 | trajectories, global_trajectories, epochs, learning_rate, int_to_point, options): 18 | """ 19 | Find the reward function for the given trajectories. 20 | 21 | feature_matrix: Matrix with the nth row representing the nth state. NumPy 22 | array with shape (N, D) where N is the number of states and D is the 23 | dimensionality of the state. 24 | n_actions: Number of actions A. int. 25 | discount: Discount factor of the MDP. float. 26 | transition_probability: NumPy array mapping (state_i, action, state_k) to 27 | the probability of transitioning from state_i to state_k under action. 28 | Shape (N, A, N). 29 | trajectories: 3D array of state/action pairs. States are ints, actions 30 | are ints. NumPy array with shape (T, L, 2) where T is the number of 31 | trajectories and L is the trajectory length. 32 | epochs: Number of gradient descent steps. int. 33 | learning_rate: Gradient descent learning rate. float. 34 | -> Reward vector with shape (N,). 35 | """ 36 | 37 | n_states = [np.asarray(i).shape[0] for i in features_matrix] 38 | d_states = [np.asarray(i).shape[1] for i in features_matrix] 39 | on_states = o_feature_matrix.shape[0] 40 | od_states = o_feature_matrix.shape[1] 41 | # n_states, d_states = [features_matrix[i].shape for i in xrange(features_matrix)] 42 | 43 | # Initialise weights. 44 | alpha = [rn.uniform(size=(d_st,)) for d_st in d_states] 45 | o_alpha = rn.uniform(size=(od_states,)) 46 | 47 | # option = 0 48 | # Calculate the feature expectations \tilde{phi}. 49 | 50 | # change the samples to go from option through option etc ... 
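    # Sketch of the update scheme implemented below (descriptive only):
    #     phi_tilde[o]  empirical feature expectations for option o
    #     E[svf | o]    expected state visitation frequencies under the current
    #                   reward, restricted to the states of option o
    #     grad[o] = phi_tilde[o] - Phi[o]^T . E[svf | o]
    #     o_grad  = phi_tilde_options - Phi_options^T . E[option svf]
    # alpha[o] and o_alpha are then moved along these gradients, scaled by
    # learning_rate, as in standard MaxEnt IRL.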
51 |     feature_expectations, options_feature_expectations = find_feature_expectations(
52 |         features_matrix, o_feature_matrix, trajectories, global_trajectories, options_states)
53 | 
54 |     # Gradient ascent on alpha.
55 |     for i in range(epochs):
56 |         # print("i: {}".format(i))
57 |         r = np.asarray([np.asarray(features_matrix[opt]).dot(alpha[opt]) for opt in range(n_options)])
58 |         r_o = o_feature_matrix.dot(o_alpha)
59 |         expected_svf, options_expected_svf = find_expected_svf(
60 |             options_states, on_states, n_states, r_o, r,
61 |             n_actions, n_options, discount, options_transition_probability,
62 |             transition_probability, trajectories, global_trajectories)
63 |         # not for option 0 only but for all options
64 |         modif_expected_svf = [
65 |             [
66 |                 [
67 |                     item for idx, item in enumerate(expected_svf[opt]) if idx == opt_state]
68 |                 for opt_state in options_states[opt]]
69 |             for opt in range(n_options)]
70 |         grad = [feature_expectations[opt] - np.asarray(features_matrix[opt]).T.dot(modif_expected_svf[opt]).reshape((n_states[opt],)) for opt in range(n_options)]
71 |         modif_opt_exp_svf = [[value for idx, value in enumerate(options_expected_svf) if int_to_point(idx) == opt["termination"]][0] for opt in options]
72 |         o_grad = options_feature_expectations - o_feature_matrix.T.dot(modif_opt_exp_svf)
73 | 
74 |         alpha = [alpha[opt] + learning_rate * grad[opt] for opt in range(n_options)]  # element-wise; "alpha += [...]" would extend the list instead of updating the weights
75 |         o_alpha += learning_rate * o_grad
76 | 
77 |     return [np.asarray(features_matrix[opt]).dot(alpha[opt]).reshape((n_states[opt],)) for opt in range(n_options)],\
78 |         o_feature_matrix.dot(o_alpha).reshape((n_options,))
79 | 
80 | def find_svf(n_states, trajectories):
81 |     """
82 |     Find the state visitation frequency from trajectories.
83 | 
84 |     n_states: Number of states. int.
85 |     trajectories: 3D array of state/action pairs. States are ints, actions
86 |         are ints. NumPy array with shape (T, L, 2) where T is the number of
87 |         trajectories and L is the trajectory length.
88 |     -> State visitation frequencies vector with shape (N,).
89 |     """
90 | 
91 |     svf = np.zeros(n_states)
92 | 
93 |     for trajectory in trajectories:
94 |         for state, _, _ in trajectory:
95 |             svf[state] += 1
96 | 
97 |     svf /= trajectories.shape[0]
98 | 
99 |     return svf
100 | 
101 | def find_feature_expectations(feature_matrix, o_feature_matrix, trajectories, global_trajectories, options_states):
102 |     """
103 |     Find the feature expectations for the given trajectories. This is the
104 |     average path feature vector.
105 | 
106 |     feature_matrix: Matrix with the nth row representing the nth state. NumPy
107 |         array with shape (N, D) where N is the number of states and D is the
108 |         dimensionality of the state.
109 |     trajectories: 3D array of state/action pairs. States are ints, actions
110 |         are ints. NumPy array with shape (T, L, 2) where T is the number of
111 |         trajectories and L is the trajectory length.
112 |     -> Feature expectations vector with shape (D,).
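
    In symbols, for each option o this computes

        phi_tilde[o] = (1 / T_o) * sum over trajectories and steps of the
                       feature_matrix[o] row of the visited state,

    where T_o is the number of trajectories recorded for option o; the
    option-level expectation is the analogous average of o_feature_matrix rows
    over the options used in the global trajectories.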
113 |     """
114 |     option_feature_expectations = np.zeros(len(o_feature_matrix))  # how many options there are and which one was used the most
115 |     feature_expectations = [np.zeros(len(feature_matrix[i])) for i in xrange(len(options_states))]
116 |     for br, option_states in enumerate(options_states):
117 |         for trajectory in trajectories[br]:
118 |             # for state, _, _ in trajectory:
119 |             for traj_id in trajectory:
120 |                 feature_expectations[br] += feature_matrix[br][
121 |                     [idx for idx, state in enumerate(option_states) if state == traj_id[0]][0]]
122 | 
123 |         feature_expectations[br] /= trajectories[br].shape[0]
124 | 
125 |     for global_traj in global_trajectories:
126 |         for option_used in global_traj:
127 |             option_feature_expectations += o_feature_matrix[option_used[1]]
128 | 
129 |     option_feature_expectations /= global_trajectories.shape[0]
130 | 
131 |     return feature_expectations, option_feature_expectations
132 | 
133 | def find_expected_svf(options_states, on_states, n_states, r_o, r, n_actions, n_options, discount,
134 |                       options_transition_probability, transition_probability, trajectories,
135 |                       global_trajectories):
136 |     """
137 |     Find the expected state visitation frequencies using algorithm 1 from
138 |     Ziebart et al. 2008.
139 | 
140 |     n_states: Number of states N. int.
141 |     r: Per-option reward arrays; r_o: option-level reward vector with shape (n_options,).
142 |     n_actions: Number of actions A. int.
143 |     discount: Discount factor of the MDP. float.
144 |     transition_probability: NumPy array mapping (state_i, action, state_k) to
145 |         the probability of transitioning from state_i to state_k under action.
146 |         Shape (N, A, N).
147 |     trajectories: 3D array of state/action pairs. States are ints, actions
148 |         are ints. NumPy array with shape (T, L, 2) where T is the number of
149 |         trajectories and L is the trajectory length.
150 |     -> Expected state visitation frequencies vector with shape (N,).
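
    The recursion applied below is algorithm 1 from Ziebart et al. 2008, run
    once at the option level and once within each option:

        D_0(s) = p(s is a start state)
        D_t(k) = sum_i sum_j D_{t-1}(i) * policy(j | i) * P(k | i, j)

    and the returned frequencies are sum_t D_t.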
151 | """ 152 | 153 | n_trajectories = trajectories[0].shape[0] 154 | trajectory_lengths = trajectories[0].shape[1] 155 | 156 | # policy = find_policy(n_states, r, n_actions, discount, 157 | # transition_probability) 158 | 159 | # policy = [[] for _ in range(len(options_states))] 160 | policy = value_iteration.find_policy(options_states, n_states, n_actions, n_options, 161 | options_transition_probability, 162 | transition_probability, r_o, r, discount) 163 | 164 | options_policy = value_iteration.find_option_policy( 165 | options_states, n_states, n_actions, n_options, options_transition_probability, 166 | transition_probability, r_o, r, discount) 167 | 168 | # option-to-option 169 | opt_start_state_count = np.zeros(121) 170 | global_trajectory_length = 0 171 | length = 0 172 | for gl_trajectory in global_trajectories: 173 | count = 0 174 | opt_start_state_count[gl_trajectory[0][0][0]] += 1 175 | for trajectory in gl_trajectory: 176 | count += len(trajectories[0]) 177 | 178 | if count > length: 179 | length = count 180 | 181 | op_start_state = opt_start_state_count/n_trajectories 182 | 183 | opt_expected_svf = np.tile(op_start_state, (length, 1)).T 184 | for t in range(1, length): 185 | opt_expected_svf[:, t] = 0 186 | for i, j, k in product(range(121), range(n_options), range(121)): 187 | opt_expected_svf[k, t] += (opt_expected_svf[i, t-1] * 188 | options_policy[i, j] * # Stochastic policy 189 | options_transition_probability[i, j, k]) 190 | 191 | options_result = opt_expected_svf.sum(axis=1) 192 | 193 | # intra-options 194 | start_state_count = np.zeros((8, 121)) 195 | p_start_state = [] 196 | for option in range(n_options): 197 | for trajectory in trajectories[option]: 198 | start_state_count[option][trajectory[0, 0]] += 1 199 | p_start_state.append(start_state_count[option]/n_trajectories) 200 | result = [] 201 | expected_svf = [np.tile(p_start_state[opt], (trajectory_lengths, 1)).T for opt in range(len(options_states))] 202 | ids = [[ 203 | 56, 45, 44, 46, 47, 48, 204 | 33, 34, 35, 36, 37, 205 | 22, 23, 24, 25, 26, 27, 206 | 11, 12, 13, 14, 15, 207 | 0, 1, 2, 3, 4 208 | ], 209 | [ 210 | 27, 26, 15, 4, 37, 48, 211 | 3, 14, 25, 36, 47, 212 | 2, 13, 24, 35, 46, 213 | 1, 12, 23, 34, 45, 214 | 0, 11, 22, 33, 44, 56 215 | ], 216 | [ 217 | 27, 28, 17, 39, 17, 6, 39, 50, 61, 218 | 62, 51, 40, 29, 18, 7, 219 | 8, 19, 30, 41, 52, 63, 74, 220 | 9, 20, 31, 42, 53, 64, 221 | 10, 21, 32, 43, 54, 65 222 | ], 223 | [ 224 | 74, 63, 62, 61, 64, 65, 225 | 50, 51, 52, 53, 54, 226 | 39, 40, 41, 42, 43, 227 | 32, 31, 30, 29, 28, 27, 228 | 17, 18, 19, 20, 21, 6, 7, 8, 9, 10 229 | ], 230 | [ 231 | 74, 85, 84, 83, 86, 87, 232 | 94, 95, 96, 97, 98, 233 | 109, 108, 107, 106, 105, 104, 234 | 116, 117, 118, 119, 120 235 | ], 236 | [ 237 | 104, 105, 116, 94, 83, 238 | 84, 95, 106, 117, 239 | 118, 107, 96, 85, 74, 240 | 86, 97, 108, 119, 241 | 87, 98, 109, 120 242 | ], 243 | [ 244 | 104, 103, 114, 92, 81, 70, 245 | 69, 80, 91, 102, 113, 246 | 68, 79, 90, 101, 112, 247 | 67, 56, 78, 89, 100, 111, 248 | 66, 77, 88, 99, 110 249 | ], 250 | [ 251 | 56, 67, 66, 68, 69, 70, 252 | 77, 78, 79, 80, 81, 253 | 88, 89, 90, 91, 92, 254 | 99, 100, 101, 102, 103, 104, 255 | 110, 111, 112, 113, 114 256 | ]] 257 | for o in range(len(ids)): 258 | for t in range(1, trajectory_lengths): 259 | expected_svf[o][:, t] = 0 260 | for i, j, k in product(ids[o], range(n_actions),ids[o]): 261 | if i in options_states[o]: 262 | idme = [idx for idx, state in enumerate(options_states[o]) if state == i][0] 263 | # Stochastic policy 264 | 
expected_svf[o][k, t] += (expected_svf[o][i, t-1] * policy[o][idme, j] * 265 | transition_probability[o][i, j, k]) 266 | else: 267 | expected_svf[o][k, t] = 0 268 | 269 | result.append(expected_svf[o].sum(axis=1)) 270 | 271 | return result, options_result 272 | 273 | def softmax(x1, x2): 274 | """ 275 | Soft-maximum calculation, from algorithm 9.2 in Ziebart's PhD thesis. 276 | 277 | x1: float. 278 | x2: float. 279 | -> softmax(x1, x2) 280 | """ 281 | 282 | max_x = max(x1, x2) 283 | min_x = min(x1, x2) 284 | return max_x + np.log(1 + np.exp(min_x - max_x)) 285 | 286 | def find_policy(n_states, r, n_actions, discount, 287 | transition_probability): 288 | """ 289 | Find a policy with linear value iteration. Based on the code accompanying 290 | the Levine et al. GPIRL paper and on Ziebart's PhD thesis (algorithm 9.1). 291 | 292 | n_states: Number of states N. int. 293 | r: Reward. NumPy array with shape (N,). 294 | n_actions: Number of actions A. int. 295 | discount: Discount factor of the MDP. float. 296 | transition_probability: NumPy array mapping (state_i, action, state_k) to 297 | the probability of transitioning from state_i to state_k under action. 298 | Shape (N, A, N). 299 | -> NumPy array of states and the probability of taking each action in that 300 | state, with shape (N, A). 301 | """ 302 | 303 | # V = value_iteration.value(n_states, transition_probability, r, discount) 304 | 305 | # NumPy's dot really dislikes using inf, so I'm making everything finite 306 | # using nan_to_num. 307 | V = np.nan_to_num(np.ones((n_states, 1)) * float("-inf")) 308 | 309 | diff = np.ones((n_states,)) 310 | while (diff > 1e-4).all(): # Iterate until convergence. 311 | new_V = r.copy() 312 | for j in range(n_actions): 313 | for i in range(n_states): 314 | new_V[i] = softmax(new_V[i], r[i] + discount* 315 | np.sum(transition_probability[i, j, k] * V[k] 316 | for k in range(n_states))) 317 | 318 | # # This seems to diverge, so we z-score it (engineering hack). 319 | new_V = (new_V - new_V.mean())/new_V.std() 320 | 321 | diff = abs(V - new_V) 322 | V = new_V 323 | 324 | # We really want Q, not V, so grab that using equation 9.2 from the thesis. 325 | Q = np.zeros((n_states, n_actions)) 326 | for i in range(n_states): 327 | for j in range(n_actions): 328 | p = np.array([transition_probability[i, j, k] 329 | for k in range(n_states)]) 330 | Q[i, j] = p.dot(r + discount*V) 331 | 332 | # Softmax by row to interpret these values as probabilities. 333 | Q -= Q.max(axis=1).reshape((n_states, 1)) # For numerical stability. 334 | Q = np.exp(Q)/np.exp(Q).sum(axis=1).reshape((n_states, 1)) 335 | return Q 336 | 337 | def expected_value_difference(n_states, n_actions, transition_probability, 338 | reward, discount, p_start_state, optimal_value, true_reward): 339 | """ 340 | Calculate the expected value difference, which is a proxy to how good a 341 | recovered reward function is. 342 | 343 | n_states: Number of states. int. 344 | n_actions: Number of actions. int. 345 | transition_probability: NumPy array mapping (state_i, action, state_k) to 346 | the probability of transitioning from state_i to state_k under action. 347 | Shape (N, A, N). 348 | reward: Reward vector mapping state int to reward. Shape (N,). 349 | discount: Discount factor. float. 350 | p_start_state: Probability vector with the ith component as the probability 351 | that the ith state is the start state. Shape (N,). 352 | optimal_value: Value vector for the ground reward with optimal policy. 353 | The ith component is the value of the ith state. 
Shape (N,).
354 |         true_reward: True reward vector. Shape (N,).
355 |     -> Expected value difference. float.
356 |     """
357 | 
358 |     policy = value_iteration.find_policy(n_states, n_actions,
359 |                                          transition_probability, reward, discount)
360 |     value = value_iteration.value(policy.argmax(axis=1), n_states,
361 |                                   transition_probability, true_reward, discount)
362 | 
363 |     evd = optimal_value.dot(p_start_state) - value.dot(p_start_state)
364 |     return evd
365 | 
--------------------------------------------------------------------------------
/examples/experiments.py:
--------------------------------------------------------------------------------
1 | """
2 | Perform the experiments from the report.
3 | 
4 | Matthew Alger, 2015
5 | matthew.alger@anu.edu.au
6 | """
7 | 
8 | from time import time
9 | from sys import stdout
10 | 
11 | import sys
12 | sys.path.append("/home/todor/Documents/workspace/smdp")
13 | 
14 | import numpy as np
15 | import matplotlib.pyplot as plt
16 | 
17 | from irl import maxent
18 | from irl import deep_maxent
19 | from irl import value_iteration
20 | from irl.mdp.gridworld import Gridworld
21 | from irl.mdp.objectworld import Objectworld
22 | 
23 | def test_gw_once(grid_size, feature_map, n_samples, epochs, structure):
24 |     """
25 |     Test MaxEnt and DeepMaxEnt on a gridworld of size grid_size with the
26 |     feature map feature_map, using n_samples sampled paths.
27 | 
28 |     grid_size: Grid size. int.
29 |     feature_map: Which feature map to use. String in {ident, coord, proxi}.
30 |     n_samples: Number of paths to sample.
31 |     epochs: Number of epochs to run MaxEnt with.
32 |     structure: Neural network structure tuple, e.g. (3, 3) would be a
33 |         3-layer neural network with assumed inputs.
34 |     -> Expected value difference for MaxEnt, DeepMaxEnt
35 |     """
36 | 
37 |     # Basic gist of what we're doing here: Get the reward function using our
38 |     # different IRL methods, use those to get a policy, evaluate that policy
39 |     # using the true reward, and then return the difference in expected values.
40 | 
41 |     # Setup parameters.
42 |     wind = 0.3
43 |     discount = 0.9
44 |     learning_rate = 0.01
45 |     trajectory_length = 3*grid_size
46 | 
47 |     # Make the gridworld and associated data.
48 |     gw = Gridworld(grid_size, wind, discount)
49 |     feature_matrix = gw.feature_matrix(feature_map)
50 |     ground_reward = np.array([gw.reward(i) for i in range(gw.n_states)])
51 |     optimal_policy = value_iteration.find_policy(gw.n_states,
52 |                                                  gw.n_actions,
53 |                                                  gw.transition_probability,
54 |                                                  ground_reward,
55 |                                                  discount).argmax(axis=1)
56 |     trajectories = gw.generate_trajectories(n_samples,
57 |                                             trajectory_length,
58 |                                             optimal_policy.take)
59 |     p_start_state = (np.bincount(trajectories[:, 0, 0], minlength=gw.n_states) /
60 |                      trajectories.shape[0])
61 | 
62 |     # True value.
63 |     optimal_V = value_iteration.optimal_value(gw.n_states,
64 |                                               gw.n_actions,
65 |                                               gw.transition_probability,
66 |                                               ground_reward, gw.discount)
67 | 
68 |     # MaxEnt reward; policy; value.
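    # The "MaxEnt" baseline is obtained by calling deep_maxent.irl with the
    # flat structure (feature_matrix.shape[1],), i.e. no hidden layers, so the
    # recovered reward is a linear function of the features (z-scored inside
    # irl).  The quantity reported below is the expected value difference
    #     EVD = optimal_V.dot(p_start_state) - maxent_V.dot(p_start_state)
    # (smaller is better).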
69 | maxent_reward = deep_maxent.irl((feature_matrix.shape[1],), 70 | feature_matrix, 71 | gw.n_actions, 72 | gw.discount, 73 | gw.transition_probability, 74 | trajectories, epochs, learning_rate) 75 | 76 | maxent_policy = value_iteration.find_policy(gw.n_states, 77 | gw.n_actions, 78 | gw.transition_probability, 79 | maxent_reward, 80 | discount).argmax(axis=1) 81 | maxent_V = value_iteration.value(maxent_policy, 82 | gw.n_states, 83 | gw.transition_probability, 84 | ground_reward, 85 | gw.discount) 86 | maxent_EVD = optimal_V.dot(p_start_state) - maxent_V.dot(p_start_state) 87 | 88 | # DeepMaxEnt reward; policy; value. 89 | deep_maxent_reward = deep_maxent.irl((feature_matrix.shape[1],)+structure, 90 | feature_matrix, 91 | gw.n_actions, 92 | gw.discount, 93 | gw.transition_probability, 94 | trajectories, epochs, learning_rate) 95 | deep_maxent_policy = value_iteration.find_policy(gw.n_states, 96 | gw.n_actions, 97 | gw.transition_probability, 98 | deep_maxent_reward, 99 | discount).argmax(axis=1) 100 | deep_maxent_V = value_iteration.value(deep_maxent_policy, 101 | gw.n_states, 102 | gw.transition_probability, 103 | ground_reward, 104 | gw.discount) 105 | deep_maxent_EVD = (optimal_V.dot(p_start_state) - 106 | deep_maxent_V.dot(p_start_state)) 107 | 108 | plt.subplot(3, 3, 1) 109 | plt.pcolor(ground_reward.reshape((grid_size, grid_size))) 110 | plt.title("Groundtruth reward") 111 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 112 | bottom=False, top=False, left=False, right=False, 113 | labelright=False) 114 | plt.subplot(3, 3, 2) 115 | plt.pcolor(maxent_reward.reshape((grid_size, grid_size))) 116 | plt.title("MaxEnt reward") 117 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 118 | bottom=False, top=False, left=False, right=False, 119 | labelright=False) 120 | plt.subplot(3, 3, 3) 121 | plt.pcolor(deep_maxent_reward.reshape((grid_size, grid_size))) 122 | plt.title("DeepMaxEnt reward") 123 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 124 | bottom=False, top=False, left=False, right=False, 125 | labelright=False) 126 | 127 | plt.subplot(3, 3, 4) 128 | plt.pcolor(optimal_policy.reshape((grid_size, grid_size)), vmin=0, vmax=3) 129 | plt.title("Optimal policy") 130 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 131 | bottom=False, top=False, left=False, right=False, 132 | labelright=False) 133 | plt.subplot(3, 3, 5) 134 | plt.pcolor(maxent_policy.reshape((grid_size, grid_size)), vmin=0, vmax=3) 135 | plt.title("MaxEnt policy") 136 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 137 | bottom=False, top=False, left=False, right=False, 138 | labelright=False) 139 | plt.subplot(3, 3, 6) 140 | plt.pcolor(deep_maxent_policy.reshape((grid_size, grid_size)), 141 | vmin=0, vmax=3) 142 | plt.title("DeepMaxEnt policy") 143 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 144 | bottom=False, top=False, left=False, right=False, 145 | labelright=False) 146 | 147 | plt.subplot(3, 3, 7) 148 | plt.pcolor(optimal_V.reshape((grid_size, grid_size))) 149 | plt.title("Optimal value") 150 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 151 | bottom=False, top=False, left=False, right=False, 152 | labelright=False) 153 | plt.subplot(3, 3, 8) 154 | plt.pcolor(maxent_V.reshape((grid_size, grid_size))) 155 | plt.title("MaxEnt value") 156 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 157 | bottom=False, top=False, left=False, right=False, 
158 | labelright=False) 159 | plt.subplot(3, 3, 9) 160 | plt.pcolor(deep_maxent_V.reshape((grid_size, grid_size))) 161 | plt.title("DeepMaxEnt value") 162 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 163 | bottom=False, top=False, left=False, right=False, 164 | labelright=False) 165 | plt.savefig("{}_{}_{}_{}gridworld{}.png".format(grid_size, feature_map, 166 | n_samples, epochs, structure, np.random.randint(10000000))) 167 | 168 | 169 | return maxent_EVD, deep_maxent_EVD 170 | 171 | def test_ow_once(grid_size, n_objects, n_colours, discrete, l1, l2, n_samples, 172 | epochs, structure): 173 | """ 174 | Test MaxEnt and DeepMaxEnt on a ow of size grid_size with the feature 175 | map feature_map with n_samples paths. 176 | 177 | grid_size: Grid size. int. 178 | n_objects: Number of objects. int. 179 | n_colours: Number of colours. int. 180 | discrete: Whether the features should be discrete. bool. 181 | l1: L1 regularisation. float. 182 | l2: L2 regularisation. float. 183 | n_samples: Number of paths to sample. 184 | epochs: Number of epochs to run MaxEnt with. 185 | structure: Neural network structure tuple, e.g. (3, 3) would be a 186 | 3-layer neural network with assumed inputs. 187 | -> Expected value difference for MaxEnt, DeepMaxEnt 188 | """ 189 | 190 | # Basic gist of what we're doing here: Get the reward function using our 191 | # different IRL methods, use those to get a policy, evaluate that policy 192 | # using the true reward, and then return the difference in expected values. 193 | 194 | # Setup parameters. 195 | wind = 0.3 196 | discount = 0.9 197 | learning_rate = 0.01 198 | trajectory_length = 3*grid_size 199 | 200 | # Make the objectworld and associated data. 201 | ow = Objectworld(grid_size, n_objects, n_colours, wind, discount) 202 | feature_matrix = ow.feature_matrix(discrete) 203 | ground_reward = np.array([ow.reward(i) for i in range(ow.n_states)]) 204 | optimal_policy = value_iteration.find_policy(ow.n_states, 205 | ow.n_actions, 206 | ow.transition_probability, 207 | ground_reward, 208 | discount).argmax(axis=1) 209 | trajectories = ow.generate_trajectories(n_samples, 210 | trajectory_length, 211 | optimal_policy.take) 212 | p_start_state = (np.bincount(trajectories[:, 0, 0], minlength=ow.n_states) / 213 | trajectories.shape[0]) 214 | 215 | # True value. 216 | optimal_V = value_iteration.optimal_value(ow.n_states, 217 | ow.n_actions, 218 | ow.transition_probability, 219 | ground_reward, ow.discount) 220 | 221 | # MaxEnt reward; policy; value. 222 | maxent_reward = deep_maxent.irl((feature_matrix.shape[1],), 223 | feature_matrix, 224 | ow.n_actions, 225 | ow.discount, 226 | ow.transition_probability, 227 | trajectories, epochs, learning_rate, 228 | l1=l1, l2=l2) 229 | 230 | maxent_policy = value_iteration.find_policy(ow.n_states, 231 | ow.n_actions, 232 | ow.transition_probability, 233 | maxent_reward, 234 | discount).argmax(axis=1) 235 | maxent_V = value_iteration.value(maxent_policy, 236 | ow.n_states, 237 | ow.transition_probability, 238 | ground_reward, 239 | ow.discount) 240 | maxent_EVD = optimal_V.dot(p_start_state) - maxent_V.dot(p_start_state) 241 | 242 | # DeepMaxEnt reward; policy; value. 243 | deep_learning_rate = 0.005 # For the 32 x 32 experiments. 
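    # (feature_matrix.shape[1],) + structure prepends the input dimensionality,
    # so e.g. structure=(3, 3) gives a network of shape (D, 3, 3): D inputs and
    # two hidden layers of width 3, matching the wording in the docstrings.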
244 | deep_maxent_reward = deep_maxent.irl((feature_matrix.shape[1],)+structure, 245 | feature_matrix, 246 | ow.n_actions, 247 | ow.discount, 248 | ow.transition_probability, 249 | trajectories, epochs, 250 | deep_learning_rate, 251 | l1=l1, l2=l2) 252 | 253 | deep_maxent_policy = value_iteration.find_policy(ow.n_states, 254 | ow.n_actions, 255 | ow.transition_probability, 256 | deep_maxent_reward, 257 | discount).argmax(axis=1) 258 | deep_maxent_V = value_iteration.value(deep_maxent_policy, 259 | ow.n_states, 260 | ow.transition_probability, 261 | ground_reward, 262 | ow.discount) 263 | 264 | deep_maxent_EVD = (optimal_V.dot(p_start_state) - 265 | deep_maxent_V.dot(p_start_state)) 266 | 267 | plt.subplot(3, 3, 1) 268 | plt.pcolor(ground_reward.reshape((grid_size, grid_size))) 269 | plt.title("Groundtruth reward") 270 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 271 | bottom=False, top=False, left=False, right=False, labelright=False) 272 | plt.subplot(3, 3, 2) 273 | plt.pcolor(maxent_reward.reshape((grid_size, grid_size))) 274 | plt.title("MaxEnt reward") 275 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 276 | bottom=False, top=False, left=False, right=False, labelright=False) 277 | plt.subplot(3, 3, 3) 278 | plt.pcolor(deep_maxent_reward.reshape((grid_size, grid_size))) 279 | plt.title("DeepMaxEnt reward") 280 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 281 | bottom=False, top=False, left=False, right=False, labelright=False) 282 | 283 | plt.subplot(3, 3, 4) 284 | plt.pcolor(optimal_policy.reshape((grid_size, grid_size)), vmin=0, vmax=3) 285 | plt.title("Optimal policy") 286 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 287 | bottom=False, top=False, left=False, right=False, labelright=False) 288 | plt.subplot(3, 3, 5) 289 | plt.pcolor(maxent_policy.reshape((grid_size, grid_size)), vmin=0, vmax=3) 290 | plt.title("MaxEnt policy") 291 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 292 | bottom=False, top=False, left=False, right=False, labelright=False) 293 | plt.subplot(3, 3, 6) 294 | plt.pcolor(deep_maxent_policy.reshape((grid_size, grid_size)), 295 | vmin=0, vmax=3) 296 | plt.title("DeepMaxEnt policy") 297 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 298 | bottom=False, top=False, left=False, right=False, labelright=False) 299 | 300 | plt.subplot(3, 3, 7) 301 | plt.pcolor(optimal_V.reshape((grid_size, grid_size))) 302 | plt.title("Optimal value") 303 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 304 | bottom=False, top=False, left=False, right=False, labelright=False) 305 | plt.subplot(3, 3, 8) 306 | plt.pcolor(maxent_V.reshape((grid_size, grid_size))) 307 | plt.title("MaxEnt value") 308 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 309 | bottom=False, top=False, left=False, right=False, labelright=False) 310 | plt.subplot(3, 3, 9) 311 | plt.pcolor(deep_maxent_V.reshape((grid_size, grid_size))) 312 | plt.title("DeepMaxEnt value") 313 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 314 | bottom=False, top=False, left=False, right=False, labelright=False) 315 | plt.savefig("{}_{}_{}_{}_{}_{}_{}_{}_{}_objectworld_{}.png".format( 316 | grid_size, n_objects, n_colours, discrete, n_samples, epochs, structure, 317 | l1, l2, np.random.randint(10000000))) 318 | 319 | return maxent_EVD, deep_maxent_EVD 320 | 321 | def test_gw_over_samples(grid_size, feature_map, epochs, structure, 
n): 322 | """ 323 | Test MaxEnt and DeepMaxEnt on a gridworld of size grid_size with the feature 324 | map feature_map with different numbers of paths. 325 | 326 | grid_size: Grid size. int. 327 | feature_map: Which feature map to use. String in {ident, coord, proxi}. 328 | epochs: MaxEnt iterations. int. 329 | structure: Neural network structure tuple, e.g. (3, 3) would be a 330 | 3-layer neural network with assumed inputs. 331 | n: Iterations. int. 332 | -> (MaxEnt [(n_samples, mean expected value difference, stdev)], 333 | DeepMaxEnt [(n_samples, mean expected value difference, stdev)]), 334 | raw data (maxent_data, deep_maxent_data) 335 | """ 336 | 337 | maxent_data = [] 338 | deep_maxent_data = [] 339 | for n_samples in (32,): 340 | t = time() 341 | maxent_EVDs = [] 342 | deep_maxent_EVDs = [] 343 | for i in range(n): 344 | print("{}: {}/{}".format(n_samples, i+1, n)) 345 | maxent_EVD, deep_maxent_EVD = test_gw_once(grid_size, feature_map, 346 | n_samples, epochs, 347 | structure) 348 | maxent_EVDs.append(maxent_EVD) 349 | deep_maxent_EVDs.append(deep_maxent_EVD) 350 | print(maxent_EVD, deep_maxent_EVD) 351 | stdout.flush() 352 | maxent_data.append((n_samples, np.mean(maxent_EVDs), 353 | np.std(maxent_EVDs))) 354 | deep_maxent_data.append((n_samples, np.mean(deep_maxent_EVDs), 355 | np.std(deep_maxent_EVDs))) 356 | print("{} (took {:.02}s)".format(n_samples, time() - t)) 357 | print("MaxEnt:", maxent_data) 358 | print("DeepMaxEnt:", deep_maxent_data) 359 | return maxent_data, deep_maxent_data 360 | 361 | def test_ow_over_samples(grid_size, n_objects, n_colours, discrete, l1, l2, 362 | epochs, structure, n): 363 | """ 364 | Test MaxEnt and DeepMaxEnt on an objectworld with different numbers of paths. 365 | 366 | grid_size: Grid size. int. 367 | n_objects: Number of objects. int. 368 | n_colours: Number of colours. int. 369 | discrete: Whether the features should be discrete. bool. 370 | feature_map: Which feature map to use. String in {ident, coord, proxi}. 371 | l1: L1 regularisation. float. 372 | l2: L2 regularisation. float. 373 | epochs: MaxEnt iterations. int. 374 | structure: Neural network structure tuple, e.g. (3, 3) would be a 375 | 3-layer neural network with assumed inputs. 376 | n: Iterations. int. 377 | -> (MaxEnt [(n_samples, mean expected value difference, stdev)], 378 | DeepMaxEnt [(n_samples, mean expected value difference, stdev)]), 379 | raw data (maxent_data, deep_maxent_data) 380 | """ 381 | 382 | maxent_data = [] 383 | deep_maxent_data = [] 384 | for n_samples in (32, 16, 8, 4): 385 | t = time() 386 | maxent_EVDs = [] 387 | deep_maxent_EVDs = [] 388 | for i in range(n): 389 | print("{}: {}/{}".format(n_samples, i+1, n)) 390 | maxent_EVD, deep_maxent_EVD = test_ow_once(grid_size, n_objects, 391 | n_colours, discrete, l1, l2, n_samples, epochs, structure) 392 | maxent_EVDs.append(maxent_EVD) 393 | deep_maxent_EVDs.append(deep_maxent_EVD) 394 | print(maxent_EVD, deep_maxent_EVD) 395 | stdout.flush() 396 | maxent_data.append((n_samples, np.mean(maxent_EVDs), 397 | np.median(maxent_EVDs), np.std(maxent_EVDs))) 398 | deep_maxent_data.append((n_samples, np.mean(deep_maxent_EVDs), 399 | np.median(deep_maxent_EVDs), np.std(deep_maxent_EVDs))) 400 | print("{} (took {:.02}s)".format(n_samples, time() - t)) 401 | print("MaxEnt:", maxent_data) 402 | print("DeepMaxEnt:", deep_maxent_data) 403 | return maxent_data, deep_maxent_data 404 | 405 | if __name__ == '__main__': 406 | # Tests the 16 x 16 objectworld. 
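    # (For a quick smoke test, a much smaller configuration such as
    #     test_ow_over_samples(8, 10, 2, False, 0, 0, 20, (3, 3), 1)
    # runs far faster; those parameter values are illustrative only.)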
407 |     print(test_ow_over_samples(16, 25, 2, False, 0, 0, 150, (3, 3), 10))
408 |     # Tests the 32 x 32 objectworld.
409 |     print(test_ow_over_samples(32, 50, 2, False, 0, 0, 250, (3, 3), 5))
--------------------------------------------------------------------------------
/hierarchicalrl/options_grid_world.py:
--------------------------------------------------------------------------------
1 | """
2 | Implements the options gridworld MDP.
3 | 
4 | Todor Davchev, 2017
5 | t.b.davchev@ed.ac.uk
6 | """
7 | 
8 | import numpy as np
9 | import numpy.random as rn
10 | 
11 | 
12 | class Large_Gridworld(object):
13 |     """
14 |     Gridworld MDP.
15 |     """
16 | 
17 |     def __init__(self, grid_size, walls, options, rooms, wind, discount):
18 |         """
19 |         grid_size: Grid size. int.
20 |         wind: Chance of moving randomly. float.
21 |         discount: MDP discount. float.
22 |         -> Gridworld
23 |         """
24 |         self.count = 0
25 |         self.actions = ((1, 0), (0, 1), (-1, 0), (0, -1))
26 |         self.options = options
27 |         self.rooms = rooms
28 |         self.n_actions = len(self.actions)
29 |         self.n_states = grid_size**2
30 |         self.grid_size = grid_size
31 |         self.wind = wind
32 |         self.discount = discount
33 |         self.walls = walls
34 |         self.init_states = [(5, 2), (1, 5), (8, 6), (5, 9)]
35 |         self.term_states = [(5, 2), (1, 5), (8, 6), (5, 9)]
36 |         self.n_options = len(self.options)
37 |         self.k_length = 10
38 |         self.gamma = 0.9
39 |         self.count = 0
40 |         # Preconstruct the transition probability array.
41 |         # self.transition_probability = np.array(
42 |         #     [[[self._transition_probability(i, j, k)
43 |         #        for k in range(self.n_states)]
44 |         #       for j in range(self.n_actions)]
45 |         #      for i in range(self.n_states)])
46 | 
47 |         # I think this is wrong
48 |         # self.reward_o = np.array(
49 |         #     [self._reward_o(option)
50 |         #      for option in self.options[:2]]
51 |         # )
52 | 
53 |         # Preconstruct the transition probability array.
54 |         self.improved_transition_probability = np.array(
55 |             [[[[self._improved_transition_probability(o, i, j, k)
56 |                 for k in range(self.n_states)]
57 |                for j in range(self.n_actions)]
58 |               for i in range(self.n_states)]
59 |              for o in range(self.n_options)])
60 |         # for o in range(self.n_options)])
61 | 
62 |         # Preconstruct the transition probability array.
63 |         # after done, initial states are all within a room
64 |         # should factor in sudden change of option ? - yes
65 |         # self.options_transition_probability = np.array(
66 |         #     [[[self._options_transition_probability(i, j, k)
67 |         #        for k in self.init_states]
68 |         #       for j in range(self.n_options)]
69 |         #      for i in self.init_states])
70 |         states_per_option = [[state for state in self.rooms[opt["room"]]] for opt in self.options]
71 |         self.options_transition_probability = np.zeros((121, 8, 121))
72 |         for opt in self.options:
73 |             print opt["id"]
74 |             if opt["id"] == 4:
75 |                 print "starting option 4"
76 |             za = [
77 |                 [
78 |                     self._options_transition_probability(i, opt["id"], k)
79 |                     for k in states_per_option[opt["id"]]
80 |                 ]
81 |                 for i in states_per_option[opt["id"]]
82 |             ]
83 |             print "computed transition prob"
84 |             for br, state in enumerate(states_per_option[opt["id"]]):
85 |                 for br_2, state_k in enumerate(states_per_option[opt["id"]]):
86 |                     self.options_transition_probability[state][opt["id"]][state_k] = za[br][br_2]
87 | 
88 |             print "exiting option.."
89 |         print "Done."
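        # options_transition_probability has shape
        # (n_states, n_options, n_states), hard-coded here as 121 x 8 x 121.
        # Entry [s, o, s'] is gamma**k, where k is the number of intra-option
        # steps from s to option o's termination state s'; it is zero whenever
        # s' is not that termination state or s lies outside option o's room.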
90 | 91 | # self.options_transition_probability = np.array( 92 | # [[[self._options_transition_probability(i, o, k) 93 | # for k in states_per_option]#[term for term in self.term_states]] 94 | # for o in range(self.n_options)] 95 | # for i in states_per_option]) 96 | 97 | def __str__(self): 98 | return "Gridworld({}, {}, {})".format(self.grid_size, self.wind, 99 | self.discount) 100 | 101 | # def _reward_o(self, option): 102 | # option_states = self.rooms[option["room"]] 103 | # reward_o = [0 for _ in option_states] 104 | # for idx, state in enumerate(option_states): 105 | # k_step = 1 106 | # reward_o[idx] = self.reward(state) 107 | # sx, sy = self.int_to_point(state) 108 | # # current_state = (sx, sy) 109 | # while k_step < self.k_length and (sx, sy) != option["termination"]: 110 | # action = self.actions[self.intra_option_optimal_policy(self.point_to_int((sx, sy)), option["id"])] 111 | 112 | # if ((((sx + action[0], 113 | # sy + action[1]) == (5, 2) and option["id"] == 0) 114 | # or (sx + action[0], 115 | # sy + action[1]) == (1, 5) and option["id"] == 1) or 116 | # (0 <= sx + action[0] < 5 and#self.grid_size and 117 | # 0 <= sy + action[1] < 5)):#self.grid_size): 118 | # sx = sx + action[0] 119 | # sy = sy + action[1] 120 | 121 | # reward_o[idx] += self.reward(self.point_to_int((sx, sy))) * np.power(self.gamma, k_step) 122 | # k_step += 1 123 | # return reward_o 124 | 125 | def feature_vector(self, i, vec_size, feature_map="ident"): 126 | """ 127 | Get the feature vector associated with a state integer. 128 | 129 | i: State int. 130 | feature_map: Which feature map to use (default ident). String in {ident, 131 | coord, proxi}. 132 | -> Feature vector. 133 | """ 134 | 135 | if feature_map == "coord": 136 | f = np.zeros(self.grid_size) 137 | x, y = i % self.grid_size, i // self.grid_size 138 | f[x] += 1 139 | f[y] += 1 140 | return f 141 | if feature_map == "proxi": 142 | f = np.zeros(self.n_states) 143 | x, y = i % self.grid_size, i // self.grid_size 144 | for b in range(self.grid_size): 145 | for a in range(self.grid_size): 146 | dist = abs(x - a) + abs(y - b) 147 | f[self.point_to_int((a, b))] = dist 148 | return f 149 | # Assume identity map. 150 | f = np.zeros(vec_size) 151 | f[i] = 1 152 | return f 153 | 154 | def o_feature_matrix(self, feature_map="ident"): 155 | """ 156 | Get the feature matrix for this gridworld. 157 | 158 | feature_map: Which feature map to use (default ident). String in {ident, 159 | coord, proxi}. 160 | -> NumPy array with shape (n_states, d_states). 161 | """ 162 | features = [] 163 | for n in range(self.n_options): 164 | f = self.feature_vector(n, self.n_options, feature_map) 165 | features.append(f) 166 | return np.array(features) 167 | 168 | def feature_matrix(self, feature_map="ident"): 169 | """ 170 | Get the feature matrix for this gridworld. 171 | 172 | feature_map: Which feature map to use (default ident). String in {ident, 173 | coord, proxi}. 174 | -> NumPy array with shape (n_states, d_states). 
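
        Note that, unlike the flat gridworld, the result is indexed by option
        first: for the default "ident" map, entry o is an identity matrix of
        size len(self.rooms[self.options[o]["room"]]), one row per state in
        that option's room.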
175 | """ 176 | # [ 177 | # np.zeros(len(self.rooms[self.options[o]["room"]])) 178 | # for o in range(self.options)] 179 | features = [ 180 | [np.zeros(len(self.rooms[self.options[o]["room"]])) 181 | for __ in range(len(self.rooms[self.options[o]["room"]]))] 182 | for o in range(len(self.options))] 183 | # features = np.reshape(features, [len(self.options), self.n_states, self.n_states]) 184 | for o in range(len(self.options)): 185 | for n in range(len(features[o])): 186 | # if self.int_to_point(n) not in self.walls: # redundant 187 | # idx = [br for br, room in enumerate(self.rooms) if n in room][0] 188 | # f = self.feature_vector(n, feature_map) 189 | # features[idx][n] = f 190 | f = self.feature_vector(n, len(self.rooms[self.options[o]["room"]]), feature_map) 191 | features[o][n] = f 192 | return np.array(features) 193 | 194 | def opt_to_point(self, i): 195 | """ 196 | Convert an option int into the corresponding coordinate. 197 | 198 | i: option int. 199 | -> (x, y) int tuple. 200 | """ 201 | 202 | return self.options[i]["init_set"] 203 | 204 | def point_to_opt(self, p): 205 | """ 206 | Convert a coordinate into the corresponding state options list. 207 | 208 | p: (x, y) tuple. 209 | -> State int. 210 | """ 211 | 212 | return [x for x in self.options if x["init_set"] == p] 213 | 214 | def int_to_point(self, i): 215 | """ 216 | Convert a state int into the corresponding coordinate. 217 | 218 | i: State int. 219 | -> (x, y) int tuple. 220 | """ 221 | 222 | return (i % self.grid_size, i // self.grid_size) 223 | 224 | def point_to_int(self, p): 225 | """ 226 | Convert a coordinate into the corresponding state int. 227 | 228 | p: (x, y) tuple. 229 | -> State int. 230 | """ 231 | 232 | return p[0] + p[1] * self.grid_size 233 | 234 | def isa_wall(self, i): 235 | """ 236 | Get whether a point is a wall or not. Returns True if wall. 237 | 238 | i: (x, y) int tuple. 239 | -> bool. 240 | """ 241 | 242 | return i in self.walls 243 | 244 | def neighbouring_option_states(self, i, k): 245 | """ 246 | Get whether two options neighbour each other. Also returns true if they 247 | are the same options. 248 | 249 | i: (x, y) int tuple. 250 | k: (x, y) int tuple. 251 | -> bool. 252 | """ 253 | 254 | return len([x for x in self.options if x["termination"] == i and x["init_set"] == k]) > 0 255 | 256 | def neighbouring(self, i, k): 257 | """ 258 | Get whether two points neighbour each other. Also returns true if they 259 | are the same point. 260 | 261 | i: (x, y) int tuple. 262 | k: (x, y) int tuple. 263 | -> bool. 264 | """ 265 | 266 | return abs(i[0] - k[0]) + abs(i[1] - k[1]) <= 1 267 | 268 | def insame_room(self, i, k): 269 | """ 270 | Get whether two points are in the same room. Also returns true if they 271 | are the same point. 272 | 273 | i: int. 274 | k: int. 275 | -> [room id]. 276 | """ 277 | 278 | item_one = np.asarray([br for br, x in enumerate([i in room for room in self.rooms]) if x]) 279 | item_two = [br for br, x in enumerate([k in room for room in self.rooms]) if x] 280 | mask = np.in1d(item_one, item_two) 281 | 282 | return item_one[mask] 283 | 284 | # return [item for item in np.in1d(item_one, item_two) if item] 285 | 286 | # def _options_transition_probability(self, o, i, j, k): 287 | # """ 288 | # Get the probability of transitioning from state i to state k given 289 | # action j. 290 | 291 | # maybe start with option_state, option, option_state 292 | # if possible to get there, if the option state is the actual goal 293 | # assign 1 - wind, otherwise it should be 50% ? 
294 | 295 | # i: Option State int. 296 | # j: Action int. 297 | # k: State int. 298 | # -> p(s_k | s_i, a_j) 299 | # """ 300 | # options_i = self.point_to_opt(i) 301 | # # option_id = [br for br, x in enumerate(self.init_states) if x == i][0] 302 | # option_action = self.options[j] 303 | # # option_kd = [br for br, x in enumerate(self.init_states) if x == k][0] 304 | # options_k = self.point_to_opt(k) 305 | 306 | # if i != option_action["init_set"]: 307 | # if i == k: 308 | # return 1.0 309 | # else: 310 | # return 0.0 311 | 312 | # if i == k: 313 | # return 0.0 314 | 315 | # if i == option_action["init_set"]: 316 | # if k == option_action["termination"]: 317 | # return 1 - self.wind 318 | 319 | # else: 320 | # s = [x for x in options_i if x["termination"] 321 | # != option_action["termination"]] 322 | # for option in s: 323 | # if option["termination"] == k: 324 | # return self.wind / len(s) 325 | 326 | # return 0.0 327 | 328 | def _options_transition_probability(self, i, o, k): 329 | """ 330 | Get the probability of transitioning from state i to state k given 331 | action j. 332 | 333 | maybe start with option_state, option, option_state 334 | if possible to get there, if the option state is the actual goal 335 | assign 1 - wind, otherwise it should be 50% ? 336 | 337 | i: Option State int. 338 | j: Action int. 339 | k: State int. 340 | -> p(s_k | s_i, a_j) 341 | """ 342 | 343 | xi, yi = self.int_to_point(i) 344 | xk, yk = self.int_to_point(k) 345 | 346 | if (xk, yk) != self.options[o]["termination"]: 347 | return 0 348 | 349 | room_no = np.asarray(self.insame_room(i, self.point_to_int((xk, yk)))) 350 | 351 | if self.options[o]["room"] not in room_no: 352 | return 0.0 353 | 354 | k_step = 0 355 | # current_state = (sx, sy) 356 | while (xi, yi) != self.options[o]["termination"]: 357 | action = self.actions[self.intra_option_optimal_policy( 358 | self.point_to_int((xi, yi)), self.options[o]["id"])] 359 | 360 | if (((xi + action[0], 361 | yi + action[1]) == self.options[o]["termination"]) or ( 362 | self.options[o]["min"][0] < xi + action[0] < self.options[o]["max"][0] and 363 | self.options[o]["min"][1] < yi + action[1] < self.options[o]["max"][1]) 364 | ): 365 | xi = xi + action[0] 366 | yi = yi + action[1] 367 | 368 | k_step += 1 369 | if k_step > 8: 370 | print "wtf" 371 | 372 | return np.power(self.gamma, k_step) 373 | 374 | def _improved_transition_probability(self, o, i, j, k): 375 | """ 376 | Get the probability of transitioning from state i to state k given 377 | action j. 378 | 379 | i: State int. 380 | j: Action int. 381 | k: State int. 382 | -> p(s_k | s_i, a_j) 383 | """ 384 | 385 | xi, yi = self.int_to_point(i) 386 | xj, yj = self.actions[j] 387 | xk, yk = self.int_to_point(k) 388 | 389 | room_no = np.asarray(self.insame_room(i, k)) 390 | 391 | if len(room_no) < 1: 392 | return 0.0 393 | 394 | if self.options[o]["room"] not in room_no: 395 | return 0.0 396 | 397 | if not self.neighbouring((xi, yi), (xk, yk)): 398 | return 0.0 399 | 400 | if self.isa_wall((xi, yi)): 401 | return 0.0 402 | 403 | if self.isa_wall((xk, yk)): 404 | return 0.0 405 | 406 | # Is k the intended state to move to? 407 | if (xi + xj, yi + yj) == (xk, yk): 408 | return 1 - self.wind + self.wind / self.n_actions 409 | 410 | # If these are not the same point, then we can move there by wind. 411 | if (xi, yi) != (xk, yk): 412 | return self.wind / self.n_actions 413 | 414 | # If these are the same point, we can only move here by either moving 415 | # off the grid or being blown off the grid. 
Are we on a corner or not? 416 | if (xi, yi) in {(0, 0), (self.grid_size - 1, self.grid_size - 1), 417 | (0, self.grid_size - 1), (self.grid_size - 1, 0), 418 | (4, 0), (6, 0), (6, 5), (4, 4), (0, 4), (0, 6), (4, 6), 419 | (10, 5), (6, 7), (10, 7), (4, 10), (6, 10)}: 420 | # Corner. 421 | # Can move off the edge in two directions. 422 | # Did we intend to move off the grid? 423 | if not ((0 <= xi + xj < self.grid_size and 424 | 0 <= yi + yj < self.grid_size) and 425 | not self.isa_wall((xi + xj, yi + yj))): 426 | # We intended to move off the grid, so we have the regular 427 | # success chance of staying here plus an extra chance of blowing 428 | # onto the *other* off-grid square. 429 | return 1 - self.wind + 2 * self.wind / self.n_actions 430 | else: 431 | # We can blow off the grid in either direction only by wind. 432 | return 2 * self.wind / self.n_actions 433 | elif (xi, yi) in {self.int_to_point(27), self.int_to_point(56), 434 | self.int_to_point(74), self.int_to_point(104)}: 435 | if not ((0 <= xi + xj < self.grid_size and 436 | 0 <= yi + yj < self.grid_size) and 437 | not self.isa_wall((xi + xj, yi + yj))): 438 | 439 | if (xi, yi) in self.init_states: 440 | return 1 - self.wind/self.n_actions 441 | # We intended to move off the grid, so we have the regular 442 | # success chance of staying here plus an extra chance of blowing 443 | # onto the *other* off-grid square. 444 | return 1 - self.wind + 2 * self.wind / self.n_actions 445 | else: 446 | if (xi, yi) in self.init_states: 447 | should_go = np.asarray(self.insame_room( 448 | self.point_to_int((xi, yi)), 449 | self.point_to_int((xi + xj, yi + yj)))) 450 | if len(should_go) > 0: 451 | if should_go[0] == o: 452 | return self.wind - self.wind / self.n_actions 453 | 454 | return 1 - self.wind/self.n_actions 455 | 456 | # We can blow off the grid in either direction only by wind. 457 | return 2 * self.wind / self.n_actions 458 | else: 459 | # Not a corner. Is it an edge? 460 | if (xi not in {0, self.grid_size - 1} and 461 | yi not in {0, self.grid_size - 1} and 462 | (xi, yi) not in { 463 | self.int_to_point(15), self.int_to_point( 464 | 37), self.int_to_point(17), 465 | self.int_to_point(39), self.int_to_point( 466 | 50), self.int_to_point(62), 467 | self.int_to_point(64), self.int_to_point( 468 | 86), self.int_to_point(84), 469 | self.int_to_point(94), self.int_to_point( 470 | 92), self.int_to_point(81), 471 | self.int_to_point(69), self.int_to_point( 472 | 68), self.int_to_point(46), 473 | self.int_to_point(47) 474 | } 475 | ): 476 | # Not an edge. 477 | return 0.0 478 | 479 | # Edge. 480 | # Can only move off the edge in one direction. 481 | # Did we intend to move off the grid? 482 | if not (0 <= xi + xj < self.grid_size and 483 | 0 <= yi + yj < self.grid_size and 484 | not self.isa_wall((xi + xj, yi + yj))): 485 | # We intended to move off the grid, so we have the regular 486 | # success chance of staying here. 487 | return 1 - self.wind + self.wind / self.n_actions 488 | else: 489 | # We can blow off the grid only by wind. 490 | return self.wind / self.n_actions 491 | 492 | def _transition_probability(self, i, j, k): 493 | """ 494 | Get the probability of transitioning from state i to state k given 495 | action j. 496 | 497 | i: State int. 498 | j: Action int. 499 | k: State int. 
500 | -> p(s_k | s_i, a_j) 501 | """ 502 | 503 | xi, yi = self.int_to_point(i) 504 | xj, yj = self.actions[j] 505 | xk, yk = self.int_to_point(k) 506 | 507 | if not self.neighbouring((xi, yi), (xk, yk)): 508 | return 0.0 509 | 510 | if self.isa_wall((xi, yi)): 511 | return 0.0 512 | 513 | if self.isa_wall((xk, yk)): 514 | return 0.0 515 | 516 | # Is k the intended state to move to? 517 | if (xi + xj, yi + yj) == (xk, yk): 518 | return 1 - self.wind + self.wind / self.n_actions 519 | 520 | # If these are not the same point, then we can move there by wind. 521 | if (xi, yi) != (xk, yk): 522 | return self.wind / self.n_actions 523 | 524 | # If these are the same point, we can only move here by either moving 525 | # off the grid or being blown off the grid. Are we on a corner or not? 526 | if (xi, yi) in {(0, 0), (self.grid_size - 1, self.grid_size - 1), 527 | (0, self.grid_size - 1), (self.grid_size - 1, 0), 528 | (4, 0), (6, 0), (6, 5), (4, 4), (0, 4), (0, 6), (4, 6), 529 | (10, 5), (6, 7), (10, 7), (4, 10), (6, 10)}: 530 | # Corner. 531 | # Can move off the edge in two directions. 532 | # Did we intend to move off the grid? 533 | if not ((0 <= xi + xj < self.grid_size and 534 | 0 <= yi + yj < self.grid_size) and 535 | not self.isa_wall((xi + xj, yi + yj))): 536 | # We intended to move off the grid, so we have the regular 537 | # success chance of staying here plus an extra chance of blowing 538 | # onto the *other* off-grid square. 539 | return 1 - self.wind + 2 * self.wind / self.n_actions 540 | else: 541 | # We can blow off the grid in either direction only by wind. 542 | return 2 * self.wind / self.n_actions 543 | elif (xi, yi) in {self.int_to_point(27), self.int_to_point(56), 544 | self.int_to_point(74), self.int_to_point(104)}: 545 | if not ((0 <= xi + xj < self.grid_size and 546 | 0 <= yi + yj < self.grid_size) and 547 | not self.isa_wall((xi + xj, yi + yj))): 548 | # We intended to move off the grid, so we have the regular 549 | # success chance of staying here plus an extra chance of blowing 550 | # onto the *other* off-grid square. 551 | return 1 - self.wind + 2 * self.wind / self.n_actions 552 | else: 553 | # We can blow off the grid in either direction only by wind. 554 | return 2 * self.wind / self.n_actions 555 | else: 556 | # Not a corner. Is it an edge? 557 | if (xi not in {0, self.grid_size - 1} and 558 | yi not in {0, self.grid_size - 1} and 559 | (xi, yi) not in { 560 | self.int_to_point(15), self.int_to_point( 561 | 37), self.int_to_point(17), 562 | self.int_to_point(39), self.int_to_point( 563 | 50), self.int_to_point(62), 564 | self.int_to_point(64), self.int_to_point( 565 | 86), self.int_to_point(84), 566 | self.int_to_point(94), self.int_to_point( 567 | 92), self.int_to_point(81), 568 | self.int_to_point(69), self.int_to_point( 569 | 68), self.int_to_point(46), 570 | self.int_to_point(47) 571 | } 572 | ): 573 | # Not an edge. 574 | return 0.0 575 | 576 | # Edge. 577 | # Can only move off the edge in one direction. 578 | # Did we intend to move off the grid? 579 | if not (0 <= xi + xj < self.grid_size and 580 | 0 <= yi + yj < self.grid_size and 581 | not self.isa_wall((xi + xj, yi + yj))): 582 | # We intended to move off the grid, so we have the regular 583 | # success chance of staying here. 584 | return 1 - self.wind + self.wind / self.n_actions 585 | else: 586 | # We can blow off the grid only by wind. 587 | return self.wind / self.n_actions 588 | 589 | # def reward(self, state_int, option): 590 | # """ 591 | # Reward for being in state state_int. 
592 | 
593 |     # state_int: State integer. int.
594 |     # -> Reward.
595 |     # """
596 |     # if option == 0 and state_int == 27:#self.n_states - 1: # self.point_to_int((8, 6)):
597 |     #     return 1
598 |     # elif option == 1 and state_int == 56:
599 |     #     return 1
600 |     # return 0
601 | 
602 |     def reward(self, state_int):
603 |         """
604 |         Reward for being in state state_int.
605 | 
606 |         state_int: State integer. int.
607 |         -> Reward.
608 |         """
609 |         if state_int == 74:
610 |             return 1
611 | 
612 |         return 0
613 | 
614 |     def opt_reward(self, opt_int):
615 |         """
616 |         Reward for taking option opt_int.
617 | 
618 |         opt_int: Option integer. int.
619 |         -> Reward.
620 |         """
621 |         if opt_int == 2:
622 |             return 1
623 | 
624 |         return 0
625 | 
626 |     def average_reward(self, n_trajectories, trajectory_length, policy):
627 |         """
628 |         Calculate the average total reward obtained by following a given policy
629 |         over n_trajectories trajectories.
630 | 
631 |         policy: Map from state integers to action integers.
632 |         n_trajectories: Number of trajectories. int.
633 |         trajectory_length: Length of an episode. int.
634 |         -> Average reward, standard deviation.
635 |         """
636 | 
637 |         trajectories = self.generate_trajectories(n_trajectories,
638 |                                                   trajectory_length, policy)
639 |         rewards = [[r for _, _, r in trajectory]
640 |                    for trajectory in trajectories]
641 |         rewards = np.array(rewards)
642 | 
643 |         # Add up all the rewards to find the total reward.
644 |         total_reward = rewards.sum(axis=1)
645 | 
646 |         # Return the average reward and standard deviation.
647 |         return total_reward.mean(), total_reward.std()
648 | 
649 |     def option_option_optimal_policy(self, option):
650 |         if option == 0:
651 |             return 2
652 |         if option == 1:
653 |             return 7
654 |         if option == 2:
655 |             return 2
656 |         if option == 3:
657 |             return 2
658 |         if option == 4:
659 |             return 5
660 |         if option == 5:
661 |             return 5
662 |         if option == 6:
663 |             return 7
664 |         if option == 7:
665 |             return 5
666 | 
667 |     def intra_option_optimal_policy(self, state_int, option):
668 |         """
669 |         The optimal policy for this gridworld under the given option.
670 | 
671 |         state_int: What state we are in. int.
672 |         Actions: {"right": 0, "down": 1, "left": 2, "up": 3}
673 |         -> Action int.
674 | """ 675 | sx, sy = self.int_to_point(state_int) 676 | 677 | if option == 0: 678 | if (sx, sy) in [(4, 0), (4, 1), (3, 1)]: 679 | return 1 680 | if (sx, sy) in [(3, 4), (4, 4), (4, 3), (1, 5)]: 681 | return 3 682 | if (sx, sy) in [(0, 0), (1, 0), (2, 0), (3, 0), (0, 1), (1, 1), (2, 1), 683 | (0, 2), (1, 2), (2, 2), (3, 2), (4, 2), (5, 2), 684 | (0, 3), (1, 3), (2, 3), (3, 3), (0, 4), (1, 4), (2, 4)]: 685 | return 0 686 | 687 | elif option == 1: 688 | if (sx, sy) in [(0, 3), (0, 4)]: 689 | return 0 690 | if (sx, sy) in [(5, 2), (4, 1), (3, 3), (4, 3), (2, 4), (3, 4), (4, 4)]: 691 | return 2 692 | if (sx, sy) in [(0, 0), (1, 0), (2, 0), (3, 0), (4, 0), 693 | (0, 1), (1, 1), (2, 1), (3, 1), 694 | (0, 2), (1, 2), (2, 2), (3, 2), (4, 2), 695 | (1, 3), (2, 3), (1, 4), (1, 5)]: 696 | return 1 697 | 698 | elif option == 2: 699 | if (sx, sy) in [(5, 2), (6, 2), (7, 2), (7, 3), (7, 4), (7, 5), (6, 5)]: 700 | return 0 701 | if (sx, sy) in [(9, 2), (10, 2), (10, 3), (10, 4), (10, 5), (9, 4), (9, 5)]: 702 | return 2 703 | if (sx, sy) in [(6, 0), (7, 0), (8, 0), (9, 0), (10, 0), 704 | (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (8, 2), 705 | (8, 3), (8, 4), (8, 5), (8, 6), (6, 3), (6, 4), (9, 3)]: 706 | return 1 707 | 708 | elif option == 3: 709 | if (sx, sy) in [(6, 5), (6, 4), (6, 3), 710 | (7, 5), (7, 4), (7, 3), 711 | (8, 5), (8, 4), (8, 3), (8, 6), 712 | (9, 5), (9, 4), (9, 3), 713 | (10, 5), (10, 4), (10, 3)]: 714 | return 3 715 | if (sx, sy) in [(5, 2), (6, 2), (7, 2), (8, 2), (9, 2), (10, 2)]: 716 | return 2 717 | if (sx, sy) in [(6, 0), (7, 0), (8, 0), (9, 0), (10, 0), 718 | (6, 1), (7, 1), (8, 1), (9, 1), (10, 1)]: 719 | return 1 720 | 721 | elif option == 4: 722 | if (sx, sy) in [(6, 7), (7, 7), (8, 7), (9, 7), (10, 7), 723 | (6, 8), (7, 8), (8, 8), (9, 8), (10, 8), (8, 6)]: 724 | return 1 725 | if (sx, sy) in [(5, 9), (6, 9), (7, 9), (8, 9), (9, 9), (10, 9)]: 726 | return 2 727 | if (sx, sy) in [(6, 10), (7, 10), (8, 10), (9, 10), (10, 10)]: 728 | return 3 729 | 730 | elif option == 5: 731 | if (sx, sy) in [(6, 7), (7, 7), 732 | (6, 8), (7, 8), (6, 9), (5, 9), (7, 9), (6, 10), (7, 10)]: 733 | return 0 734 | if (sx, sy) in [(9, 9), (10, 9), (9, 10), (10, 10), (9, 8), (10, 8), (9, 7), (10, 7)]: 735 | return 2 736 | if (sx, sy) in [(8, 10), (8, 9), (8, 8), (8, 7), (8, 6)]: 737 | return 3 738 | 739 | elif option == 6: 740 | if (sx, sy) in [(0, 6), (0, 7), (0, 8), (0, 9), (0, 10)]: 741 | return 0 742 | if (sx, sy) in [(2, 6), (3, 6), (4, 6), 743 | (2, 7), (3, 7), (4, 7), 744 | (2, 8), (3, 8), (4, 8), 745 | (2, 9), (3, 9), (4, 9), 746 | (2, 10), (3, 10), (4, 10), (5, 9)]: 747 | return 2 748 | if (sx, sy) in [(1, 10), (1, 9), (1, 8), (1, 7), (1, 6), (1, 5)]: 749 | return 3 750 | 751 | elif option == 7: 752 | if (sx, sy) in [(0, 9), (1, 9), (2, 9), (3, 9), (4, 9), (5, 9)]: 753 | return 0 754 | if (sx, sy) in [(1, 5), (1, 6), (1, 7), (1, 8), (0, 6), (0, 7), (0, 8), 755 | (2, 6), (2, 7), (2, 8), (3, 6), (3, 7), (3, 8), 756 | (4, 6), (4, 7), (4, 8)]: 757 | return 1 758 | if (sx, sy) in [(0, 10), (1, 10), (2, 10), (3, 10), (4, 10)]: 759 | return 3 760 | 761 | print state_int 762 | print option 763 | print "!!!!!" 764 | raise ValueError("Unexpected state.") 765 | 766 | def optimal_policy(self, state_int): 767 | """ 768 | The optimal policy for this gridworld. 769 | 770 | state_int: What state we are in. int. 771 | -> Action int. 
772 |         """
773 | 
774 |         sx, sy = self.int_to_point(state_int)
775 | 
776 |         if sx < self.grid_size and sy < self.grid_size:
777 |             return rn.randint(0, 2)
778 |         if sx < self.grid_size - 1:
779 |             return 0
780 |         if sy < self.grid_size - 1:
781 |             return 1
782 |         raise ValueError("Unexpected state.")
783 | 
784 |     def optimal_policy_deterministic(self, state_int):
785 |         """
786 |         Deterministic version of the optimal policy for this gridworld.
787 | 
788 |         state_int: What state we are in. int.
789 |         -> Action int.
790 |         """
791 | 
792 |         sx, sy = self.int_to_point(state_int)
793 |         if sx < sy:
794 |             return 0
795 |         return 1
796 | 
797 |     def generate_option_option_trajectories(self, trajectories, n_trajectories, option_policy, action_policy, random_start=False):
798 |         """
799 |         Generate n_trajectories option-level trajectories by chaining
800 |         intra-option trajectories, following the given policies.
801 |         trajectories: Intra-option demonstration trajectories, indexed by option id.
802 |         n_trajectories: Number of trajectories. int.
803 |         option_policy: Map from option integers to option integers.
804 |         action_policy: Map from (state int, option int) to action integers.
805 |         random_start: Whether to start randomly (default False). bool.
806 |         -> [[(states, option int, accumulated reward float)]]
807 |         """
808 | 
809 |         generated_trajectories = []
810 |         for _ in range(n_trajectories):
811 |             if random_start:
812 |                 sx, sy = rn.randint(self.grid_size), rn.randint(self.grid_size)
813 |             else:
814 |                 sx, sy = 0, 0
815 | 
816 |             trajectory = []
817 |             option = self.options[0]
818 |             path_idx = np.random.choice(len(trajectories[option["id"]]), 1)
819 |             local_path = trajectories[option["id"]][path_idx][0]
820 |             acc_reward = np.sum(local_path[:, 2])
821 |             states = local_path[:, 0]
822 |             state_int = local_path[-1, 0]
823 |             while self.int_to_point(state_int) != option["termination"]:
824 |                 new_point = self.generate_intra_option_trajectories(
825 |                     1, 2, action_policy, option,
826 |                     predefined_start=self.int_to_point(state_int))[0][1]
827 |                 states = np.append(states, new_point[0])
828 |                 state_int = states[-1]
829 | 
830 |             sx, sy = self.int_to_point(states[-1])
831 |             trajectory.append((states, option["id"], acc_reward))
832 |             for _ in range(len(trajectories[0][0])/2 - 1):
833 |                 states = []
834 |                 if rn.random() < self.wind:
835 |                     # save this for generalised variant
836 |                     # _available_options = [opt['id'] for opt in self.options if self.point_to_int((sx, sy)) in self.rooms[opt["room"]]]
837 |                     _available_options = [opt['id'] for opt in self.options if (sx, sy) == opt["init_set"]]
838 |                     option = self.options[np.random.choice(_available_options, 1)[0]]
839 |                 else:
840 |                     # Follow the given policy.
841 |                     option = self.options[option_policy(option["id"])]
842 | 
843 |                 if (sx, sy) != self.options[2]["termination"]:
844 |                     path_idx = np.random.choice(len(trajectories[option["id"]]), 1)
845 |                     local_path = trajectories[option["id"]][path_idx][0]
846 |                     acc_reward = np.sum(local_path[:, 2])
847 |                     states = local_path[:, 0]
848 |                     state_int = local_path[-1, 0]
849 |                     while self.int_to_point(state_int) != option["termination"]:
850 |                         new_point = self.generate_intra_option_trajectories(
851 |                             1, 2, action_policy, option,
852 |                             predefined_start=self.int_to_point(state_int))[0][1]
853 |                         states = np.append(states, new_point[0])
854 |                         acc_reward += new_point[2]
855 |                         state_int = states[-1]
856 | 
857 |                     sx, sy = self.int_to_point(states[-1])
858 |                     trajectory.append((states, option["id"], acc_reward))
859 | 
860 |             generated_trajectories.append(trajectory)
861 | 
862 |         return np.array(generated_trajectories)
863 | 
864 | 
865 |     def generate_intra_option_trajectories(self, n_trajectories, trajectory_length, policy,
866 |                                            option, predefined_start=None, random_start=False):
867 |         """
868 |         Generate n_trajectories intra-option trajectories of length
869 |         trajectory_length, following the given intra-option policy.
870 |         n_trajectories: Number of trajectories. int.
871 |         trajectory_length: Length of an episode. int.
872 |         policy: Map from (state int, option int) to action integers.
873 |         option: Option dict with "id", "termination", "min" and "max" keys.
874 |         predefined_start: Optional (x, y) start point (default None).
875 |         random_start: Whether to start randomly (default False). bool.
876 |         -> [[(state int, action int, reward float)]]
877 |         """
878 |         trajectories = []
879 |         for _ in range(n_trajectories):
880 |             if random_start:
881 |                 sx, sy = rn.randint(self.grid_size), rn.randint(self.grid_size)
882 |             elif predefined_start is not None:
883 |                 sx, sy = predefined_start
884 |             else:
885 |                 if option["id"] == 0 or option["id"] == 7:
886 |                     sx, sy = self.int_to_point(56)
887 |                 elif option["id"] == 2 or option["id"] == 1:
888 |                     sx, sy = self.int_to_point(27)
889 |                 elif option["id"] == 3 or option["id"] == 4:
890 |                     sx, sy = self.int_to_point(74)
891 |                 else:
892 |                     sx, sy = self.int_to_point(104)
893 | 
894 |             trajectory = []
895 |             for _ in range(trajectory_length):
896 |                 if rn.random() < self.wind:
897 |                     action = self.actions[rn.randint(0, 4)]
898 |                 else:
899 |                     # Follow the given policy.
900 |                     action = self.actions[policy(self.point_to_int((sx, sy)), option["id"])]
901 | 
902 |                 if (((sx + action[0],
903 |                       sy + action[1]) == option["termination"]) or (
904 |                         option["min"][0] < sx + action[0] < option["max"][0] and
905 |                         option["min"][1] < sy + action[1] < option["max"][1])
906 |                         ):
907 |                     next_sx = sx + action[0]
908 |                     next_sy = sy + action[1]
909 |                 else:
910 |                     next_sx = sx
911 |                     next_sy = sy
912 | 
913 |                 state_int = self.point_to_int((sx, sy))
914 |                 action_int = self.actions.index(action)
915 |                 next_state_int = self.point_to_int((next_sx, next_sy))
916 |                 reward = self.reward(next_state_int) # do not hardcode option
917 |                 trajectory.append((state_int, action_int, reward))
918 | 
919 |                 sx = next_sx
920 |                 sy = next_sy
921 | 
922 |             trajectories.append(trajectory)
923 | 
924 |         return np.array(trajectories)
925 | 
926 |     def generate_trajectories(self, n_trajectories, trajectory_length, policy,
927 |                               random_start=False):
928 |         """
929 |         Generate n_trajectories trajectories with length trajectory_length,
930 |         following the given policy.
931 | 
932 |         n_trajectories: Number of trajectories. int.
933 |         trajectory_length: Length of an episode. int.
934 |         policy: Map from state integers to action integers.
935 |         random_start: Whether to start randomly (default False). bool.
936 | -> [[(state int, action int, reward float)]] 937 | """ 938 | 939 | trajectories = [] 940 | for _ in range(n_trajectories): 941 | if random_start: 942 | sx, sy = rn.randint(self.grid_size), rn.randint(self.grid_size) 943 | else: 944 | sx, sy = 0, 0 945 | 946 | trajectory = [] 947 | for _ in range(trajectory_length): 948 | if rn.random() < self.wind: 949 | action = self.actions[rn.randint(0, 4)] 950 | else: 951 | # Follow the given policy. 952 | action = self.actions[policy(self.point_to_int((sx, sy)))] 953 | 954 | if (0 <= sx + action[0] < self.grid_size and 955 | 0 <= sy + action[1] < self.grid_size): 956 | next_sx = sx + action[0] 957 | next_sy = sy + action[1] 958 | else: 959 | next_sx = sx 960 | next_sy = sy 961 | 962 | state_int = self.point_to_int((sx, sy)) 963 | action_int = self.actions.index(action) 964 | next_state_int = self.point_to_int((next_sx, next_sy)) 965 | reward = self.reward(next_state_int)# do not hardcode option 966 | trajectory.append((state_int, action_int, reward)) 967 | 968 | sx = next_sx 969 | sy = next_sy 970 | 971 | trajectories.append(trajectory) 972 | 973 | return np.array(trajectories) 974 | --------------------------------------------------------------------------------
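
Note on the transition model above: _transition_probability uses the usual windy-gridworld convention — with probability 1 - wind the chosen action is executed, and with probability wind one of the n_actions actions is taken uniformly at random; any move that would leave the grid or enter a wall keeps the agent in place. The corner/edge branches and the hand-listed doorway cells exist to put that blocked probability mass back on the "stay" transition, so that each row of the transition matrix remains a probability distribution (the property the plain-gridworld tests fuzz-check). The snippet below is a minimal standalone sketch of that bookkeeping for a wall-free grid; the helper name wind_transition_row and its state encoding are illustrative assumptions, not code from this repository.

import numpy as np

# Standalone sketch (not repository code): one row of a wind-perturbed
# transition matrix for a wall-free grid, mirroring the stay-in-place
# bookkeeping used by _transition_probability above.
ACTIONS = [(1, 0), (0, 1), (-1, 0), (0, -1)]  # right, down, left, up

def wind_transition_row(state, action, grid_size, wind):
    """Return p(s' | s, a) over all grid_size**2 states."""
    n_actions = len(ACTIONS)
    sx, sy = state % grid_size, state // grid_size
    row = np.zeros(grid_size ** 2)

    def land(x, y):
        # Moves that would leave the grid collapse back onto the current cell.
        if 0 <= x < grid_size and 0 <= y < grid_size:
            return y * grid_size + x
        return sy * grid_size + sx

    ax, ay = ACTIONS[action]
    # The intended move succeeds unless the wind overrides it...
    row[land(sx + ax, sy + ay)] += 1 - wind
    # ...in which case a uniformly random action is taken instead, so the
    # intended neighbour ends up with 1 - wind + wind / n_actions in total.
    for rx, ry in ACTIONS:
        row[land(sx + rx, sy + ry)] += wind / n_actions
    return row

if __name__ == "__main__":
    rows = np.array([wind_transition_row(s, a, 5, 0.3)
                     for s in range(25) for a in range(4)])
    assert np.allclose(rows.sum(axis=1), 1.0)  # every row is a distribution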