├── irl ├── __init__.py ├── mdp │ ├── __init__.py │ ├── __init__.pyc │ ├── gridworld.pyc │ ├── gridworld_test.py │ ├── objectworld.py │ └── gridworld.py ├── maxent.pyc ├── __init__.pyc ├── value_iteration.pyc ├── value_iteration.py ├── linear_irl.py ├── maxent.py └── deep_maxent.py ├── hierarchicalrl ├── sdp_maxent.pyc ├── options_grid_test.pyc ├── options_grid_world.pyc ├── sdp_value_iteration.pyc ├── options_grid_test.py ├── options_maxent.py ├── optionsUsing-nopid.py ├── sdp_value_iteration.py ├── sdp_maxent.py └── options_grid_world.py ├── LICENSE ├── examples ├── lp_gridworld.py ├── maxent_gridworld.py ├── lp_large_gridworld.py ├── maxent_objectworld.py ├── deep_maxent_objectworld.py └── experiments.py ├── README.md └── options-using-q ├── basicOptions.py ├── qLearning.py ├── optionsUsing.py ├── optionsUsing-nopid.py ├── basicOption-tworooms.py └── options-temp.py /irl/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /irl/mdp/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /irl/maxent.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tdavchev/Hierarchical-Inverse-Reinforcement-Learning/HEAD/irl/maxent.pyc -------------------------------------------------------------------------------- /irl/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tdavchev/Hierarchical-Inverse-Reinforcement-Learning/HEAD/irl/__init__.pyc -------------------------------------------------------------------------------- /irl/mdp/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tdavchev/Hierarchical-Inverse-Reinforcement-Learning/HEAD/irl/mdp/__init__.pyc -------------------------------------------------------------------------------- /irl/mdp/gridworld.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tdavchev/Hierarchical-Inverse-Reinforcement-Learning/HEAD/irl/mdp/gridworld.pyc -------------------------------------------------------------------------------- /irl/value_iteration.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tdavchev/Hierarchical-Inverse-Reinforcement-Learning/HEAD/irl/value_iteration.pyc -------------------------------------------------------------------------------- /hierarchicalrl/sdp_maxent.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tdavchev/Hierarchical-Inverse-Reinforcement-Learning/HEAD/hierarchicalrl/sdp_maxent.pyc -------------------------------------------------------------------------------- /hierarchicalrl/options_grid_test.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tdavchev/Hierarchical-Inverse-Reinforcement-Learning/HEAD/hierarchicalrl/options_grid_test.pyc -------------------------------------------------------------------------------- /hierarchicalrl/options_grid_world.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tdavchev/Hierarchical-Inverse-Reinforcement-Learning/HEAD/hierarchicalrl/options_grid_world.pyc -------------------------------------------------------------------------------- /hierarchicalrl/sdp_value_iteration.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tdavchev/Hierarchical-Inverse-Reinforcement-Learning/HEAD/hierarchicalrl/sdp_value_iteration.pyc -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Matthew Alger 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /irl/mdp/gridworld_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unit tests for the gridworld MDP. 3 | 4 | Matthew Alger, 2016 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | import unittest 9 | 10 | import numpy as np 11 | import numpy.random as rn 12 | 13 | import gridworld 14 | 15 | 16 | def make_random_gridworld(): 17 | grid_size = rn.randint(2, 15) 18 | wind = rn.uniform(0.0, 1.0) 19 | discount = rn.uniform(0.0, 1.0) 20 | return gridworld.Gridworld(grid_size, wind, discount) 21 | 22 | 23 | class TestTransitionProbability(unittest.TestCase): 24 | """Tests for Gridworld.transition_probability.""" 25 | 26 | def test_sums_to_one(self): 27 | """Tests that the sum of transition probabilities is approximately 1.""" 28 | # This is a simple fuzz-test. 29 | for _ in range(40): 30 | gw = make_random_gridworld() 31 | self.assertTrue( 32 | np.isclose(gw.transition_probability.sum(axis=2), 1).all(), 33 | 'Probabilities don\'t sum to 1: {}'.format(gw)) 34 | 35 | def test_manual_sums_to_one(self): 36 | """Tests issue #1 on GitHub.""" 37 | gw = gridworld.Gridworld(5, 0.3, 0.2) 38 | self.assertTrue( 39 | np.isclose(gw.transition_probability.sum(axis=2), 1).all()) 40 | 41 | if __name__ == '__main__': 42 | unittest.main() -------------------------------------------------------------------------------- /examples/lp_gridworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run linear programming inverse reinforcement learning on the gridworld MDP. 
3 |
4 | Matthew Alger, 2015
5 | matthew.alger@anu.edu.au
6 | """
7 |
8 | import numpy as np
9 | import matplotlib.pyplot as plt
10 |
11 | import irl.linear_irl as linear_irl
12 | import irl.mdp.gridworld as gridworld
13 |
14 | def main(grid_size, discount):
15 |     """
16 |     Run linear programming inverse reinforcement learning on the gridworld MDP.
17 |
18 |     Plots the reward function.
19 |
20 |     grid_size: Grid size. int.
21 |     discount: MDP discount factor. float.
22 |     """
23 |
24 |     wind = 0.3
25 |     trajectory_length = 3*grid_size
26 |
27 |     gw = gridworld.Gridworld(grid_size, wind, discount)
28 |
29 |     ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])
30 |     policy = [gw.optimal_policy_deterministic(s) for s in range(gw.n_states)]
31 |     r = linear_irl.irl(gw.n_states, gw.n_actions, gw.transition_probability,
32 |                        policy, gw.discount, 1, 5)
33 |
34 |     plt.subplot(1, 2, 1)
35 |     plt.pcolor(ground_r.reshape((grid_size, grid_size)))
36 |     plt.colorbar()
37 |     plt.title("Groundtruth reward")
38 |     plt.subplot(1, 2, 2)
39 |     plt.pcolor(r.reshape((grid_size, grid_size)))
40 |     plt.colorbar()
41 |     plt.title("Recovered reward")
42 |     plt.show()
43 |
44 | if __name__ == '__main__':
45 |     main(5, 0.2)
46 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Hierarchical Inverse Reinforcement Learning
2 |
3 | Extends [M. Alger's](https://doi.org/10.5281/zenodo.555999) implementation of selected inverse reinforcement learning (IRL) algorithms. A summary report of my work is available [here](https://www.overleaf.com/read/mkkfqgpnbvnr). His final report is available [here](http://matthewja.com/pdfs/irl.pdf) and describes the implemented algorithms.
4 |
5 | If you use this code in your work, you can cite it as follows:
6 | ```bibtex
7 | @misc{davchev17,
8 |   author = {Todor Davchev},
9 |   title = {Hierarchical Inverse Reinforcement Learning},
10 |   year = 2017
11 | }
12 | ```
13 | If you are only interested in the IRL aspect of this project, you can find it at [Alger's repo](https://github.com/MatthewJA/Inverse-Reinforcement-Learning).
14 | ## Algorithms implemented
15 |
16 | - Linear programming IRL. From Ng & Russell, 2000. Small state space and large state space linear programming IRL.
17 | - Maximum entropy IRL. From Ziebart et al., 2008.
18 | - Deep maximum entropy IRL. From Wulfmeier et al., 2015; original derivation.
19 | - Hierarchical MaxEnt IRL.
20 |
21 | Additionally, the following MDP and semi-MDP domains are implemented:
22 | - Gridworld (Sutton, 1998)
23 | - Extended Gridworld with options (Sutton, 1998)
24 | - Objectworld (Levine et al., 2011)
25 |
26 | ## Requirements
27 | - NumPy
28 | - SciPy
29 | - CVXOPT
30 | - Theano
31 | - Matplotlib (for examples)
32 |
--------------------------------------------------------------------------------
/examples/maxent_gridworld.py:
--------------------------------------------------------------------------------
1 | """
2 | Run maximum entropy inverse reinforcement learning on the gridworld MDP.
3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | 11 | import sys 12 | sys.path.append("/Users/todordavchev/Documents/temp/") 13 | 14 | import irl.maxent as maxent 15 | import irl.mdp.gridworld as gridworld 16 | 17 | def main(grid_size, discount, n_trajectories, epochs, learning_rate): 18 | """ 19 | Run maximum entropy inverse reinforcement learning on the gridworld MDP. 20 | 21 | Plots the reward function. 22 | 23 | grid_size: Grid size. int. 24 | discount: MDP discount factor. float. 25 | n_trajectories: Number of sampled trajectories. int. 26 | epochs: Gradient descent iterations. int. 27 | learning_rate: Gradient descent learning rate. float. 28 | """ 29 | 30 | wind = 0.3 31 | trajectory_length = 3*grid_size 32 | 33 | gw = gridworld.Gridworld(grid_size, wind, discount) 34 | trajectories = gw.generate_trajectories(n_trajectories, 35 | trajectory_length, 36 | gw.optimal_policy) 37 | feature_matrix = gw.feature_matrix() 38 | ground_r = np.array([gw.reward(s) for s in range(gw.n_states)]) 39 | r = maxent.irl(feature_matrix, gw.n_actions, discount, 40 | gw.transition_probability, trajectories, epochs, learning_rate) 41 | 42 | plt.subplot(1, 2, 1) 43 | plt.pcolor(ground_r.reshape((grid_size, grid_size))) 44 | plt.colorbar() 45 | plt.title("Groundtruth reward") 46 | plt.subplot(1, 2, 2) 47 | plt.pcolor(r.reshape((grid_size, grid_size))) 48 | plt.colorbar() 49 | plt.title("Recovered reward") 50 | plt.show() 51 | 52 | if __name__ == '__main__': 53 | main(5, 0.01, 20, 200, 0.01) 54 | -------------------------------------------------------------------------------- /hierarchicalrl/options_grid_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unit tests for the gridworld MDP. 3 | 4 | Todor Davchev, 2017 5 | t.b.davchev@ed.ac.uk 6 | """ 7 | 8 | import unittest 9 | 10 | import numpy as np 11 | import numpy.random as rn 12 | 13 | import options_grid_world as gridworld 14 | 15 | 16 | def make_random_gridworld(): 17 | grid_size = rn.randint(2, 15) 18 | wind = rn.uniform(0.0, 1.0) 19 | discount = rn.uniform(0.0, 1.0) 20 | return gridworld.Gridworld(grid_size, wind, discount) 21 | 22 | 23 | class TestTransitionProbability(unittest.TestCase): 24 | """Tests for Gridworld.transition_probability.""" 25 | 26 | # def test_sums_to_one(self): 27 | # """Tests that the sum of transition probabilities is approximately 1.""" 28 | # # This is a simple fuzz-test. 
29 | # for _ in range(40): 30 | # gw = make_random_gridworld() 31 | # self.assertTrue( 32 | # np.isclose(gw.transition_probability.sum(axis=2), 1).all(), 33 | # 'Probabilities don\'t sum to 1: {}'.format(gw)) 34 | 35 | def test_manual_sums_to_one(self): 36 | """Tests issue #1 on GitHub.""" 37 | walls = [ 38 | (5, 0), (5, 1), (5, 3), (5, 4), (5, 5), (5, 6), (5, 7), (5, 8), (5, 10), 39 | (0, 5), (2, 5), (3, 5), (4, 5), 40 | (6, 6), (7, 6), (9, 6), (10, 6) 41 | ] 42 | gw = gridworld.Large_Gridworld(11, walls, 0.3, 0.2) 43 | self.assertTrue( 44 | np.isclose(gw.options_transition_probability.sum(axis=2), 1).all()) 45 | 46 | # take out all walls since their probabilities == 0 47 | bb = gw.improved_transition_probability.sum(axis=3) 48 | aa = gw.transition_probability.sum(axis=2) 49 | self.assertTrue( 50 | np.isclose([x for i, x in enumerate(aa) if x.all() != 0.], 1).all()) 51 | 52 | if __name__ == '__main__': 53 | unittest.main() -------------------------------------------------------------------------------- /examples/lp_large_gridworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run large state space linear programming inverse reinforcement learning on the 3 | gridworld MDP. 4 | 5 | Matthew Alger, 2015 6 | matthew.alger@anu.edu.au 7 | """ 8 | 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | 12 | import irl.linear_irl as linear_irl 13 | import irl.mdp.gridworld as gridworld 14 | from irl.value_iteration import value 15 | 16 | def main(grid_size, discount): 17 | """ 18 | Run large state space linear programming inverse reinforcement learning on 19 | the gridworld MDP. 20 | 21 | Plots the reward function. 22 | 23 | grid_size: Grid size. int. 24 | discount: MDP discount factor. float. 25 | """ 26 | 27 | wind = 0.3 28 | trajectory_length = 3*grid_size 29 | 30 | gw = gridworld.Gridworld(grid_size, wind, discount) 31 | 32 | ground_r = np.array([gw.reward(s) for s in range(gw.n_states)]) 33 | policy = [gw.optimal_policy_deterministic(s) for s in range(gw.n_states)] 34 | 35 | # Need a value function for each basis function. 36 | feature_matrix = gw.feature_matrix() 37 | values = [] 38 | for dim in range(feature_matrix.shape[1]): 39 | reward = feature_matrix[:, dim] 40 | values.append(value(policy, gw.n_states, gw.transition_probability, 41 | reward, gw.discount)) 42 | values = np.array(values) 43 | 44 | r = linear_irl.large_irl(values, gw.transition_probability, 45 | feature_matrix, gw.n_states, gw.n_actions, policy) 46 | 47 | plt.subplot(1, 2, 1) 48 | plt.pcolor(ground_r.reshape((grid_size, grid_size))) 49 | plt.colorbar() 50 | plt.title("Groundtruth reward") 51 | plt.subplot(1, 2, 2) 52 | plt.pcolor(r.reshape((grid_size, grid_size))) 53 | plt.colorbar() 54 | plt.title("Recovered reward") 55 | plt.show() 56 | 57 | if __name__ == '__main__': 58 | main(10, 0.9) 59 | -------------------------------------------------------------------------------- /examples/maxent_objectworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run maximum entropy inverse reinforcement learning on the objectworld MDP. 
3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | 11 | import sys 12 | sys.path.append("/home/todor/Documents/workspace/smdp") 13 | 14 | import irl.maxent as maxent 15 | import irl.mdp.objectworld as objectworld 16 | from irl.value_iteration import find_policy 17 | 18 | def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs, 19 | learning_rate): 20 | """ 21 | Run maximum entropy inverse reinforcement learning on the objectworld MDP. 22 | 23 | Plots the reward function. 24 | 25 | grid_size: Grid size. int. 26 | discount: MDP discount factor. float. 27 | n_objects: Number of objects. int. 28 | n_colours: Number of colours. int. 29 | n_trajectories: Number of sampled trajectories. int. 30 | epochs: Gradient descent iterations. int. 31 | learning_rate: Gradient descent learning rate. float. 32 | """ 33 | 34 | wind = 0.3 35 | trajectory_length = 8 36 | 37 | ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind, 38 | discount) 39 | ground_r = np.array([ow.reward(s) for s in range(ow.n_states)]) 40 | policy = find_policy(ow.n_states, ow.n_actions, ow.transition_probability, 41 | ground_r, ow.discount, stochastic=False) 42 | trajectories = ow.generate_trajectories(n_trajectories, 43 | trajectory_length, 44 | lambda s: policy[s]) 45 | feature_matrix = ow.feature_matrix(discrete=False) 46 | r = maxent.irl(feature_matrix, ow.n_actions, discount, 47 | ow.transition_probability, trajectories, epochs, learning_rate) 48 | 49 | plt.subplot(1, 2, 1) 50 | plt.pcolor(ground_r.reshape((grid_size, grid_size))) 51 | plt.colorbar() 52 | plt.title("Groundtruth reward") 53 | plt.subplot(1, 2, 2) 54 | plt.pcolor(r.reshape((grid_size, grid_size))) 55 | plt.colorbar() 56 | plt.title("Recovered reward") 57 | plt.show() 58 | 59 | if __name__ == '__main__': 60 | main(10, 0.9, 15, 2, 20, 50, 0.01) 61 | -------------------------------------------------------------------------------- /examples/deep_maxent_objectworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run maximum entropy inverse reinforcement learning on the objectworld MDP. 3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | 11 | import sys 12 | sys.path.append("/home/todor/Documents/workspace/smdp") 13 | 14 | import irl.deep_maxent as deep_maxent 15 | import irl.mdp.objectworld as objectworld 16 | from irl.value_iteration import find_policy 17 | 18 | def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs, 19 | learning_rate, structure): 20 | """ 21 | Run deep maximum entropy inverse reinforcement learning on the objectworld 22 | MDP. 23 | 24 | Plots the reward function. 25 | 26 | grid_size: Grid size. int. 27 | discount: MDP discount factor. float. 28 | n_objects: Number of objects. int. 29 | n_colours: Number of colours. int. 30 | n_trajectories: Number of sampled trajectories. int. 31 | epochs: Gradient descent iterations. int. 32 | learning_rate: Gradient descent learning rate. float. 33 | structure: Neural network structure. Tuple of hidden layer dimensions, e.g., 34 | () is no neural network (linear maximum entropy) and (3, 4) is two 35 | hidden layers with dimensions 3 and 4. 
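    For example, the call in the __main__ block at the bottom of this file,
        main(10, 0.9, 15, 2, 20, 50, 0.01, (3, 3)), uses two hidden layers of width 3.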
36 | """ 37 | 38 | wind = 0.3 39 | trajectory_length = 8 40 | l1 = l2 = 0 41 | 42 | ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind, 43 | discount) 44 | ground_r = np.array([ow.reward(s) for s in range(ow.n_states)]) 45 | policy = find_policy(ow.n_states, ow.n_actions, ow.transition_probability, 46 | ground_r, ow.discount, stochastic=False) 47 | trajectories = ow.generate_trajectories(n_trajectories, 48 | trajectory_length, 49 | lambda s: policy[s]) 50 | feature_matrix = ow.feature_matrix(discrete=False) 51 | r = deep_maxent.irl((feature_matrix.shape[1],) + structure, feature_matrix, 52 | ow.n_actions, discount, ow.transition_probability, trajectories, epochs, 53 | learning_rate, l1=l1, l2=l2) 54 | 55 | plt.subplot(1, 2, 1) 56 | plt.pcolor(ground_r.reshape((grid_size, grid_size))) 57 | plt.colorbar() 58 | plt.title("Groundtruth reward") 59 | plt.subplot(1, 2, 2) 60 | plt.pcolor(r.reshape((grid_size, grid_size))) 61 | plt.colorbar() 62 | plt.title("Recovered reward") 63 | plt.show() 64 | 65 | if __name__ == '__main__': 66 | main(10, 0.9, 15, 2, 20, 50, 0.01, (3, 3)) 67 | -------------------------------------------------------------------------------- /options-using-q/basicOptions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | 5 | Sx = 7 6 | Sy = 7 7 | S = Sx*Sy 8 | P = 5 # there is a state for being in the taxi 9 | G = 4 10 | R = 0 11 | O = 2 12 | maxR = -999999 13 | hallways = [44, 27] 14 | rooms = [[ 15 | 8, 9, 10, 11, 12, 16 | 15, 16 ,17, 18, 19, 17 | 22, 23, 24, 25, 26, 27, 18 | 36, 37, 38, 39, 40, 19 | 44 20 | ]] 21 | 22 | walls = [[ 23 | 0, 1, 2, 3, 4, 5, 6, 24 | 7, 14, 21, 28, 35, 42, 25 | 43, 45, 46, 47, 48, 26 | 13, 20, 34, 41, 48, 27 | 51 28 | ]] 29 | 30 | # init_states = [0, 0, 1, 1, 2, 2, 3, 3] 31 | pickUps = [44, 27] 32 | A = 6 33 | T = 3000 34 | stepNo = 0 35 | avg_reward = np.zeros([Sx, Sy, P, G]) 36 | reward = np.zeros([P, G]) 37 | avg = np.zeros([T,1]) 38 | time_course = np.zeros([T, 3]) 39 | Q = 0.1*np.random.rand(S, O, A, P, G) 40 | for i in xrange(49): 41 | for o in xrange(O): 42 | for a in xrange(A): 43 | for p in xrange(P): 44 | for g in xrange(G): 45 | if i not in rooms[0]: 46 | Q[i, o, a, p, g] = 0 47 | 48 | V = [np.max(Q[:, o, :], axis=1) for o in xrange(O)] 49 | eta = 0.1 50 | gamma = 0.9 51 | epsilon = 0.1 52 | reward_course = np.zeros([T, 1]) 53 | reward_mean = np.zeros([T, 1]) 54 | 55 | option = 0 56 | 57 | stepsToGoal = np.zeros([T, 1]) 58 | maxV = -9999 59 | for t in xrange(T): 60 | plocation = pickUps[1 - option] 61 | pID = [i for i, x in enumerate(pickUps) if x == plocation][0] 62 | Goal = pickUps[option] 63 | p0 = plocation 64 | gID = [i for i, x in enumerate(pickUps) if x == Goal][0] 65 | s0 = np.random.choice([state for state in xrange(S) if state not in walls[0]]) 66 | state = [s0, pID, gID] #[{1..25} {1..5} {1..4}] 67 | for u in xrange(S**2): 68 | if (stepNo > 30): 69 | stepNo = 0 70 | break 71 | 72 | r = 0 73 | [V[option][s0, pID, gID], a0] = [np.max(Q[s0, option, :, pID, gID]), np.argmax(Q[s0, option, :, pID, gID])] 74 | if (np.random.rand(1) < epsilon): 75 | a0 = np.random.choice(A) 76 | 77 | 78 | if a0 == 4: 79 | if pID != 4: 80 | if s0 == pickUps[pID]: 81 | r = 1 82 | pID = 4 83 | stepNo = 0 84 | else: 85 | r = -1 86 | else: 87 | r = -1 88 | 89 | 90 | if a0 == 5: 91 | if (s0 == pickUps[gID]) and pID==4: 92 | stepsToGoal[t] = stepNo 93 | r = 10/float(stepNo) 94 | if maxR < r: 95 | maxR = r 96 | 97 | stepNo = 0 98 | else: 99 | r = -1 100 | 101 | 
102 | if a0 == 0: 103 | s1 = s0 - Sx 104 | if s1 in walls[0]: 105 | s1 = s1 + Sx 106 | r = -1 107 | 108 | 109 | if a0 == 1: 110 | s1 = s0 + Sx 111 | if s1 in walls[0]: 112 | s1 = s1 - Sx 113 | r = -1 114 | 115 | 116 | if a0 == 2: 117 | s1 = s0 - 1 118 | if s1 in walls[0]: 119 | s1 = s1 + 1 120 | r = -1 121 | 122 | 123 | if a0 == 3: 124 | s1 = s0 + 1 125 | if s1 in walls[0]: 126 | s1 = s1 - 1 127 | r = -1 128 | 129 | if a0 == 4: 130 | s1 = s0 131 | 132 | 133 | if a0 == 5: 134 | s1 = s0 135 | 136 | 137 | # learning step 138 | if t > 1000: 139 | R += r 140 | 141 | 142 | # print r 143 | FullR = R + r 144 | reward_course[t] = r 145 | reward_mean[t] = R/float(t+1) 146 | 147 | 148 | V[option][s1, pID, gID] = np.max(Q[s1, option, :, pID, gID]) 149 | 150 | 151 | if maxV < V[option][s1, pID, gID]: 152 | maxV = V[option][s1, pID, gID] 153 | 154 | 155 | time_course[t, 0] = V[option][s1, pID, gID] 156 | time_course[t, 1] = eta*(r+gamma*V[option][s1, pID, gID]) 157 | time_course[t, 2] = (1-eta)*Q[s0, option, a0, pID, gID] 158 | Q[s0, option, a0, pID, gID] = (1-eta)*Q[s0, option, a0, pID, gID] + \ 159 | eta*(r + gamma*V[option][s1, pID, gID]) 160 | if pID == 4: 161 | stepNo += 1 162 | 163 | 164 | if (s0 == pickUps[gID]) and (a0 == 5): 165 | stepNo = 0 166 | break 167 | 168 | 169 | s0 = s1 170 | 171 | 172 | avg[t] = np.mean(np.mean(np.mean(np.mean(Q[:, option, :])))) 173 | 174 | 175 | meanR = R/float(T-1000) 176 | fullMR = FullR/float(T) 177 | print meanR 178 | print fullMR 179 | print maxV 180 | policy = [np.max(Q[i, option, :, pID, gID]) for i in xrange(S)] 181 | policy_actions = [np.argmax(Q[i, option, :, pID, gID]) for i in xrange(S)] 182 | print len(policy) 183 | policy_actions = np.reshape(policy_actions, [7, 7]) 184 | 185 | 186 | for i in xrange(7): 187 | for j in xrange(7): 188 | print "{0} ".format(policy_actions[i, j]), 189 | 190 | print " " -------------------------------------------------------------------------------- /irl/value_iteration.py: -------------------------------------------------------------------------------- 1 | """ 2 | Find the value function associated with a policy. Based on Sutton & Barto, 1998. 3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | import numpy as np 9 | 10 | def value(policy, n_states, transition_probabilities, reward, discount, 11 | threshold=1e-2): 12 | """ 13 | Find the value function associated with a policy. 14 | 15 | policy: List of action ints for each state. 16 | n_states: Number of states. int. 17 | transition_probabilities: Function taking (state, action, state) to 18 | transition probabilities. 19 | reward: Vector of rewards for each state. 20 | discount: MDP discount factor. float. 21 | threshold: Convergence threshold, default 1e-2. float. 22 | -> Array of values for each state 23 | """ 24 | v = np.zeros(n_states) 25 | 26 | diff = float("inf") 27 | while diff > threshold: 28 | diff = 0 29 | for s in range(n_states): 30 | vs = v[s] 31 | a = policy[s] 32 | v[s] = sum(transition_probabilities[s, a, k] * 33 | (reward[k] + discount * v[k]) 34 | for k in range(n_states)) 35 | diff = max(diff, abs(vs - v[s])) 36 | 37 | return v 38 | 39 | def optimal_value(n_states, n_actions, transition_probabilities, reward, 40 | discount, threshold=1e-2): 41 | """ 42 | Find the optimal value function. 43 | 44 | n_states: Number of states. int. 45 | n_actions: Number of actions. int. 46 | transition_probabilities: Function taking (state, action, state) to 47 | transition probabilities. 48 | reward: Vector of rewards for each state. 
49 | discount: MDP discount factor. float. 50 | threshold: Convergence threshold, default 1e-2. float. 51 | -> Array of values for each state 52 | """ 53 | 54 | v = np.zeros(n_states) 55 | 56 | diff = float("inf") 57 | while diff > threshold: 58 | diff = 0 59 | for s in range(n_states): 60 | max_v = float("-inf") 61 | for a in range(n_actions): 62 | tp = transition_probabilities[s, a, :] 63 | # max_v = max(max_v, sum(reward + np.dot(tp, discount*v))) 64 | max_v = max(max_v, np.dot(tp, reward + discount*v)) 65 | 66 | new_diff = abs(v[s] - max_v) 67 | if new_diff > diff: 68 | diff = new_diff 69 | v[s] = max_v 70 | 71 | return v 72 | 73 | def find_policy(n_states, n_actions, transition_probabilities, reward, discount, 74 | threshold=1e-2, v=None, stochastic=True): 75 | """ 76 | Find the optimal policy. 77 | 78 | n_states: Number of states. int. 79 | n_actions: Number of actions. int. 80 | transition_probabilities: Function taking (state, action, state) to 81 | transition probabilities. 82 | reward: Vector of rewards for each state. 83 | discount: MDP discount factor. float. 84 | threshold: Convergence threshold, default 1e-2. float. 85 | v: Value function (if known). Default None. 86 | stochastic: Whether the policy should be stochastic. Default True. 87 | -> Action probabilities for each state or action int for each state 88 | (depending on stochasticity). 89 | """ 90 | 91 | if v is None: 92 | v = optimal_value(n_states, n_actions, transition_probabilities, reward, 93 | discount, threshold) 94 | 95 | if stochastic: 96 | # Get Q using equation 9.2 from Ziebart's thesis. 97 | Q = np.zeros((n_states, n_actions)) 98 | for i in range(n_states): 99 | for j in range(n_actions): 100 | p = transition_probabilities[i, j, :] 101 | Q[i, j] = p.dot(reward + discount*v) 102 | Q -= Q.max(axis=1).reshape((n_states, 1)) # For numerical stability. 103 | Q = np.exp(Q)/np.exp(Q).sum(axis=1).reshape((n_states, 1)) 104 | return Q 105 | 106 | def _policy(s): 107 | return max(range(n_actions), 108 | key=lambda a: sum(transition_probabilities[s, a, k] * 109 | (reward[k] + discount * v[k]) 110 | for k in range(n_states))) 111 | policy = np.array([_policy(s) for s in range(n_states)]) 112 | return policy 113 | 114 | if __name__ == '__main__': 115 | # Quick unit test using gridworld. 
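    # Builds a 3x3 gridworld (wind 0.3, discount 0.9) and checks that value()
    # under the hand-coded optimal deterministic policy matches optimal_value().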
116 | import mdp.gridworld as gridworld 117 | gw = gridworld.Gridworld(3, 0.3, 0.9) 118 | v = value([gw.optimal_policy_deterministic(s) for s in range(gw.n_states)], 119 | gw.n_states, 120 | gw.transition_probability, 121 | [gw.reward(s) for s in range(gw.n_states)], 122 | gw.discount) 123 | assert np.isclose(v, 124 | [5.7194282, 6.46706692, 6.42589811, 125 | 6.46706692, 7.47058224, 7.96505174, 126 | 6.42589811, 7.96505174, 8.19268666], 1).all() 127 | opt_v = optimal_value(gw.n_states, 128 | gw.n_actions, 129 | gw.transition_probability, 130 | [gw.reward(s) for s in range(gw.n_states)], 131 | gw.discount) 132 | assert np.isclose(v, opt_v).all() 133 | -------------------------------------------------------------------------------- /options-using-q/qLearning.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | 5 | Sx = 5 6 | Sy = 5 7 | S = Sx*Sy 8 | P = 5 # there is a state for being in the taxi 9 | G = 4 10 | R = 0 11 | maxR = -999999 12 | pickUps = [0, Sx-1, S-Sx, S-2] 13 | A = 6 14 | T = 3000 15 | stepNo = 0 16 | avg_reward = np.zeros([Sx, Sy, P, G]) 17 | # reward = np.zeros([P, G]) 18 | rewards = np.ones((S,A,P,G)) 19 | rewards *= -2 20 | avg = np.zeros([T,1]) 21 | time_course = np.zeros([T, 3]) 22 | Q = 0.1*np.random.rand(S, A, P, G) 23 | V = np.max(Q, axis=1) 24 | eta = 0.1 25 | gamma = 0.9 26 | epsilon = 0.1 27 | reward_course = np.zeros([T, 1]) 28 | reward_mean = np.zeros([T, 1]) 29 | 30 | 31 | stepsToGoal = np.zeros([T, 1]) 32 | maxV = -9999 33 | for t in range(T): 34 | plocation = 20 35 | pID = [i for i, x in enumerate(pickUps) if x == plocation][0] 36 | Goal = 24 37 | p0 = plocation + 1 38 | gID = [i for i, x in enumerate(pickUps) if x == Goal-1][0] 39 | s0 = np.random.choice(S) 40 | state = [s0, pID, gID] #[{1..25} {1..5} {1..4}] 41 | for u in range(S**4): 42 | if (stepNo > 30): 43 | stepNo = 0 44 | break 45 | 46 | 47 | r = 0 48 | [V[s0, pID, gID], a0] = [np.max(Q[s0, :, pID, gID]), np.argmax(Q[s0, :, pID, gID])] 49 | rewards[s0, a0, pID, gID] = 0 50 | if (np.random.rand(1) < epsilon): 51 | a0 = np.random.choice(A) 52 | 53 | 54 | if a0 == 4: 55 | if pID != 4: 56 | if s0 == pickUps[pID]: 57 | r = 1 58 | rewards[s0, a0, pID, gID] = 1 59 | pID = 4 60 | stepNo = 0 61 | else: 62 | r = -1 63 | rewards[s0, a0, pID, gID] = -1 64 | else: 65 | r = -1 66 | rewards[s0, a0, pID, gID] = -1 67 | 68 | 69 | if a0 == 5: 70 | if (s0 == pickUps[gID]) and pID==4: 71 | stepsToGoal[t] = stepNo 72 | r = 10/float(stepNo) 73 | rewards[s0, a0, pID, gID] = 10/float(stepNo) 74 | if maxR < r: 75 | maxR = r 76 | 77 | 78 | stepNo = 0 79 | else: 80 | r = -1 81 | rewards[s0, a0, pID, gID] = -1 82 | 83 | 84 | if a0 == 0: #nagore 85 | s1 = s0 - Sx 86 | if s1 < 0: 87 | s1 = s1 + Sx 88 | r = -1 89 | rewards[s0, a0, pID, gID] = -1 90 | 91 | 92 | if a0 == 1: 93 | s1 = s0 + Sx #nadolo 94 | if s1 > 24: 95 | s1 = s1 - Sx 96 | r = -1 97 | rewards[s0, a0, pID, gID] = -1 98 | 99 | if a0 == 2: 100 | s1 = s0-1 #nalqvo 101 | if s1==-1 or s1==4 or s1==9 or s1==14 or s1==19: 102 | s1=s1+1 103 | r = -1 104 | rewards[s0, a0, pID, gID] = -1 105 | 106 | if s1==1 or s1==6 or s1==20 or s1==15 or s1==17 or s1==22: 107 | s1 = s1+1 108 | r = -1 109 | rewards[s0, a0, pID, gID] = -1 110 | 111 | if a0 == 3: 112 | s1 = s0 + 1 #nadqsno 113 | if s1 == 5 or s1 == 10 or s1 == 15 or s1 == 20 or s1==25: 114 | s1 = s1 - 1 115 | r = -1 116 | rewards[s0, a0, pID, gID] = -1 117 | 118 | if s1 == 2 or s1 == 7 or s1 == 21 or s1 == 16 or s1 == 18 or s1 == 23: 119 | s1 = s1 - 1 
120 | r = -1 121 | rewards[s0, a0, pID, gID] = -1 122 | 123 | 124 | if a0 == 4: 125 | s1 = s0 126 | 127 | 128 | if a0 == 5: #vzemi pacient 129 | s1 = s0 #na gol 130 | 131 | 132 | # learning step 133 | if t > 1000: 134 | R += r 135 | 136 | 137 | # print r 138 | FullR = R + r 139 | reward_course[t] = r 140 | reward_mean[t] = R/float(t+1) 141 | 142 | 143 | V[s1, pID, gID] = np.max(Q[s1, :, pID, gID]) 144 | 145 | 146 | if maxV < V[s1, pID, gID]: 147 | maxV = V[s1, pID, gID] 148 | 149 | 150 | time_course[t, 0] = V[s1, pID, gID] 151 | time_course[t, 1] = eta*(r+gamma*V[s1, pID, gID]) 152 | time_course[t, 2] = (1-eta)*Q[s0, a0, pID, gID] 153 | Q[s0, a0, pID, gID] = (1-eta)*Q[s0, a0, pID, gID] + eta*(r + gamma*V[s1,pID,gID]) 154 | if pID == 4: 155 | stepNo += 1 156 | 157 | 158 | if (s0 == pickUps[gID]) and (a0 == 5) and pID == 4: 159 | stepNo = 0 160 | break 161 | 162 | 163 | s0 = s1 164 | 165 | 166 | avg[t] = np.mean(np.mean(np.mean(np.mean(Q)))) 167 | 168 | 169 | meanR = R/float(T-1000) 170 | fullMR = FullR/float(T) 171 | print(meanR) 172 | print(fullMR) 173 | print(maxV) 174 | policy = [np.max(Q[i, :, pID, gID]) for i in range(S)] 175 | policy_actions = [np.argmax(Q[i, :, pID, gID]) for i in range(S)] 176 | print(len(policy)) 177 | policy_actions = np.reshape(policy_actions, [5, 5]) 178 | policy = np.reshape(policy, [5, 5]) 179 | 180 | for j in range(5): 181 | for i in range(5): 182 | print("{0} ".format(policy_actions[j, i]), end=' ') 183 | 184 | print(" ") 185 | print("REWARDS:") 186 | i = 0 187 | for j in range(25): 188 | # for i in range(6): 189 | print("{0} ".format(rewards[j, i, pID, gID]), end=' ') 190 | 191 | print(" ") -------------------------------------------------------------------------------- /hierarchicalrl/options_maxent.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run maximum entropy inverse reinforcement learning on the options gridworld MDP. 3 | 4 | Todor Davchev, 2017 5 | t.b.davchev@ed.ac.uk 6 | """ 7 | import matplotlib 8 | matplotlib.use('Agg') 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | import csv 12 | 13 | import sdp_maxent as maxent 14 | import options_grid_world as options_gridworld 15 | 16 | 17 | def main(grid_size, discount, n_trajectories, epochs, learning_rate): 18 | """ 19 | Run maximum entropy inverse reinforcement learning on the gridworld MDP. 20 | 21 | Plots the reward function. 22 | 23 | grid_size: Grid size. int. 24 | discount: MDP discount factor. float. 25 | n_trajectories: Number of sampled trajectories. int. 26 | epochs: Gradient descent iterations. int. 27 | learning_rate: Gradient descent learning rate. float. 
28 | """ 29 | 30 | wind = 0.3 31 | trajectory_length = 3*grid_size/2 32 | 33 | walls = [ 34 | (5, 0), (5, 1), (5, 3), (5, 4), (5, 5), (5, 6), (5, 7), (5, 8), (5, 10), 35 | (0, 5), (2, 5), (3, 5), (4, 5), 36 | (6, 6), (7, 6), (9, 6), (10, 6) 37 | ] 38 | 39 | options = [ 40 | {'init_set': (1, 5), 'termination': (5, 2), 'room': 0, 'id': 0, 41 | "min": (-1, -1), "max": (5, 5)}, 42 | {'init_set': (5, 2), 'termination': (1, 5), 'room': 0, 'id': 1, 43 | "min": (-1, -1), "max": (5, 5)}, 44 | {'init_set': (5, 2), 'termination': (8, 6), 'room': 1, 'id': 2, 45 | "min": (5, -1), "max": (11, 6)}, 46 | {'init_set': (8, 6), 'termination': (5, 2), 'room': 1, 'id': 3, 47 | "min": (5, -1), "max": (11, 6)}, 48 | {'init_set': (8, 6), 'termination': (5, 9), 'room': 2, 'id': 4, 49 | 'min': (5, 6), 'max': (11, 11)}, 50 | {'init_set': (5, 9), 'termination': (8, 6), 'room': 2, 'id': 5, 51 | 'min': (5, 6), 'max': (11, 11)}, 52 | {'init_set': (5, 9), 'termination': (1, 5), 'room': 3, 'id': 6, 53 | 'min': (-1, 5), "max": (5, 11)}, 54 | {'init_set': (1, 5), 'termination': (5, 9), 'room': 3, 'id': 7, 55 | 'min': (-1, 5), "max": (5, 11)} 56 | ] 57 | 58 | rooms = [ 59 | [ 60 | 0, 1, 2, 3, 4, 61 | 11, 12, 13, 14, 15, 62 | 22, 23, 24, 25, 26, 63 | 33, 34, 35, 36, 37, 64 | 44, 45, 46, 47, 48, 65 | 56, 27 66 | ], 67 | [ 68 | 6, 7, 8, 9, 10, 69 | 17, 18, 19, 20, 21, 70 | 28, 29, 30, 31, 32, 71 | 39, 40, 41, 42, 43, 72 | 50, 51, 52, 53, 54, 73 | 61, 62, 63, 64, 65, 74 | 74, 27 75 | ], 76 | [ 77 | 83, 84, 85, 86, 87, 78 | 94, 95, 96, 97, 98, 79 | 105, 106, 107, 108, 109, 80 | 116, 117, 118, 119, 120, 81 | 104, 74 82 | ], 83 | [ 84 | 66, 67, 68, 69, 70, 85 | 77, 78, 79, 80, 81, 86 | 88, 89, 90, 91, 92, 87 | 99, 100, 101, 102, 103, 88 | 110, 111, 112, 113, 114, 89 | 56, 104 90 | ] 91 | ] 92 | g_world = options_gridworld.Large_Gridworld(grid_size, walls, options, rooms, wind, discount) 93 | trajectories = [] 94 | for opt in options: 95 | trajectories.append( 96 | g_world.generate_intra_option_trajectories( 97 | n_trajectories, 98 | trajectory_length, 99 | g_world.intra_option_optimal_policy, 100 | opt)) 101 | 102 | global_trajectories = g_world.generate_option_option_trajectories( 103 | trajectories, n_trajectories, 104 | g_world.option_option_optimal_policy, 105 | g_world.intra_option_optimal_policy) 106 | feature_matrix = g_world.feature_matrix() 107 | option_feature_matrix = g_world.o_feature_matrix() 108 | #the reward needs to be changed not per room but per option.. 
109 | ground_r = np.array([g_world.reward(state) for state in range(grid_size**2)]) 110 | ground_opt_r = np.array([g_world.opt_reward(opt) for opt in range(len(options))]) 111 | options_states = [rooms[opts["room"]] for opts in options] 112 | print("Compute the reward.") 113 | reward, o_reward = maxent.irl( 114 | options_states, feature_matrix, 115 | option_feature_matrix, g_world.n_actions, 116 | g_world.n_options, discount, g_world.options_transition_probability, 117 | g_world.improved_transition_probability, trajectories, global_trajectories, 118 | epochs, learning_rate, g_world.int_to_point, options) 119 | result = np.zeros((len(options),grid_size**2)) 120 | option_result = np.zeros(8) 121 | writer = csv.writer(open("results/results.csv", 'w')) 122 | with open("results/opt_results.csv", 'wb') as csvfile: 123 | opt_writer = csv.writer(csvfile) 124 | opt_writer.writerow(o_reward) 125 | 126 | with open("results/results.csv", 'wb') as csvfile: 127 | writer = csv.writer(csvfile) 128 | for o in range(len(options)): 129 | for broi, value in enumerate(options_states[o]): 130 | result[o][value] = reward[o][broi] 131 | writer.writerow(result[o]) 132 | 133 | # plt.savefig('/tmp/test.png') 134 | # plt.subplot(1, 2, 1) 135 | # plt.pcolor(ground_r.reshape((grid_size, grid_size))) 136 | # plt.colorbar() 137 | # plt.title("Groundtruth reward") 138 | # plt.subplot(1, 2, 2) 139 | # plt.pcolor(result[o].reshape((grid_size, grid_size))) 140 | # plt.colorbar() 141 | # plt.title("Recovered reward") 142 | 143 | # with open('thefile.csv', 'rb') as f: 144 | # data = list(csv.reader(f)) 145 | 146 | 147 | plt.subplot(1, 2, 1) 148 | plt.pcolor(ground_opt_r.reshape((4, 2))) 149 | plt.colorbar() 150 | plt.title("Groundtruth reward") 151 | plt.subplot(1, 2, 2) 152 | plt.pcolor(o_reward.reshape((4, 2))) 153 | plt.colorbar() 154 | plt.title("Recovered reward") 155 | plt.show() 156 | 157 | if __name__ == '__main__': 158 | main(11, 0.01, 20, 200, 0.01) 159 | -------------------------------------------------------------------------------- /options-using-q/optionsUsing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | 5 | Sx = 13 6 | Sy = 8 7 | S = Sx*Sy 8 | P = 5 # there is a state for being in the taxi 9 | G = 4 10 | R = 0 11 | O = 4 12 | maxR = -999999 13 | hallways = [80, 45, 100] 14 | rooms = [[ 15 | 14, 15, 16, 17, 18, 16 | 27, 28, 29, 30, 31, 17 | 40, 41, 42, 43, 44, 45, 18 | 53, 54, 55, 56, 57, 19 | 66, 67, 68, 69, 70, 20 | 80 21 | ], 22 | [ 23 | 20, 21, 22, 23, 24, 24 | 33, 34, 35, 36, 37, 25 | 45, 46, 47, 48, 49, 50, 26 | 59, 60, 61, 62, 63, 27 | 72, 73, 74, 75, 76, 28 | 85, 86, 87, 88, 89, 29 | 100 30 | ]] 31 | 32 | walls = [[ 33 | 0, 1, 2, 3, 4, 5, 6, 34 | 13, 26, 39, 52, 65, 35 | 78, 79, 81, 82, 83, 36 | 84, 71, 58, 32, 19, 37 | 93 38 | ], 39 | [ 40 | 7, 8, 9, 10, 11, 12, 41 | 25, 38, 51, 64, 77, 90, 103, 42 | 98, 99, 101, 102, 103, 43 | 97, 84, 71, 58, 32, 19, 6, 44 | 113 45 | ]] 46 | 47 | # init_states = [0, 0, 1, 1, 2, 2, 3, 3] 48 | pickUps = [80, 45, 45, 100] 49 | A = 6 50 | T = 3000 51 | stepNo = 0 52 | avg_reward = np.zeros([Sx, Sy, P, G]) 53 | reward = np.zeros([P, G]) 54 | avg = np.zeros([T,1]) 55 | time_course = np.zeros([T, 3]) 56 | option = 2 57 | option_goal = [45, 80, 100, 45] 58 | room_no = [0, 0, 1, 1] 59 | Q = 0.1*np.random.rand(S, O, A, P, G) 60 | for i in xrange(S): 61 | for o in xrange(O): 62 | for a in xrange(A): 63 | for p in xrange(P): 64 | for g in xrange(G): 65 | if i not in rooms[room_no[o]]: 66 | Q[i, 
o, a, p, g] = 0 67 | 68 | V = [np.max(Q[:, o, :, :, :], axis=1) for o in xrange(O)] 69 | eta = 0.1 70 | gamma = 0.9 71 | epsilon = 0.1 72 | reward_course = np.zeros([T, 1]) 73 | reward_mean = np.zeros([T, 1]) 74 | 75 | stepsToGoal = np.zeros([T, 1]) 76 | maxV = -9999 77 | for option in [0, 2]: 78 | for t in xrange(T): 79 | plocation = pickUps[option] 80 | pID = [i for i, x in enumerate(pickUps) if x == plocation][0] 81 | if pID == 1: 82 | pID = 2 # fix logic issue 83 | Goal = option_goal[option] 84 | p0 = plocation 85 | gID = [i for i, x in enumerate(pickUps) if x == Goal][0] 86 | s0 = np.random.choice([state for state in xrange(S) if state in rooms[room_no[option]]]) 87 | state = [s0, pID, gID] #[{1..25} {1..5} {1..4}] 88 | for u in xrange(S**2): 89 | if (stepNo > 30): 90 | stepNo = 0 91 | break 92 | 93 | r = 0 94 | [V[option][s0, pID, gID], a0] = [np.max(Q[s0, option, :, pID, gID]), np.argmax(Q[s0, option, :, pID, gID])] 95 | if (np.random.rand(1) < epsilon): 96 | a0 = np.random.choice(A) 97 | 98 | 99 | if a0 == 4: 100 | if pID != 4: 101 | if s0 == pickUps[pID]: 102 | r = 1 103 | pID = 4 104 | stepNo = 0 105 | else: 106 | r = -1 107 | else: 108 | r = -1 109 | 110 | 111 | if a0 == 5: 112 | if (s0 == pickUps[gID]) and pID==4: 113 | stepsToGoal[t] = stepNo 114 | r = 10/float(stepNo) 115 | if maxR < r: 116 | maxR = r 117 | 118 | stepNo = 0 119 | else: 120 | r = -1 121 | 122 | 123 | if a0 == 0: 124 | s1 = s0 - Sx 125 | if s1 not in rooms[room_no[option]]: 126 | s1 = s1 + Sx 127 | r = -1 128 | 129 | 130 | if a0 == 1: 131 | s1 = s0 + Sx 132 | if s1 not in rooms[room_no[option]]: 133 | s1 = s1 - Sx 134 | r = -1 135 | 136 | 137 | if a0 == 2: 138 | s1 = s0 - 1 139 | if s1 not in rooms[room_no[option]]: 140 | s1 = s1 + 1 141 | r = -1 142 | 143 | 144 | if a0 == 3: 145 | s1 = s0 + 1 146 | if s1 not in rooms[room_no[option]]: 147 | s1 = s1 - 1 148 | r = -1 149 | 150 | if a0 == 4: 151 | s1 = s0 152 | 153 | 154 | if a0 == 5: 155 | s1 = s0 156 | 157 | 158 | # learning step 159 | if t > 1000: 160 | R += r 161 | 162 | 163 | # print r 164 | FullR = R + r 165 | reward_course[t] = r 166 | reward_mean[t] = R/float(t+1) 167 | 168 | 169 | V[option][s1, pID, gID] = np.max(Q[s1, option, :, pID, gID]) 170 | 171 | 172 | if maxV < V[option][s1, pID, gID]: 173 | maxV = V[option][s1, pID, gID] 174 | 175 | 176 | time_course[t, 0] = V[option][s1, pID, gID] 177 | time_course[t, 1] = eta*(r+gamma*V[option][s1, pID, gID]) 178 | time_course[t, 2] = (1-eta)*Q[s0, option, a0, pID, gID] 179 | Q[s0, option, a0, pID, gID] = (1-eta)*Q[s0, option, a0, pID, gID] + \ 180 | eta*(r + gamma*V[option][s1, pID, gID]) 181 | if pID == 4: 182 | stepNo += 1 183 | 184 | 185 | if (s0 == pickUps[gID]) and (a0 == 5): 186 | stepNo = 0 187 | break 188 | 189 | 190 | s0 = s1 191 | 192 | 193 | avg[t] = np.mean(np.mean(np.mean(np.mean(Q[:, option, :])))) 194 | 195 | 196 | meanR = R/float(T-1000) 197 | fullMR = FullR/float(T) 198 | print meanR 199 | print fullMR 200 | print maxV 201 | policy = [np.max(Q[i, option, :, gID]) for i in xrange(S)] 202 | # value_0 = [V[0][state, 1] for state in xrange(S)] 203 | # value_2 = [V[1][state, 3] for state in xrange(S)] 204 | # visited_states = np.reshape(visited_states, [8, 13]) 205 | policy_actions_0 = [np.argmax(Q[i, 0, :, 4, 1]) for i in xrange(S)] 206 | policy_actions_2 = [np.argmax(Q[i, 2, :, 4, 3]) for i in xrange(S)] 207 | print len(policy) 208 | policy_actions_0 = np.reshape(policy_actions_0, [8, 13]) 209 | policy_actions_2 = np.reshape(policy_actions_2, [8, 13]) 210 | # value_0 = np.reshape(value_0, [8, 
13]) 211 | # value_2 = np.reshape(value_2, [8, 13]) 212 | policy = np.reshape(policy, [8, 13]) 213 | 214 | for j in xrange(8): 215 | for i in xrange(13): 216 | print "%d " % int(policy_actions_0[j, i]), 217 | 218 | print " " 219 | print "-------------------------------------" 220 | for j in xrange(8): 221 | for i in xrange(13): 222 | print "%d " % int(policy_actions_2[j, i]), 223 | 224 | print " " 225 | -------------------------------------------------------------------------------- /options-using-q/optionsUsing-nopid.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import math 4 | 5 | 6 | Sx = 13 7 | Sy = 8 8 | S = Sx*Sy 9 | # P = 5 # there is a state for being in the taxi 10 | G = 4 11 | R = 0 12 | O = 4 13 | maxR = -999999 14 | hallways = [80, 45, 100] 15 | rooms = [[ 16 | 14, 15, 16, 17, 18, 17 | 27, 28, 29, 30, 31, 18 | 40, 41, 42, 43, 44, 45, 19 | 53, 54, 55, 56, 57, 20 | 66, 67, 68, 69, 70, 21 | 80 22 | ], 23 | [ 24 | 20, 21, 22, 23, 24, 25 | 33, 34, 35, 36, 37, 26 | 45, 46, 47, 48, 49, 50, 27 | 59, 60, 61, 62, 63, 28 | 72, 73, 74, 75, 76, 29 | 85, 86, 87, 88, 89, 30 | 100 31 | ]] 32 | 33 | walls = [[ 34 | 0, 1, 2, 3, 4, 5, 6, 35 | 13, 26, 39, 52, 65, 36 | 78, 79, 81, 82, 83, 37 | 84, 71, 58, 32, 19, 38 | 93 39 | ], 40 | [ 41 | 7, 8, 9, 10, 11, 12, 42 | 25, 38, 51, 64, 77, 90, 103, 43 | 98, 99, 101, 102, 103, 44 | 97, 84, 71, 58, 32, 19, 6, 45 | 113 46 | ]] 47 | visited_states = ['r' for _ in xrange(S)] 48 | # init_states = [0, 0, 1, 1, 2, 2, 3, 3] 49 | pickUps = [80, 45, 45, 100] 50 | A = 4#6 51 | T = 30000 52 | stepNo = 0 53 | avg = np.zeros([T,1]) 54 | time_course = np.zeros([T, 3]) 55 | options_used = [] 56 | option = 0 57 | options_used.append(option) 58 | option_goal = [45, 80, 100, 45] 59 | endGoal = option_goal[2] 60 | room_no = [0, 0, 1, 1] 61 | Q = 0.1*np.random.rand(S, O, A, G)#0.1*np.random.rand(S, O, A, P, G) 62 | goalReached = False 63 | for i in xrange(S): 64 | for o in xrange(O): 65 | for a in xrange(A): 66 | # for p in xrange(P): 67 | for g in xrange(G): 68 | if i not in rooms[room_no[o]]: 69 | # Q[i, o, a, p, g] = 0 70 | Q[i, o, a, g] = 0 71 | 72 | V = [np.max(Q[:, o, :], axis=2) for o in xrange(O)] 73 | eta = 0.1 74 | gamma = 0.9 75 | epsilon = 0.1 76 | reward_course = np.zeros([T, 1]) 77 | reward_mean = np.zeros([T, 1]) 78 | 79 | stepsToGoal = np.zeros([T, 1]) 80 | maxV = -9999 81 | switched = False 82 | u=0 83 | for t in xrange(T): 84 | Goal = option_goal[option] 85 | gID = [i for i, x in enumerate(pickUps) if x == Goal][0] 86 | s0 = np.random.choice([state for state in xrange(S) if state in rooms[room_no[option]] and state != option_goal[option]]) 87 | 88 | state = [s0, gID] #[s0, pID, gID] #[{1..25} {1..5} {1..4}] 89 | for u in xrange(S**2): 90 | # print s0 91 | visited_states[s0] = 'g' 92 | 93 | r = 0 94 | [V[option][s0, gID], a0] = [np.max(Q[s0, option, :, gID]), np.argmax(Q[s0, option, :, gID])] 95 | if (np.random.rand(1) < epsilon): 96 | a0 = np.random.choice(A) 97 | 98 | if (s0 == pickUps[gID]): 99 | stepsToGoal[t] = stepNo 100 | if s0 == endGoal: 101 | r = 1 102 | goalReached = True 103 | else: 104 | r = 1 105 | goalReached = True 106 | 107 | if maxR < r: 108 | maxR = r 109 | 110 | stepNo = 0 111 | 112 | if a0 == 0 and not goalReached: 113 | s1 = s0 - Sx 114 | if s1 not in rooms[room_no[option]]: 115 | s1 = s1 + Sx 116 | # r = -1 117 | 118 | 119 | if a0 == 1 and not goalReached: 120 | s1 = s0 + Sx 121 | if s1 not in rooms[room_no[option]]: 122 | s1 = s1 - Sx 123 | # r = -1 
124 | 125 | 126 | if a0 == 2 and not goalReached: 127 | s1 = s0 - 1 128 | if s1 not in rooms[room_no[option]]: 129 | s1 = s1 + 1 130 | # r = -1 131 | 132 | 133 | if a0 == 3 and not goalReached: 134 | s1 = s0 + 1 135 | if s1 not in rooms[room_no[option]]: 136 | s1 = s1 - 1 137 | # r = -1 138 | 139 | # learning step 140 | if t > 100: 141 | R += r 142 | 143 | # print r 144 | FullR = R + r 145 | reward_course[t] = r 146 | reward_mean[t] = R/float(t+1) 147 | 148 | V[option][s1, gID] = np.max(Q[s1, option, :, gID]) 149 | 150 | if maxV < V[option][s1, gID]: 151 | maxV = V[option][s1, gID] 152 | 153 | time_course[t, 0] = V[option][s1, gID] 154 | time_course[t, 1] = eta*(r+gamma*V[option][s1, gID]) 155 | time_course[t, 2] = (1-eta)*Q[s0, option, a0, gID] 156 | Q[s0, option, a0, gID] = (1-eta)*Q[s0, option, a0, gID] + \ 157 | eta*(r + gamma*V[option][s1, gID]) 158 | 159 | stepNo += 1 160 | if (s0 == endGoal): 161 | stepNo = 0 162 | option = 0 163 | switched = False 164 | goalReached = False 165 | break 166 | 167 | if (s0 == pickUps[gID]) and s0 != endGoal: 168 | stepNo = 0 169 | option = 2 170 | switched = True 171 | goalReached = False 172 | options_used.append(option) 173 | break 174 | 175 | s0 = s1 176 | 177 | avg[t] = np.mean(np.mean(np.mean(np.mean(Q[:, option, :])))) 178 | 179 | 180 | meanR = R/float(T-1000) 181 | fullMR = FullR/float(T) 182 | print meanR 183 | print fullMR 184 | print maxV 185 | # policy = [np.max(Q[i, option, :, pID, gID]) for i in xrange(S)] 186 | # policy_actions_0 = [np.argmax(Q[i, 0, :, pID, gID]) for i in xrange(S)] 187 | # policy_actions_2 = [np.argmax(Q[i, 2, :, pID, gID]) for i in xrange(S)] 188 | policy = [np.max(Q[i, option, :, gID]) for i in xrange(S)] 189 | value_0 = [V[0][state, 1] for state in xrange(S)] 190 | value_2 = [V[1][state, 3] for state in xrange(S)] 191 | visited_states = np.reshape(visited_states, [8, 13]) 192 | policy_actions_0 = [np.argmax(Q[i, 0, :, 1]) for i in xrange(S)] 193 | policy_actions_2 = [np.argmax(Q[i, 2, :, 3]) for i in xrange(S)] 194 | print len(policy) 195 | policy_actions_0 = np.reshape(policy_actions_0, [8, 13]) 196 | policy_actions_2 = np.reshape(policy_actions_2, [8, 13]) 197 | value_0 = np.reshape(value_0, [8, 13]) 198 | value_2 = np.reshape(value_2, [8, 13]) 199 | policy = np.reshape(policy, [8, 13]) 200 | 201 | for j in xrange(8): 202 | for i in xrange(13): 203 | print "%d " % int(policy_actions_0[j, i]), 204 | 205 | print " " 206 | print "-------------------------------------" 207 | for j in xrange(8): 208 | for i in xrange(13): 209 | print "%d " % int(policy_actions_2[j, i]), 210 | 211 | print " " 212 | print "-------------------------------------" 213 | for j in xrange(8): 214 | for i in xrange(13): 215 | print "%s " % visited_states[j, i], 216 | 217 | print " " 218 | # for j in xrange(8): 219 | # for i in xrange(13): 220 | # print "{0} ".format(int(policy[j, i])), 221 | 222 | # print " " -------------------------------------------------------------------------------- /options-using-q/basicOption-tworooms.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | 5 | Sx = 13 6 | Sy = 8 7 | S = Sx*Sy 8 | P = 5 # there is a state for being in the taxi 9 | G = 4 10 | R = 0 11 | O = 4 12 | maxR = -999999 13 | hallways = [80, 45, 100] 14 | rooms = [[ 15 | 14, 15, 16, 17, 18, 16 | 27, 28, 29, 30, 31, 17 | 40, 41, 42, 43, 44, 45, 18 | 53, 54, 55, 56, 57, 19 | 66, 67, 68, 69, 70, 20 | 80 21 | ], 22 | [ 23 | 20, 21, 22, 23, 24, 24 | 33, 34, 35, 36, 37, 25 | 
45, 46, 47, 48, 49, 50, 26 | 59, 60, 61, 62, 63, 27 | 72, 73, 74, 75, 76, 28 | 85, 86, 87, 88, 89, 29 | 100 30 | ]] 31 | 32 | walls = [[ 33 | 0, 1, 2, 3, 4, 5, 6, 34 | 13, 26, 39, 52, 65, 35 | 78, 79, 81, 82, 83, 36 | 84, 71, 58, 32, 19, 37 | 93 38 | ], 39 | [ 40 | 7, 8, 9, 10, 11, 12, 41 | 25, 38, 51, 64, 77, 90, 103, 42 | 98, 99, 101, 102, 103, 43 | 97, 84, 71, 58, 32, 19, 6, 44 | 113 45 | ]] 46 | 47 | # init_states = [0, 0, 1, 1, 2, 2, 3, 3] 48 | pickUps = [80, 45, 45, 100] 49 | A = 6 50 | T = 30000 51 | stepNo = 0 52 | avg_reward = np.zeros([Sx, Sy, P, G]) 53 | reward = np.zeros([P, G]) 54 | avg = np.zeros([T,1]) 55 | time_course = np.zeros([T, 3]) 56 | options_used = [] 57 | option = 0 58 | options_used.append(option) 59 | option_goal = [45, 80, 100, 45] 60 | endGoal = option_goal[2] 61 | room_no = [0, 0, 1, 1] 62 | Q = 0.1*np.random.rand(S, O, A, P, G) 63 | for i in xrange(S): 64 | for o in xrange(O): 65 | for a in xrange(A): 66 | for p in xrange(P): 67 | for g in xrange(G): 68 | if i not in rooms[room_no[o]]: 69 | Q[i, o, a, p, g] = 0 70 | 71 | V = [np.max(Q[:, o, :], axis=2) for o in xrange(O)] 72 | eta = 0.1 73 | gamma = 0.9 74 | epsilon = 0.1 75 | reward_course = np.zeros([T, 1]) 76 | reward_mean = np.zeros([T, 1]) 77 | 78 | stepsToGoal = np.zeros([T, 1]) 79 | maxV = -9999 80 | switched = False 81 | for t in xrange(T): 82 | Goal = option_goal[option] 83 | gID = [i for i, x in enumerate(pickUps) if x == Goal][0] 84 | if not switched: 85 | plocation = pickUps[option] 86 | p0 = plocation 87 | pID = [i for i, x in enumerate(pickUps) if x == plocation][0] 88 | s0 = np.random.choice([state for state in xrange(S) if state in rooms[room_no[option]]]) 89 | 90 | state = [s0, pID, gID] #[{1..25} {1..5} {1..4}] 91 | for u in xrange(S**2): 92 | 93 | if (stepNo > 30): 94 | stepNo = 0 95 | break 96 | 97 | r = 0 98 | [V[option][s0, pID, gID], a0] = [np.max(Q[s0, option, :, pID, gID]), np.argmax(Q[s0, option, :, pID, gID])] 99 | if (np.random.rand(1) < epsilon): 100 | a0 = np.random.choice(A) 101 | 102 | if a0 == 4: 103 | if pID != 4: 104 | if s0 == pickUps[pID]: 105 | r = 1 106 | pID = 4 107 | stepNo = 0 108 | else: 109 | r = -1 110 | else: 111 | r = -1 112 | 113 | 114 | if a0 == 5: 115 | if (s0 == pickUps[gID]) and pID==4: 116 | stepsToGoal[t] = stepNo 117 | if stepNo > 0: 118 | r = 10 119 | else: 120 | r = 10 121 | if maxR < r: 122 | maxR = r 123 | 124 | stepNo = 0 125 | else: 126 | r = -1 127 | 128 | 129 | if a0 == 0: 130 | s1 = s0 - Sx 131 | if s1 not in rooms[room_no[option]]: 132 | s1 = s1 + Sx 133 | r = -1 134 | 135 | 136 | if a0 == 1: 137 | s1 = s0 + Sx 138 | if s1 not in rooms[room_no[option]]: 139 | s1 = s1 - Sx 140 | r = -1 141 | 142 | 143 | if a0 == 2: 144 | s1 = s0 - 1 145 | if s1 not in rooms[room_no[option]]: 146 | s1 = s1 + 1 147 | r = -1 148 | 149 | 150 | if a0 == 3: 151 | s1 = s0 + 1 152 | if s1 not in rooms[room_no[option]]: 153 | s1 = s1 - 1 154 | r = -1 155 | 156 | if a0 == 4: 157 | s1 = s0 158 | 159 | 160 | if a0 == 5: 161 | s1 = s0 162 | 163 | 164 | # learning step 165 | if t > 1000: 166 | R += r 167 | 168 | 169 | # print r 170 | FullR = R + r 171 | reward_course[t] = r 172 | reward_mean[t] = R/float(t+1) 173 | 174 | 175 | V[option][s1, pID, gID] = np.max(Q[s1, option, :, pID, gID]) 176 | 177 | 178 | if maxV < V[option][s1, pID, gID]: 179 | maxV = V[option][s1, pID, gID] 180 | 181 | 182 | time_course[t, 0] = V[option][s1, pID, gID] 183 | time_course[t, 1] = eta*(r+gamma*V[option][s1, pID, gID]) 184 | time_course[t, 2] = (1-eta)*Q[s0, option, a0, pID, gID] 185 | Q[s0, 
option, a0, pID, gID] = (1-eta)*Q[s0, option, a0, pID, gID] + \ 186 | eta*(r + gamma*V[option][s1, pID, gID]) 187 | if pID == 4: 188 | stepNo += 1 189 | 190 | if (s0 == endGoal) and (a0 == 5) and (pID == 4): 191 | stepNo = 0 192 | option = 0 193 | switched = False 194 | # print "Final Goal achieved!!" 195 | break 196 | 197 | if (s0 == pickUps[gID]) and (a0 == 5) and (pID == 4) and s0 != endGoal: 198 | stepNo = 0 199 | # print "---> ", 200 | # print option 201 | option = 2 202 | switched = True 203 | options_used.append(option) 204 | break 205 | 206 | s0 = s1 207 | 208 | 209 | avg[t] = np.mean(np.mean(np.mean(np.mean(Q[:, option, :])))) 210 | 211 | 212 | meanR = R/float(T-1000) 213 | fullMR = FullR/float(T) 214 | print meanR 215 | print fullMR 216 | print maxV 217 | policy = [np.max(Q[i, option, :, pID, gID]) for i in xrange(S)] 218 | policy_actions_0 = [np.argmax(Q[i, 0, :, pID, gID]) for i in xrange(S)] 219 | policy_actions_2 = [np.argmax(Q[i, 2, :, pID, gID]) for i in xrange(S)] 220 | print len(policy) 221 | policy_actions_0 = np.reshape(policy_actions_0, [8, 13]) 222 | policy_actions_2 = np.reshape(policy_actions_2, [8, 13]) 223 | policy = np.reshape(policy, [8, 13]) 224 | 225 | for j in xrange(8): 226 | for i in xrange(13): 227 | print "{0} ".format(policy_actions_0[j, i]), 228 | 229 | print " " 230 | print "-------------------------------------" 231 | for j in xrange(8): 232 | for i in xrange(13): 233 | print "{0} ".format(policy_actions_2[j, i]), 234 | 235 | print " " 236 | # for j in xrange(8): 237 | # for i in xrange(13): 238 | # print "{0} ".format(int(policy[j, i])), 239 | 240 | # print " " -------------------------------------------------------------------------------- /hierarchicalrl/optionsUsing-nopid.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import math 4 | 5 | 6 | Sx = 13 7 | Sy = 8 8 | S = Sx*Sy 9 | # P = 5 # there is a state for being in the taxi 10 | G = 4 11 | R = 0 12 | O = 8 13 | maxR = -999999 14 | hallways = [80, 45, 100, 136] 15 | rooms = [[ 16 | 14, 15, 16, 17, 18, 17 | 27, 28, 29, 30, 31, 18 | 40, 41, 42, 43, 44, 45, 19 | 53, 54, 55, 56, 57, 20 | 66, 67, 68, 69, 70, 21 | 80 22 | ], 23 | [ 24 | 20, 21, 22, 23, 24, 25 | 33, 34, 35, 36, 37, 26 | 45, 46, 47, 48, 49, 50, 27 | 59, 60, 61, 62, 63, 28 | 72, 73, 74, 75, 76, 29 | 85, 86, 87, 88, 89, 30 | 100 31 | ], 32 | [ 33 | 111, 112, 113, 114, 115, 34 | 124, 125, 126, 127, 128, 35 | 137, 138, 139, 140, 141, 36 | 150, 151, 152, 153, 154, 37 | 100, 136 38 | ], 39 | [ 40 | 92, 93, 94, 95, 96, 41 | 105, 106, 107, 108, 109, 42 | 118, 119, 120, 121, 122, 43 | 131, 132, 133, 134, 135, 44 | 144, 145, 146, 147, 148, 45 | 80, 136 46 | ]] 47 | 48 | walls = [[ 49 | 0, 1, 2, 3, 4, 5, 6, 50 | 13, 26, 39, 52, 65, 51 | 78, 79, 81, 82, 83, 52 | 84, 71, 58, 32, 19, 53 | 93 54 | ], 55 | [ 56 | 7, 8, 9, 10, 11, 12, 57 | 25, 38, 51, 64, 77, 90, 103, 58 | 98, 99, 101, 102, 103, 59 | 97, 84, 71, 58, 32, 19, 6, 60 | 113 61 | ], 62 | [ 63 | 103, 116, 129, 142, 155, 168, 97, 64 | 110, 123, 149, 65 | 162, 98, 99, 101, 102, 66 | 163, 164, 165, 166, 167 67 | ], 68 | [ 69 | 78, 70 | 91, 71 | 104, 72 | 117, 73 | 130, 74 | 143, 75 | 156, 76 | 84, 77 | 97, 78 | 110, 79 | 123, 80 | 149, 157, 158, 159, 160, 161, 81 | 162, 79, 81, 82, 83 82 | ]] 83 | visited_states = ['r' for _ in xrange(S)] 84 | # init_states = [0, 0, 1, 1, 2, 2, 3, 3] 85 | pickUps = [80, 45, 100, 136] 86 | A = 4#6 87 | T = 30000 88 | stepNo = 0 89 | avg = np.zeros([T,1]) 90 | time_course = 
np.zeros([T, 3]) 91 | options_used = [] 92 | option = 3 93 | options_used.append(option) 94 | option_goal = [45, 80, 100, 136] 95 | endGoal = option_goal[option]#[2] 96 | room_no = [0, 0, 1, 1, 2, 2, 3, 3] 97 | Q = 0.1*np.random.rand(S, O, A, G)#0.1*np.random.rand(S, O, A, P, G) 98 | goalReached = False 99 | for i in xrange(S): 100 | for o in xrange(O): 101 | for a in xrange(A): 102 | # for p in xrange(P): 103 | for g in xrange(G): 104 | if i not in rooms[room_no[o]]: 105 | # Q[i, o, a, p, g] = 0 106 | Q[i, o, a, g] = 0 107 | 108 | V = [np.max(Q[:, o, :], axis=2) for o in xrange(O)] 109 | eta = 0.1 110 | gamma = 0.9 111 | epsilon = 0.1 112 | reward_course = np.zeros([T, 1]) 113 | reward_mean = np.zeros([T, 1]) 114 | 115 | stepsToGoal = np.zeros([T, 1]) 116 | maxV = -9999 117 | switched = False 118 | u=0 119 | for t in xrange(T): 120 | Goal = option_goal[option] 121 | gID = [i for i, x in enumerate(pickUps) if x == Goal][0] 122 | s0 = np.random.choice([state for state in xrange(S) if state in rooms[room_no[option]] and state != option_goal[option]]) 123 | 124 | state = [s0, gID] #[s0, pID, gID] #[{1..25} {1..5} {1..4}] 125 | for u in xrange(S**2): 126 | # print s0 127 | visited_states[s0] = 'g' 128 | 129 | r = 0 130 | [V[option][s0, gID], a0] = [np.max(Q[s0, option, :, gID]), np.argmax(Q[s0, option, :, gID])] 131 | if (np.random.rand(1) < epsilon): 132 | a0 = np.random.choice(A) 133 | 134 | if (s0 == pickUps[gID]): 135 | stepsToGoal[t] = stepNo 136 | if s0 == endGoal: 137 | r = 1 138 | goalReached = True 139 | else: 140 | r = 1 141 | goalReached = True 142 | 143 | if maxR < r: 144 | maxR = r 145 | 146 | stepNo = 0 147 | 148 | if a0 == 0 and not goalReached: 149 | s1 = s0 - Sx 150 | if s1 not in rooms[room_no[option]]: 151 | s1 = s1 + Sx 152 | # r = -1 153 | 154 | 155 | if a0 == 1 and not goalReached: 156 | s1 = s0 + Sx 157 | if s1 not in rooms[room_no[option]]: 158 | s1 = s1 - Sx 159 | # r = -1 160 | 161 | 162 | if a0 == 2 and not goalReached: 163 | s1 = s0 - 1 164 | if s1 not in rooms[room_no[option]]: 165 | s1 = s1 + 1 166 | # r = -1 167 | 168 | 169 | if a0 == 3 and not goalReached: 170 | s1 = s0 + 1 171 | if s1 not in rooms[room_no[option]]: 172 | s1 = s1 - 1 173 | # r = -1 174 | 175 | # learning step 176 | if t > 100: 177 | R += r 178 | 179 | # print r 180 | FullR = R + r 181 | reward_course[t] = r 182 | reward_mean[t] = R/float(t+1) 183 | 184 | V[option][s1, gID] = np.max(Q[s1, option, :, gID]) 185 | 186 | if maxV < V[option][s1, gID]: 187 | maxV = V[option][s1, gID] 188 | 189 | time_course[t, 0] = V[option][s1, gID] 190 | time_course[t, 1] = eta*(r+gamma*V[option][s1, gID]) 191 | time_course[t, 2] = (1-eta)*Q[s0, option, a0, gID] 192 | Q[s0, option, a0, gID] = (1-eta)*Q[s0, option, a0, gID] + \ 193 | eta*(r + gamma*V[option][s1, gID]) 194 | 195 | stepNo += 1 196 | if (s0 == endGoal): 197 | # print "tuk sam" 198 | stepNo = 0 199 | # option = 2 200 | switched = False 201 | goalReached = False 202 | break 203 | 204 | # if (s0 == pickUps[gID]) and s0 != endGoal: 205 | # stepNo = 0 206 | # option = 0#2 207 | # switched = True 208 | # goalReached = False 209 | # options_used.append(option) 210 | # break 211 | 212 | s0 = s1 213 | 214 | avg[t] = np.mean(np.mean(np.mean(np.mean(Q[:, option, :])))) 215 | 216 | 217 | meanR = R/float(T-1000) 218 | fullMR = FullR/float(T) 219 | print meanR 220 | print fullMR 221 | print maxV 222 | # policy = [np.max(Q[i, option, :, pID, gID]) for i in xrange(S)] 223 | # policy_actions_0 = [np.argmax(Q[i, 0, :, pID, gID]) for i in xrange(S)] 224 | # 
policy_actions_2 = [np.argmax(Q[i, 2, :, pID, gID]) for i in xrange(S)] 225 | policy = [np.max(Q[i, option, :, gID]) for i in xrange(S)] 226 | value_0 = [V[0][state, 1] for state in xrange(S)] 227 | value_2 = [V[1][state, 3] for state in xrange(S)] 228 | visited_states = np.reshape(visited_states, [8, 13]) 229 | policy_actions_0 = [np.argmax(Q[i, 2, :, 0]) for i in xrange(S)] 230 | policy_actions_2 = [np.argmax(Q[i, option, :, gID]) for i in xrange(S)] 231 | print len(policy) 232 | policy_actions_0 = np.reshape(policy_actions_0, [8, 13]) 233 | policy_actions_2 = np.reshape(policy_actions_2, [8, 13]) 234 | value_0 = np.reshape(value_0, [8, 13]) 235 | value_2 = np.reshape(value_2, [8, 13]) 236 | policy = np.reshape(policy, [8, 13]) 237 | 238 | for j in xrange(8): 239 | for i in xrange(13): 240 | print "%d " % int(policy_actions_0[j, i]), 241 | 242 | print " " 243 | print "-------------------------------------" 244 | for j in xrange(8): 245 | for i in xrange(13): 246 | print "%d " % int(policy_actions_2[j, i]), 247 | 248 | print " " 249 | print "-------------------------------------" 250 | for j in xrange(8): 251 | for i in xrange(13): 252 | print "%s " % visited_states[j, i], 253 | 254 | print " " 255 | # for j in xrange(8): 256 | # for i in xrange(13): 257 | # print "{0} ".format(int(policy[j, i])), 258 | 259 | # print " " -------------------------------------------------------------------------------- /irl/mdp/objectworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements the objectworld MDP described in Levine et al. 2011. 3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | import math 9 | from itertools import product 10 | 11 | import numpy as np 12 | import numpy.random as rn 13 | 14 | from irl.mdp.gridworld import Gridworld 15 | 16 | class OWObject(object): 17 | """ 18 | Object in objectworld. 19 | """ 20 | 21 | def __init__(self, inner_colour, outer_colour): 22 | """ 23 | inner_colour: Inner colour of object. int. 24 | outer_colour: Outer colour of object. int. 25 | -> OWObject 26 | """ 27 | 28 | self.inner_colour = inner_colour 29 | self.outer_colour = outer_colour 30 | 31 | def __str__(self): 32 | """ 33 | A string representation of this object. 34 | 35 | -> __str__ 36 | """ 37 | 38 | return "".format(self.inner_colour, 39 | self.outer_colour) 40 | 41 | class Objectworld(Gridworld): 42 | """ 43 | Objectworld MDP. 44 | """ 45 | 46 | def __init__(self, grid_size, n_objects, n_colours, wind, discount): 47 | """ 48 | grid_size: Grid size. int. 49 | n_objects: Number of objects in the world. int. 50 | n_colours: Number of colours to colour objects with. int. 51 | wind: Chance of moving randomly. float. 52 | discount: MDP discount. float. 53 | -> Objectworld 54 | """ 55 | 56 | super(Objectworld, self).__init__(grid_size, wind, discount) 57 | 58 | self.actions = ((1, 0), (0, 1), (-1, 0), (0, -1), (0, 0)) 59 | self.n_actions = len(self.actions) 60 | self.n_objects = n_objects 61 | self.n_colours = n_colours 62 | 63 | # Generate objects. 64 | self.objects = {} 65 | for _ in range(self.n_objects): 66 | obj = OWObject(rn.randint(self.n_colours), 67 | rn.randint(self.n_colours)) 68 | 69 | while True: 70 | x = rn.randint(self.grid_size) 71 | y = rn.randint(self.grid_size) 72 | 73 | if (x, y) not in self.objects: 74 | break 75 | 76 | self.objects[x, y] = obj 77 | 78 | # Preconstruct the transition probability array. 
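# The array built below has shape (n_states, n_actions, n_states); entry
# [i, j, k] is p(s_k | s_i, a_j), computed with the _transition_probability
# method inherited from Gridworld, here over the extended five-action set
# that includes the "stay" action (0, 0).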
79 | self.transition_probability = np.array( 80 | [[[self._transition_probability(i, j, k) 81 | for k in range(self.n_states)] 82 | for j in range(self.n_actions)] 83 | for i in range(self.n_states)]) 84 | 85 | def feature_vector(self, i, discrete=True): 86 | """ 87 | Get the feature vector associated with a state integer. 88 | 89 | i: State int. 90 | discrete: Whether the feature vectors should be discrete (default True). 91 | bool. 92 | -> Feature vector. 93 | """ 94 | 95 | sx, sy = self.int_to_point(i) 96 | 97 | nearest_inner = {} # colour: distance 98 | nearest_outer = {} # colour: distance 99 | 100 | for y in range(self.grid_size): 101 | for x in range(self.grid_size): 102 | if (x, y) in self.objects: 103 | dist = math.hypot((x - sx), (y - sy)) 104 | obj = self.objects[x, y] 105 | if obj.inner_colour in nearest_inner: 106 | if dist < nearest_inner[obj.inner_colour]: 107 | nearest_inner[obj.inner_colour] = dist 108 | else: 109 | nearest_inner[obj.inner_colour] = dist 110 | if obj.outer_colour in nearest_outer: 111 | if dist < nearest_outer[obj.outer_colour]: 112 | nearest_outer[obj.outer_colour] = dist 113 | else: 114 | nearest_outer[obj.outer_colour] = dist 115 | 116 | # Need to ensure that all colours are represented. 117 | for c in range(self.n_colours): 118 | if c not in nearest_inner: 119 | nearest_inner[c] = 0 120 | if c not in nearest_outer: 121 | nearest_outer[c] = 0 122 | 123 | if discrete: 124 | state = np.zeros((2*self.n_colours*self.grid_size,)) 125 | i = 0 126 | for c in range(self.n_colours): 127 | for d in range(1, self.grid_size+1): 128 | if nearest_inner[c] < d: 129 | state[i] = 1 130 | i += 1 131 | if nearest_outer[c] < d: 132 | state[i] = 1 133 | i += 1 134 | assert i == 2*self.n_colours*self.grid_size 135 | assert (state >= 0).all() 136 | else: 137 | # Continuous features. 138 | state = np.zeros((2*self.n_colours)) 139 | i = 0 140 | for c in range(self.n_colours): 141 | state[i] = nearest_inner[c] 142 | i += 1 143 | state[i] = nearest_outer[c] 144 | i += 1 145 | 146 | return state 147 | 148 | def feature_matrix(self, discrete=True): 149 | """ 150 | Get the feature matrix for this objectworld. 151 | 152 | discrete: Whether the feature vectors should be discrete (default True). 153 | bool. 154 | -> NumPy array with shape (n_states, n_states). 155 | """ 156 | 157 | return np.array([self.feature_vector(i, discrete) 158 | for i in range(self.n_states)]) 159 | 160 | def reward(self, state_int): 161 | """ 162 | Get the reward for a state int. 163 | 164 | state_int: State int. 165 | -> reward float 166 | """ 167 | 168 | x, y = self.int_to_point(state_int) 169 | 170 | near_c0 = False 171 | near_c1 = False 172 | for (dx, dy) in product(range(-3, 4), range(-3, 4)): 173 | if 0 <= x + dx < self.grid_size and 0 <= y + dy < self.grid_size: 174 | if (abs(dx) + abs(dy) <= 3 and 175 | (x+dx, y+dy) in self.objects and 176 | self.objects[x+dx, y+dy].outer_colour == 0): 177 | near_c0 = True 178 | if (abs(dx) + abs(dy) <= 2 and 179 | (x+dx, y+dy) in self.objects and 180 | self.objects[x+dx, y+dy].outer_colour == 1): 181 | near_c1 = True 182 | 183 | if near_c0 and near_c1: 184 | return 1 185 | if near_c0: 186 | return -1 187 | return 0 188 | 189 | def generate_trajectories(self, n_trajectories, trajectory_length, policy): 190 | """ 191 | Generate n_trajectories trajectories with length trajectory_length. 192 | 193 | n_trajectories: Number of trajectories. int. 194 | trajectory_length: Length of an episode. int. 195 | policy: Map from state integers to action integers. 
196 | -> [[(state int, action int, reward float)]] 197 | """ 198 | 199 | return super(Objectworld, self).generate_trajectories(n_trajectories, trajectory_length, 200 | policy, 201 | True) 202 | 203 | def optimal_policy(self, state_int): 204 | raise NotImplementedError( 205 | "Optimal policy is not implemented for Objectworld.") 206 | def optimal_policy_deterministic(self, state_int): 207 | raise NotImplementedError( 208 | "Optimal policy is not implemented for Objectworld.") 209 | -------------------------------------------------------------------------------- /options-using-q/options-temp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | 5 | Sx = 5 6 | Sy = 5 7 | S = Sx*Sy 8 | P = 5 # there is a state for being in the taxi 9 | G = 4 10 | R = 0 11 | maxR = -999999 12 | maxRo = -999999 13 | pickUps = [0, Sx-1, S-Sx, S-2] 14 | O = 7 15 | A = 6 16 | T = 100000 17 | k_step = 0 18 | stepNo = 0 19 | stepNo_o = 0 20 | avg_reward = np.zeros([Sx, Sy, P, G]) 21 | reward = np.zeros([P, G]) 22 | avg = np.zeros([T,1]) 23 | time_course = np.zeros([T, 3]) 24 | Q = 0.1*np.random.rand(S, O, P, G) 25 | opt_one_policy = 0.1*np.random.rand(S, A, P, G) 26 | V = np.max(Q, axis=1) 27 | V_o = np.max(opt_one_policy, axis=1) 28 | eta = 0.1 29 | gamma = 0.9 30 | epsilon = 0.3 31 | reward_course = np.zeros([T, 1]) 32 | reward_mean = np.zeros([T, 1]) 33 | shouldBreak = False 34 | 35 | stop = True 36 | stepsToGoal = np.zeros([T, 1]) 37 | maxV = -9999 38 | maxV_o = -9999 39 | for t in xrange(T): 40 | plocation = 20 41 | pID = [i for i, x in enumerate(pickUps) if x == plocation][0] 42 | Goal = 24 43 | p0 = plocation + 1 44 | gID = [i for i, x in enumerate(pickUps) if x == Goal-1][0] 45 | s0 = np.random.choice(S) 46 | state = [s0, pID, gID] #[{1..25} {1..5} {1..4}] 47 | for u in xrange(S**4): 48 | if (stepNo > 30): 49 | stepNo = 0 50 | stepNo_o = 0 51 | break 52 | 53 | r = 0 54 | if stop: #V[s1, pID, gID] = np.max(Q[s1, :, pID, gID]) 55 | # if o0 == 6: # if previous time had the option and now time's gone 56 | # V[s1, pID, gID] = np.max(Q[s1, :, pID, gID]) 57 | [V[s0, pID, gID], o0] = [np.max(Q[s0, :, pID, gID]), np.argmax(Q[s0, :, pID, gID])] 58 | 59 | if (np.random.rand(1) < epsilon): 60 | o0 = np.random.choice(O) 61 | 62 | if o0 == 6: 63 | r_o = 0 64 | k_step = 1 65 | stop = False 66 | [V_o[s0, pID, gID], a0] = [np.max(opt_one_policy[s0, :, pID, gID]), 67 | np.argmax(opt_one_policy[s0, :, pID, gID])] 68 | else: 69 | k_step += 1 70 | [V_o[s0, pID, gID], a0] = [ 71 | np.max(opt_one_policy[s0, :, pID, gID]), np.argmax(opt_one_policy[s0, :, pID, gID])] 72 | 73 | if o0 == 4: 74 | if pID != 4: 75 | if s0 == pickUps[pID]: 76 | r = 1 77 | pID = 4 78 | stepNo = 0 79 | else: 80 | r = -1 81 | else: 82 | r = -1 83 | 84 | if o0 == 5: 85 | if (s0 == pickUps[gID]) and pID==4: 86 | stepsToGoal[t] = stepNo 87 | r = 10/float(stepNo) 88 | if maxR < r: 89 | maxR = r 90 | 91 | stepNo = 0 92 | else: 93 | r = -1 94 | 95 | if o0 == 0: 96 | s1 = s0 - Sx 97 | if s1 < 0: 98 | s1 = s1 + Sx 99 | r = -1 100 | 101 | if o0 == 1: 102 | s1 = s0 + Sx 103 | if s1 > 24: 104 | s1 = s1 - Sx 105 | r = -1 106 | 107 | if o0 == 2: 108 | s1=s0-1 109 | if s1==-1 or s1==4 or s1==9 or s1==14 or s1==19: 110 | s1=s1+1 111 | r = -1 112 | 113 | if s1==1 or s1==6 or s1==20 or s1==15 or s1==17 or s1==22: 114 | s1 = s1+1 115 | r = -1 116 | 117 | if o0 == 3: 118 | s1 = s0 + 1 119 | if s1 == 5 or s1 == 10 or s1 == 15 or s1 == 20 or s1==25: 120 | s1 = s1 - 1 121 | r = -1 122 | 123 | if s1 == 
2 or s1 == 7 or s1 == 21 or s1 == 16 or s1 == 18 or s1 == 23: 124 | s1 = s1 - 1 125 | r = -1 126 | 127 | if o0 == 6: 128 | if pID != 4: 129 | s1 = s0 130 | stop = True 131 | else: 132 | stepNo_o += 1 133 | if a0 == 4: 134 | r_o = -1 135 | 136 | if a0 == 5: 137 | if (s0 == 16): # 19 is a random state I would like my option to get to 138 | # stepsToGoal[t] = stepNo 139 | r_o = 1 140 | if maxRo < r_o: 141 | maxRo = r_o 142 | 143 | stop = True 144 | stepNo_o = 0 145 | # elif (s0 == pickUps[gID]): 146 | # # stepsToGoal[t] = stepNo 147 | # r_o = 10/float(stepNo) 148 | # if maxR_o < r_o: 149 | # maxR_o = r_o 150 | 151 | # stop = True 152 | # stepNo = 0 153 | else: 154 | r_o = -1 155 | 156 | if a0 == 0: 157 | s1 = s0 - Sx 158 | if s1 < 0: 159 | s1 = s1 + Sx 160 | r_o = -1 161 | 162 | if a0 == 1: 163 | s1 = s0 + Sx 164 | if s1 > 24: 165 | s1 = s1 - Sx 166 | r_o = -1 167 | 168 | if a0 == 2: 169 | s1=s0-1 170 | if s1==-1 or s1==4 or s1==9 or s1==14 or s1==19: 171 | s1=s1+1 172 | r_o = -1 173 | 174 | if s1==1 or s1==6 or s1==20 or s1==15 or s1==17 or s1==22: 175 | s1 = s1+1 176 | r_o = -1 177 | 178 | if a0 == 3: 179 | s1 = s0 + 1 180 | if s1 == 5 or s1 == 10 or s1 == 15 or s1 == 20 or s1==25: 181 | s1 = s1 - 1 182 | r_o = -1 183 | 184 | if s1 == 2 or s1 == 7 or s1 == 21 or s1 == 16 or s1 == 18 or s1 == 23: 185 | s1 = s1 - 1 186 | r_o = -1 187 | 188 | if a0 == 4: 189 | s1 = s0 190 | 191 | if a0 == 5: 192 | s1 = s0 193 | 194 | if (s0 == pickUps[gID]) and (a0 == 5): 195 | shouldBreak = True 196 | 197 | if o0 == 4: 198 | s1 = s0 199 | 200 | if o0 == 5: 201 | s1 = s0 202 | 203 | if stop: 204 | # print "tuk sam" 205 | if o0 == 6: 206 | r = r_o 207 | # print r 208 | # learning step 209 | if t > 1000: 210 | R += r 211 | 212 | # print r 213 | FullR = R + r 214 | reward_course[t] = r 215 | reward_mean[t] = R/float(t+1) 216 | 217 | V[s1, pID, gID] = np.max(Q[s1, :, pID, gID]) 218 | 219 | if maxV < V[s1, pID, gID]: 220 | maxV = V[s1, pID, gID] 221 | 222 | time_course[t, 0] = V[s1, pID, gID] 223 | time_course[t, 1] = eta*(r+gamma*V[s1, pID, gID]) 224 | time_course[t, 2] = (1-eta)*Q[s0, o0, pID, gID] 225 | 226 | Q[s0, o0, pID, gID] = (1-eta)*Q[s0, o0, pID, gID] + eta*(r + gamma*V[s1, pID, gID]) 227 | else: 228 | V_o[s1, pID, gID] = np.max(opt_one_policy[s1, :, pID, gID]) 229 | opt_one_policy[s0, a0, pID, gID] = (1-eta)*opt_one_policy[s0, a0, pID, gID] +\ 230 | eta*(r + gamma*V_o[s1, pID, gID]) 231 | if maxV_o < V_o[s1, pID, gID]: 232 | maxV_o = V_o[s1, pID, gID] 233 | 234 | if pID == 4: 235 | stepNo += 1 236 | 237 | if (s0 == pickUps[gID]) and (o0 == 5): 238 | stepNo = 0 239 | break 240 | 241 | 242 | s0 = s1 243 | 244 | 245 | avg[t] = np.mean(np.mean(np.mean(np.mean(Q)))) 246 | 247 | 248 | meanR = R/float(T-1000) 249 | fullMR = FullR/float(T) 250 | print meanR 251 | print fullMR 252 | print maxV 253 | print "Policy" 254 | policy = [np.max(Q[i, :, pID, gID]) for i in xrange(S)] 255 | policy_actions = [np.argmax(Q[i, :, pID, gID]) for i in xrange(S)] 256 | policy_inoption = [np.argmax(opt_one_policy[i, :, pID, gID]) for i in xrange(S)] 257 | print len(policy) 258 | policy_actions = np.reshape(policy_actions, [5, 5]) 259 | policy_inoption = np.reshape(policy_inoption, [5, 5]) 260 | 261 | 262 | for i in xrange(5): 263 | for j in xrange(5): 264 | print "{0} ".format(policy_actions[i, j]), 265 | 266 | 267 | print " " 268 | 269 | print "In option actions" 270 | for i in xrange(5): 271 | for j in xrange(5): 272 | print "{0} ".format(policy_inoption[i, j]), 273 | 274 | print " " 275 | 
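# The loop above applies the same tabular update at both levels: the
# option-level table Q and the intra-option table opt_one_policy are each
# moved towards r + gamma * max over the successor state's entries. Below is
# a minimal sketch of that update as a hypothetical standalone helper; it is
# not called by this script, and it assumes numpy 1-D rows like the slices
# used above (numpy is already imported at the top of the file).
def q_update(q_row_s, q_row_s_next, a, r, eta=0.1, gamma=0.9):
    """One tabular Q-learning step (illustrative only).

    q_row_s: 1-D array of values for the current state, updated in place.
    q_row_s_next: 1-D array of values for the successor state.
    a: index of the action (or option) that was taken.
    r: observed reward.
    """
    v_next = q_row_s_next.max()  # greedy bootstrap value of the next state
    q_row_s[a] = (1 - eta) * q_row_s[a] + eta * (r + gamma * v_next)
    return q_row_s
# Illustrative call matching the shapes used in this script:
# q_update(Q[s0, :, pID, gID], Q[s1, :, pID, gID], o0, r, eta, gamma)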
-------------------------------------------------------------------------------- /irl/linear_irl.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements LP IRL from Ng & Russell, 2000. 3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | import random 9 | 10 | import numpy as np 11 | from cvxopt import matrix, solvers 12 | 13 | def irl(n_states, n_actions, transition_probability, policy, discount, Rmax, 14 | l1): 15 | """ 16 | Find a reward function with inverse RL as described in Ng & Russell, 2000. 17 | 18 | n_states: Number of states. int. 19 | n_actions: Number of actions. int. 20 | transition_probability: NumPy array mapping (state_i, action, state_k) to 21 | the probability of transitioning from state_i to state_k under action. 22 | Shape (N, A, N). 23 | policy: Vector mapping state ints to action ints. Shape (N,). 24 | discount: Discount factor. float. 25 | Rmax: Maximum reward. float. 26 | l1: l1 regularisation. float. 27 | -> Reward vector 28 | """ 29 | 30 | A = set(range(n_actions)) # Set of actions to help manage reordering 31 | # actions. 32 | # The transition policy convention is different here to the rest of the code 33 | # for legacy reasons; here, we reorder axes to fix this. We expect the 34 | # new probabilities to be of the shape (A, N, N). 35 | transition_probability = np.transpose(transition_probability, (1, 0, 2)) 36 | 37 | def T(a, s): 38 | """ 39 | Shorthand for a dot product used a lot in the LP formulation. 40 | """ 41 | 42 | return np.dot(transition_probability[policy[s], s] - 43 | transition_probability[a, s], 44 | np.linalg.inv(np.eye(n_states) - 45 | discount*transition_probability[policy[s]])) 46 | 47 | # This entire function just computes the block matrices used for the LP 48 | # formulation of IRL. 49 | 50 | # Minimise c . x. 51 | c = -np.hstack([np.zeros(n_states), np.ones(n_states), 52 | -l1*np.ones(n_states)]) 53 | zero_stack1 = np.zeros((n_states*(n_actions-1), n_states)) 54 | T_stack = np.vstack([ 55 | -T(a, s) 56 | for s in range(n_states) 57 | for a in A - {policy[s]} 58 | ]) 59 | I_stack1 = np.vstack([ 60 | np.eye(1, n_states, s) 61 | for s in range(n_states) 62 | for a in A - {policy[s]} 63 | ]) 64 | I_stack2 = np.eye(n_states) 65 | zero_stack2 = np.zeros((n_states, n_states)) 66 | 67 | D_left = np.vstack([T_stack, T_stack, -I_stack2, I_stack2]) 68 | D_middle = np.vstack([I_stack1, zero_stack1, zero_stack2, zero_stack2]) 69 | D_right = np.vstack([zero_stack1, zero_stack1, -I_stack2, -I_stack2]) 70 | 71 | D = np.hstack([D_left, D_middle, D_right]) 72 | b = np.zeros((n_states*(n_actions-1)*2 + 2*n_states, 1)) 73 | bounds = np.array([(None, None)]*2*n_states + [(-Rmax, Rmax)]*n_states) 74 | 75 | # We still need to bound R. 
To do this, we just add 76 | # -I R <= Rmax 1 77 | # I R <= Rmax 1 78 | # So to D we need to add -I and I, and to b we need to add Rmax 1 and Rmax 1 79 | D_bounds = np.hstack([ 80 | np.vstack([ 81 | -np.eye(n_states), 82 | np.eye(n_states)]), 83 | np.vstack([ 84 | np.zeros((n_states, n_states)), 85 | np.zeros((n_states, n_states))]), 86 | np.vstack([ 87 | np.zeros((n_states, n_states)), 88 | np.zeros((n_states, n_states))])]) 89 | b_bounds = np.vstack([Rmax*np.ones((n_states, 1))]*2) 90 | D = np.vstack((D, D_bounds)) 91 | b = np.vstack((b, b_bounds)) 92 | A_ub = matrix(D) 93 | b = matrix(b) 94 | c = matrix(c) 95 | results = solvers.lp(c, A_ub, b) 96 | r = np.asarray(results["x"][:n_states], dtype=np.double) 97 | 98 | return r.reshape((n_states,)) 99 | 100 | def v_tensor(value, transition_probability, feature_dimension, n_states, 101 | n_actions, policy): 102 | """ 103 | Finds the v tensor used in large linear IRL. 104 | 105 | value: NumPy matrix for the value function. The (i, j)th component 106 | represents the value of the jth state under the ith basis function. 107 | transition_probability: NumPy array mapping (state_i, action, state_k) to 108 | the probability of transitioning from state_i to state_k under action. 109 | Shape (N, A, N). 110 | feature_dimension: Dimension of the feature matrix. int. 111 | n_states: Number of states sampled. int. 112 | n_actions: Number of actions. int. 113 | policy: NumPy array mapping state ints to action ints. 114 | -> v helper tensor. 115 | """ 116 | 117 | v = np.zeros((n_states, n_actions-1, feature_dimension)) 118 | for i in range(n_states): 119 | a1 = policy[i] 120 | exp_on_policy = np.dot(transition_probability[i, a1], value.T) 121 | seen_policy_action = False 122 | for j in range(n_actions): 123 | # Skip this if it's the on-policy action. 124 | if a1 == j: 125 | seen_policy_action = True 126 | continue 127 | 128 | exp_off_policy = np.dot(transition_probability[i, j], value.T) 129 | if seen_policy_action: 130 | v[i, j-1] = exp_on_policy - exp_off_policy 131 | else: 132 | v[i, j] = exp_on_policy - exp_off_policy 133 | return v 134 | 135 | def large_irl(value, transition_probability, feature_matrix, n_states, 136 | n_actions, policy): 137 | """ 138 | Find the reward in a large state space. 139 | 140 | value: NumPy matrix for the value function. The (i, j)th component 141 | represents the value of the jth state under the ith basis function. 142 | transition_probability: NumPy array mapping (state_i, action, state_k) to 143 | the probability of transitioning from state_i to state_k under action. 144 | Shape (N, A, N). 145 | feature_matrix: Matrix with the nth row representing the nth state. NumPy 146 | array with shape (N, D) where N is the number of states and D is the 147 | dimensionality of the state. 148 | n_states: Number of states sampled. int. 149 | n_actions: Number of actions. int. 150 | policy: NumPy array mapping state ints to action ints. 151 | -> Reward for each state in states. 152 | """ 153 | 154 | D = feature_matrix.shape[1] 155 | 156 | # First, calculate v, which is just a helper tensor. 157 | v = v_tensor(value, transition_probability, D, n_states, n_actions, policy) 158 | 159 | # Now we can calculate c, G, h, A, and b. 160 | 161 | # x = [z y_i^+ y_i^- a], which is a [N (K-1)*N (K-1)*N D] vector. 162 | x_size = n_states + (n_actions-1)*n_states*2 + D 163 | 164 | # c is a big stack of ones and zeros; there's N ones and the rest is zero. 
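# The decision vector is laid out as x = [z, y^+, y^-, alpha]: N entries of z,
# (A-1)*N entries each for y^+ and y^-, and D entries for alpha. cvxopt
# minimises c . x, so c is negated to maximise the sum of the z_i.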
165 | c = -np.hstack([np.ones(n_states), np.zeros(x_size - n_states)]) 166 | assert c.shape[0] == x_size 167 | 168 | # A is [0 I_j -I_j -v^T_{ij}] and j NOT EQUAL TO policy(i). 169 | # I believe this is accounted for by the structure of v. 170 | A = np.hstack([ 171 | np.zeros((n_states*(n_actions-1), n_states)), 172 | np.eye(n_states*(n_actions-1)), 173 | -np.eye(n_states*(n_actions-1)), 174 | np.vstack([v[i, j].T for i in range(n_states) 175 | for j in range(n_actions-1)])]) 176 | assert A.shape[1] == x_size 177 | 178 | # b is just zeros! 179 | b = np.zeros(A.shape[0]) 180 | 181 | # Break G up into the bottom row and other rows to construct it. 182 | bottom_row = np.vstack([ 183 | np.hstack([ 184 | np.ones((n_actions-1, 1)).dot(np.eye(1, n_states, l)), 185 | np.hstack([-np.eye(n_actions-1) if i == l 186 | else np.zeros((n_actions-1, n_actions-1)) 187 | for i in range(n_states)]), 188 | np.hstack([2*np.eye(n_actions-1) if i == l 189 | else np.zeros((n_actions-1, n_actions-1)) 190 | for i in range(n_states)]), 191 | np.zeros((n_actions-1, D))]) 192 | for l in range(n_states)]) 193 | assert bottom_row.shape[1] == x_size 194 | G = np.vstack([ 195 | np.hstack([ 196 | np.zeros((D, n_states)), 197 | np.zeros((D, n_states*(n_actions-1))), 198 | np.zeros((D, n_states*(n_actions-1))), 199 | np.eye(D)]), 200 | np.hstack([ 201 | np.zeros((D, n_states)), 202 | np.zeros((D, n_states*(n_actions-1))), 203 | np.zeros((D, n_states*(n_actions-1))), 204 | -np.eye(D)]), 205 | np.hstack([ 206 | np.zeros((n_states*(n_actions-1), n_states)), 207 | -np.eye(n_states*(n_actions-1)), 208 | np.zeros((n_states*(n_actions-1), n_states*(n_actions-1))), 209 | np.zeros((n_states*(n_actions-1), D))]), 210 | np.hstack([ 211 | np.zeros((n_states*(n_actions-1), n_states)), 212 | np.zeros((n_states*(n_actions-1), n_states*(n_actions-1))), 213 | -np.eye(n_states*(n_actions-1)), 214 | np.zeros((n_states*(n_actions-1), D))]), 215 | bottom_row]) 216 | assert G.shape[1] == x_size 217 | 218 | h = np.vstack([np.ones((D*2, 1)), 219 | np.zeros((n_states*(n_actions-1)*2+bottom_row.shape[0], 1))]) 220 | 221 | from cvxopt import matrix, solvers 222 | c = matrix(c) 223 | G = matrix(G) 224 | h = matrix(h) 225 | A = matrix(A) 226 | b = matrix(b) 227 | results = solvers.lp(c, G, h, A, b) 228 | alpha = np.asarray(results["x"][-D:], dtype=np.double) 229 | return np.dot(feature_matrix, -alpha) 230 | -------------------------------------------------------------------------------- /irl/maxent.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements maximum entropy inverse reinforcement learning (Ziebart et al., 2008) 3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | from itertools import product 9 | 10 | import numpy as np 11 | import numpy.random as rn 12 | 13 | from . import value_iteration 14 | 15 | def irl(feature_matrix, n_actions, discount, transition_probability, 16 | trajectories, epochs, learning_rate): 17 | """ 18 | Find the reward function for the given trajectories. 19 | 20 | feature_matrix: Matrix with the nth row representing the nth state. NumPy 21 | array with shape (N, D) where N is the number of states and D is the 22 | dimensionality of the state. 23 | n_actions: Number of actions A. int. 24 | discount: Discount factor of the MDP. float. 25 | transition_probability: NumPy array mapping (state_i, action, state_k) to 26 | the probability of transitioning from state_i to state_k under action. 27 | Shape (N, A, N). 
28 | trajectories: 3D array of state/action pairs. States are ints, actions 29 | are ints. NumPy array with shape (T, L, 2) where T is the number of 30 | trajectories and L is the trajectory length. 31 | epochs: Number of gradient descent steps. int. 32 | learning_rate: Gradient descent learning rate. float. 33 | -> Reward vector with shape (N,). 34 | """ 35 | 36 | n_states, d_states = feature_matrix.shape 37 | 38 | # Initialise weights. 39 | alpha = rn.uniform(size=(d_states,)) 40 | 41 | # Calculate the feature expectations \tilde{phi}. 42 | feature_expectations = find_feature_expectations(feature_matrix, 43 | trajectories) 44 | 45 | # Gradient descent on alpha. 46 | for i in range(epochs): 47 | # print("i: {}".format(i)) 48 | r = feature_matrix.dot(alpha) 49 | expected_svf = find_expected_svf(n_states, r, n_actions, discount, 50 | transition_probability, trajectories) 51 | grad = feature_expectations - feature_matrix.T.dot(expected_svf) 52 | 53 | alpha += learning_rate * grad 54 | 55 | return feature_matrix.dot(alpha).reshape((n_states,)) 56 | 57 | def find_svf(n_states, trajectories): 58 | """ 59 | Find the state visitation frequency from trajectories. 60 | 61 | n_states: Number of states. int. 62 | trajectories: 3D array of state/action pairs. States are ints, actions 63 | are ints. NumPy array with shape (T, L, 2) where T is the number of 64 | trajectories and L is the trajectory length. 65 | -> State visitation frequencies vector with shape (N,). 66 | """ 67 | 68 | svf = np.zeros(n_states) 69 | 70 | for trajectory in trajectories: 71 | for state, _, _ in trajectory: 72 | svf[state] += 1 73 | 74 | svf /= trajectories.shape[0] 75 | 76 | return svf 77 | 78 | def find_feature_expectations(feature_matrix, trajectories): 79 | """ 80 | Find the feature expectations for the given trajectories. This is the 81 | average path feature vector. 82 | 83 | feature_matrix: Matrix with the nth row representing the nth state. NumPy 84 | array with shape (N, D) where N is the number of states and D is the 85 | dimensionality of the state. 86 | trajectories: 3D array of state/action pairs. States are ints, actions 87 | are ints. NumPy array with shape (T, L, 2) where T is the number of 88 | trajectories and L is the trajectory length. 89 | -> Feature expectations vector with shape (D,). 90 | """ 91 | 92 | feature_expectations = np.zeros(feature_matrix.shape[1]) 93 | 94 | for trajectory in trajectories: 95 | for state, _, _ in trajectory: 96 | feature_expectations += feature_matrix[state] 97 | 98 | feature_expectations /= trajectories.shape[0] 99 | 100 | return feature_expectations 101 | 102 | def find_expected_svf(n_states, r, n_actions, discount, 103 | transition_probability, trajectories): 104 | """ 105 | Find the expected state visitation frequencies using algorithm 1 from 106 | Ziebart et al. 2008. 107 | 108 | n_states: Number of states N. int. 109 | alpha: Reward. NumPy array with shape (N,). 110 | n_actions: Number of actions A. int. 111 | discount: Discount factor of the MDP. float. 112 | transition_probability: NumPy array mapping (state_i, action, state_k) to 113 | the probability of transitioning from state_i to state_k under action. 114 | Shape (N, A, N). 115 | trajectories: 3D array of state/action pairs. States are ints, actions 116 | are ints. NumPy array with shape (T, L, 2) where T is the number of 117 | trajectories and L is the trajectory length. 118 | -> Expected state visitation frequencies vector with shape (N,). 
119 | """ 120 | 121 | n_trajectories = trajectories.shape[0] 122 | trajectory_length = trajectories.shape[1] 123 | 124 | # policy = find_policy(n_states, r, n_actions, discount, 125 | # transition_probability) 126 | policy = value_iteration.find_policy(n_states, n_actions, 127 | transition_probability, r, discount) 128 | 129 | start_state_count = np.zeros(n_states) 130 | for trajectory in trajectories: 131 | start_state_count[trajectory[0, 0]] += 1 132 | p_start_state = start_state_count/n_trajectories 133 | 134 | expected_svf = np.tile(p_start_state, (trajectory_length, 1)).T 135 | for t in range(1, trajectory_length): 136 | expected_svf[:, t] = 0 137 | for i, j, k in product(range(n_states), range(n_actions), range(n_states)): 138 | expected_svf[k, t] += (expected_svf[i, t-1] * 139 | policy[i, j] * # Stochastic policy 140 | transition_probability[i, j, k]) 141 | 142 | return expected_svf.sum(axis=1) 143 | 144 | def softmax(x1, x2): 145 | """ 146 | Soft-maximum calculation, from algorithm 9.2 in Ziebart's PhD thesis. 147 | 148 | x1: float. 149 | x2: float. 150 | -> softmax(x1, x2) 151 | """ 152 | 153 | max_x = max(x1, x2) 154 | min_x = min(x1, x2) 155 | return max_x + np.log(1 + np.exp(min_x - max_x)) 156 | 157 | def find_policy(n_states, r, n_actions, discount, 158 | transition_probability): 159 | """ 160 | Find a policy with linear value iteration. Based on the code accompanying 161 | the Levine et al. GPIRL paper and on Ziebart's PhD thesis (algorithm 9.1). 162 | 163 | n_states: Number of states N. int. 164 | r: Reward. NumPy array with shape (N,). 165 | n_actions: Number of actions A. int. 166 | discount: Discount factor of the MDP. float. 167 | transition_probability: NumPy array mapping (state_i, action, state_k) to 168 | the probability of transitioning from state_i to state_k under action. 169 | Shape (N, A, N). 170 | -> NumPy array of states and the probability of taking each action in that 171 | state, with shape (N, A). 172 | """ 173 | 174 | # V = value_iteration.value(n_states, transition_probability, r, discount) 175 | 176 | # NumPy's dot really dislikes using inf, so I'm making everything finite 177 | # using nan_to_num. 178 | V = np.nan_to_num(np.ones((n_states, 1)) * float("-inf")) 179 | 180 | diff = np.ones((n_states,)) 181 | while (diff > 1e-4).all(): # Iterate until convergence. 182 | new_V = r.copy() 183 | for j in range(n_actions): 184 | for i in range(n_states): 185 | new_V[i] = softmax(new_V[i], r[i] + discount* 186 | np.sum(transition_probability[i, j, k] * V[k] 187 | for k in range(n_states))) 188 | 189 | # # This seems to diverge, so we z-score it (engineering hack). 190 | new_V = (new_V - new_V.mean())/new_V.std() 191 | 192 | diff = abs(V - new_V) 193 | V = new_V 194 | 195 | # We really want Q, not V, so grab that using equation 9.2 from the thesis. 196 | Q = np.zeros((n_states, n_actions)) 197 | for i in range(n_states): 198 | for j in range(n_actions): 199 | p = np.array([transition_probability[i, j, k] 200 | for k in range(n_states)]) 201 | Q[i, j] = p.dot(r + discount*V) 202 | 203 | # Softmax by row to interpret these values as probabilities. 204 | Q -= Q.max(axis=1).reshape((n_states, 1)) # For numerical stability. 
205 | Q = np.exp(Q)/np.exp(Q).sum(axis=1).reshape((n_states, 1)) 206 | return Q 207 | 208 | def expected_value_difference(n_states, n_actions, transition_probability, 209 | reward, discount, p_start_state, optimal_value, true_reward): 210 | """ 211 | Calculate the expected value difference, which is a proxy to how good a 212 | recovered reward function is. 213 | 214 | n_states: Number of states. int. 215 | n_actions: Number of actions. int. 216 | transition_probability: NumPy array mapping (state_i, action, state_k) to 217 | the probability of transitioning from state_i to state_k under action. 218 | Shape (N, A, N). 219 | reward: Reward vector mapping state int to reward. Shape (N,). 220 | discount: Discount factor. float. 221 | p_start_state: Probability vector with the ith component as the probability 222 | that the ith state is the start state. Shape (N,). 223 | optimal_value: Value vector for the ground reward with optimal policy. 224 | The ith component is the value of the ith state. Shape (N,). 225 | true_reward: True reward vector. Shape (N,). 226 | -> Expected value difference. float. 227 | """ 228 | 229 | policy = value_iteration.find_policy(n_states, n_actions, 230 | transition_probability, reward, discount) 231 | value = value_iteration.value(policy.argmax(axis=1), n_states, 232 | transition_probability, true_reward, discount) 233 | 234 | evd = optimal_value.dot(p_start_state) - value.dot(p_start_state) 235 | return evd 236 | -------------------------------------------------------------------------------- /irl/mdp/gridworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements the gridworld MDP. 3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | import numpy as np 9 | import numpy.random as rn 10 | 11 | class Gridworld(object): 12 | """ 13 | Gridworld MDP. 14 | """ 15 | 16 | def __init__(self, grid_size, wind, discount): 17 | """ 18 | grid_size: Grid size. int. 19 | wind: Chance of moving randomly. float. 20 | discount: MDP discount. float. 21 | -> Gridworld 22 | """ 23 | 24 | self.actions = ((1, 0), (0, 1), (-1, 0), (0, -1)) 25 | self.n_actions = len(self.actions) 26 | self.n_states = grid_size**2 27 | self.grid_size = grid_size 28 | self.wind = wind 29 | self.discount = discount 30 | 31 | # Preconstruct the transition probability array. 32 | self.transition_probability = np.array( 33 | [[[self._transition_probability(i, j, k) 34 | for k in range(self.n_states)] 35 | for j in range(self.n_actions)] 36 | for i in range(self.n_states)]) 37 | 38 | def __str__(self): 39 | return "Gridworld({}, {}, {})".format(self.grid_size, self.wind, 40 | self.discount) 41 | 42 | def feature_vector(self, i, feature_map="ident"): 43 | """ 44 | Get the feature vector associated with a state integer. 45 | 46 | i: State int. 47 | feature_map: Which feature map to use (default ident). String in {ident, 48 | coord, proxi}. 49 | -> Feature vector. 50 | """ 51 | 52 | if feature_map == "coord": 53 | f = np.zeros(self.grid_size) 54 | x, y = i % self.grid_size, i // self.grid_size 55 | f[x] += 1 56 | f[y] += 1 57 | return f 58 | if feature_map == "proxi": 59 | f = np.zeros(self.n_states) 60 | x, y = i % self.grid_size, i // self.grid_size 61 | for b in range(self.grid_size): 62 | for a in range(self.grid_size): 63 | dist = abs(x - a) + abs(y - b) 64 | f[self.point_to_int((a, b))] = dist 65 | return f 66 | # Assume identity map. 
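# The identity map is a one-hot indicator of the state, so the full feature
# matrix is the n_states x n_states identity.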
67 | f = np.zeros(self.n_states) 68 | f[i] = 1 69 | return f 70 | 71 | def feature_matrix(self, feature_map="ident"): 72 | """ 73 | Get the feature matrix for this gridworld. 74 | 75 | feature_map: Which feature map to use (default ident). String in {ident, 76 | coord, proxi}. 77 | -> NumPy array with shape (n_states, d_states). 78 | """ 79 | 80 | features = [] 81 | for n in range(self.n_states): 82 | f = self.feature_vector(n, feature_map) 83 | features.append(f) 84 | return np.array(features) 85 | 86 | def int_to_point(self, i): 87 | """ 88 | Convert a state int into the corresponding coordinate. 89 | 90 | i: State int. 91 | -> (x, y) int tuple. 92 | """ 93 | 94 | return (i % self.grid_size, i // self.grid_size) 95 | 96 | def point_to_int(self, p): 97 | """ 98 | Convert a coordinate into the corresponding state int. 99 | 100 | p: (x, y) tuple. 101 | -> State int. 102 | """ 103 | 104 | return p[0] + p[1]*self.grid_size 105 | 106 | def neighbouring(self, i, k): 107 | """ 108 | Get whether two points neighbour each other. Also returns true if they 109 | are the same point. 110 | 111 | i: (x, y) int tuple. 112 | k: (x, y) int tuple. 113 | -> bool. 114 | """ 115 | 116 | return abs(i[0] - k[0]) + abs(i[1] - k[1]) <= 1 117 | 118 | def _transition_probability(self, i, j, k): 119 | """ 120 | Get the probability of transitioning from state i to state k given 121 | action j. 122 | 123 | i: State int. 124 | j: Action int. 125 | k: State int. 126 | -> p(s_k | s_i, a_j) 127 | """ 128 | 129 | xi, yi = self.int_to_point(i) 130 | xj, yj = self.actions[j] 131 | xk, yk = self.int_to_point(k) 132 | 133 | if not self.neighbouring((xi, yi), (xk, yk)): 134 | return 0.0 135 | 136 | # Is k the intended state to move to? 137 | if (xi + xj, yi + yj) == (xk, yk): 138 | return 1 - self.wind + self.wind/self.n_actions 139 | 140 | # If these are not the same point, then we can move there by wind. 141 | if (xi, yi) != (xk, yk): 142 | return self.wind/self.n_actions 143 | 144 | # If these are the same point, we can only move here by either moving 145 | # off the grid or being blown off the grid. Are we on a corner or not? 146 | if (xi, yi) in {(0, 0), (self.grid_size-1, self.grid_size-1), 147 | (0, self.grid_size-1), (self.grid_size-1, 0)}: 148 | # Corner. 149 | # Can move off the edge in two directions. 150 | # Did we intend to move off the grid? 151 | if not (0 <= xi + xj < self.grid_size and 152 | 0 <= yi + yj < self.grid_size): 153 | # We intended to move off the grid, so we have the regular 154 | # success chance of staying here plus an extra chance of blowing 155 | # onto the *other* off-grid square. 156 | return 1 - self.wind + 2*self.wind/self.n_actions 157 | else: 158 | # We can blow off the grid in either direction only by wind. 159 | return 2*self.wind/self.n_actions 160 | else: 161 | # Not a corner. Is it an edge? 162 | if (xi not in {0, self.grid_size-1} and 163 | yi not in {0, self.grid_size-1}): 164 | # Not an edge. 165 | return 0.0 166 | 167 | # Edge. 168 | # Can only move off the edge in one direction. 169 | # Did we intend to move off the grid? 170 | if not (0 <= xi + xj < self.grid_size and 171 | 0 <= yi + yj < self.grid_size): 172 | # We intended to move off the grid, so we have the regular 173 | # success chance of staying here. 174 | return 1 - self.wind + self.wind/self.n_actions 175 | else: 176 | # We can blow off the grid only by wind. 177 | return self.wind/self.n_actions 178 | 179 | def reward(self, state_int): 180 | """ 181 | Reward for being in state state_int. 
182 | 183 | state_int: State integer. int. 184 | -> Reward. 185 | """ 186 | 187 | if state_int == self.n_states - 1: 188 | return 1 189 | return 0 190 | 191 | def average_reward(self, n_trajectories, trajectory_length, policy): 192 | """ 193 | Calculate the average total reward obtained by following a given policy 194 | over n_paths paths. 195 | 196 | policy: Map from state integers to action integers. 197 | n_trajectories: Number of trajectories. int. 198 | trajectory_length: Length of an episode. int. 199 | -> Average reward, standard deviation. 200 | """ 201 | 202 | trajectories = self.generate_trajectories(n_trajectories, 203 | trajectory_length, policy) 204 | rewards = [[r for _, _, r in trajectory] for trajectory in trajectories] 205 | rewards = np.array(rewards) 206 | 207 | # Add up all the rewards to find the total reward. 208 | total_reward = rewards.sum(axis=1) 209 | 210 | # Return the average reward and standard deviation. 211 | return total_reward.mean(), total_reward.std() 212 | 213 | def optimal_policy(self, state_int): 214 | """ 215 | The optimal policy for this gridworld. 216 | 217 | state_int: What state we are in. int. 218 | -> Action int. 219 | """ 220 | 221 | sx, sy = self.int_to_point(state_int) 222 | 223 | if sx < self.grid_size and sy < self.grid_size: 224 | return rn.randint(0, 2) 225 | if sx < self.grid_size-1: 226 | return 0 227 | if sy < self.grid_size-1: 228 | return 1 229 | raise ValueError("Unexpected state.") 230 | 231 | def optimal_policy_deterministic(self, state_int): 232 | """ 233 | Deterministic version of the optimal policy for this gridworld. 234 | 235 | state_int: What state we are in. int. 236 | -> Action int. 237 | """ 238 | 239 | sx, sy = self.int_to_point(state_int) 240 | if sx < sy: 241 | return 0 242 | return 1 243 | 244 | def generate_trajectories(self, n_trajectories, trajectory_length, policy, 245 | random_start=False): 246 | """ 247 | Generate n_trajectories trajectories with length trajectory_length, 248 | following the given policy. 249 | 250 | n_trajectories: Number of trajectories. int. 251 | trajectory_length: Length of an episode. int. 252 | policy: Map from state integers to action integers. 253 | random_start: Whether to start randomly (default False). bool. 254 | -> [[(state int, action int, reward float)]] 255 | """ 256 | keep = [] 257 | trajectories = [] 258 | for _ in range(n_trajectories): 259 | if random_start: 260 | sx, sy = rn.randint(self.grid_size), rn.randint(self.grid_size) 261 | else: 262 | sx, sy = 0, 0 263 | 264 | trajectory = [] 265 | for _ in range(trajectory_length): 266 | if rn.random() < self.wind: 267 | action = self.actions[rn.randint(0, 4)] 268 | else: 269 | # Follow the given policy. 
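# keep only logs the on-policy action indices; it is never returned and has
# no effect on the generated trajectories.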
270 | keep.append(policy(self.point_to_int((sx, sy)))) 271 | action = self.actions[policy(self.point_to_int((sx, sy)))] 272 | 273 | if (0 <= sx + action[0] < self.grid_size and 274 | 0 <= sy + action[1] < self.grid_size): 275 | next_sx = sx + action[0] 276 | next_sy = sy + action[1] 277 | else: 278 | next_sx = sx 279 | next_sy = sy 280 | 281 | state_int = self.point_to_int((sx, sy)) 282 | action_int = self.actions.index(action) 283 | next_state_int = self.point_to_int((next_sx, next_sy)) 284 | reward = self.reward(next_state_int) 285 | trajectory.append((state_int, action_int, reward)) 286 | 287 | sx = next_sx 288 | sy = next_sy 289 | 290 | trajectories.append(trajectory) 291 | 292 | return np.array(trajectories) 293 | -------------------------------------------------------------------------------- /hierarchicalrl/sdp_value_iteration.py: -------------------------------------------------------------------------------- 1 | """ 2 | Find the value function associated with a policy. Based on Sutton & Barto, 1998. 3 | 4 | Todor Davchev, 2017 5 | t.b.davchev@ed.ac.uk 6 | """ 7 | 8 | import numpy as np 9 | 10 | 11 | def value(policy, n_states, transition_probabilities, reward, discount, 12 | threshold=1e-2): 13 | """ 14 | Find the value function associated with a policy. 15 | 16 | policy: List of action ints for each state. 17 | n_states: Number of states. int. 18 | transition_probabilities: Function taking (state, action, state) to 19 | transition probabilities. 20 | reward: Vector of rewards for each state. 21 | discount: MDP discount factor. float. 22 | threshold: Convergence threshold, default 1e-2. float. 23 | -> Array of values for each state 24 | """ 25 | v = np.zeros(n_states) 26 | 27 | diff = float("inf") 28 | while diff > threshold: 29 | diff = 0 30 | for s in range(n_states): 31 | vs = v[s] 32 | a = policy[s] 33 | v[s] = sum(transition_probabilities[s, a, k] * 34 | (reward[k] + discount * v[k]) 35 | for k in range(n_states)) 36 | diff = max(diff, abs(vs - v[s])) 37 | 38 | return v 39 | 40 | 41 | def optimal_value(option_states, n_actions, transition_probabilities, reward, 42 | discount, threshold=1e-2): 43 | """ 44 | Find the optimal value function. 45 | 46 | n_states: Number of states. int. 47 | n_actions: Number of actions. int. 48 | transition_probabilities: Function taking (state, action, state) to 49 | transition probabilities. 50 | reward: Vector of rewards for each state. 51 | discount: MDP discount factor. float. 52 | threshold: Convergence threshold, default 1e-2. float. 
53 | -> Array of values for each state 54 | """ 55 | 56 | value = np.zeros(len(option_states)) 57 | 58 | diff = float("inf") 59 | while diff > threshold: 60 | diff = 0 61 | for idx, state in enumerate(option_states): 62 | max_v = float("-inf") 63 | for action in range(n_actions): 64 | transition_p = transition_probabilities[state, action, :] 65 | transition_p = [ 66 | [ 67 | x for br, x in enumerate(transition_p) if br == opt_state] 68 | for opt_state in option_states] 69 | transition_p = np.asarray(transition_p) 70 | transition_p = np.reshape(transition_p, transition_p.shape[0]) 71 | max_v = max(max_v, sum( 72 | reward + np.dot(transition_p, (discount * value)))) 73 | # max_v = max(max_v, np.dot(tp, reward + discount*v)) 74 | 75 | new_diff = abs(value[idx] - max_v) 76 | if new_diff > diff: 77 | diff = new_diff 78 | value[idx] = max_v 79 | 80 | 81 | # diff = 0 82 | # for s in range(n_states): 83 | # max_v = float("-inf") 84 | # for a in range(n_actions): 85 | # tp = transition_probabilities[s, a, :] 86 | # max_v = max(max_v, np.dot(tp, reward + discount*v)) 87 | 88 | # new_diff = abs(v[s] - max_v) 89 | # if new_diff > diff: 90 | # diff = new_diff 91 | # v[s] = max_v 92 | 93 | 94 | return value 95 | 96 | 97 | def optimal_value_option(options_states, n_options, options_transition_probabilities, 98 | reward_o, discount, threshold=1e-2): 99 | value_o = np.zeros(121) 100 | diff_o = float("inf") 101 | while diff_o > threshold: 102 | diff_o = 0 103 | for state in range(121): 104 | max_vo = float("-inf") 105 | for option in range(8): 106 | transition_po = options_transition_probabilities[state, option, :] 107 | transition_po = np.asarray(transition_po) 108 | transition_po = np.reshape(transition_po, transition_po.shape[0]) 109 | # [filter(lambda x: x in c1, sublist) 110 | # for sublist in c2] 111 | max_vo = max(max_vo, 112 | reward_o[option] + np.dot(transition_po, (discount * value_o))) 113 | # max_v = max(max_v, np.dot(tp, reward + discount*v)) 114 | 115 | new_diff_o = abs(value_o[state] - max_vo) 116 | if new_diff_o > diff_o: 117 | diff_o = new_diff_o 118 | value_o[state] = max_vo 119 | 120 | return value_o 121 | 122 | def optimal_option_value(option_states, n_actions, transition_probabilities, reward, 123 | discount, threshold=1e-2): 124 | """ 125 | Find the optimal value function. 126 | 127 | n_states: Number of states. int. 128 | n_actions: Number of actions. int. 129 | transition_probabilities: Function taking (state, action, state) to 130 | transition probabilities. 131 | reward: Vector of rewards for each state. 132 | discount: MDP discount factor. float. 133 | threshold: Convergence threshold, default 1e-2. float. 
134 | -> Array of values for each state 135 | """ 136 | 137 | value = np.zeros(len(option_states)) 138 | 139 | diff = float("inf") 140 | while diff > threshold: 141 | diff = 0 142 | for idx, state in enumerate(option_states): 143 | max_v = float("-inf") 144 | for action in range(n_actions): 145 | transition_p = transition_probabilities[state, action, :] 146 | transition_p = [[x for br, x in enumerate(transition_p) if br == state] for state in option_states] 147 | transition_p = np.asarray(transition_p) 148 | transition_p = np.reshape(transition_p, transition_p.shape[0]) 149 | # [filter(lambda x: x in c1, sublist) 150 | # for sublist in c2] 151 | max_v = max(max_v, sum( 152 | reward + np.dot(transition_p[0], (discount * value)))) 153 | # max_v = max(max_v, np.dot(tp, reward + discount*v)) 154 | 155 | new_diff = abs(value[idx] - max_v) 156 | if new_diff > diff: 157 | diff = new_diff 158 | value[idx] = max_v 159 | 160 | return value 161 | 162 | # def optimal_value(n_states, n_actions, transition_probabilities, reward, 163 | # discount, threshold=1e-2): 164 | # """ 165 | # Find the optimal value function. 166 | 167 | # n_states: Number of states. int. 168 | # n_actions: Number of actions. int. 169 | # transition_probabilities: Function taking (state, action, state) to 170 | # transition probabilities. 171 | # reward: Vector of rewards for each state. 172 | # discount: MDP discount factor. float. 173 | # threshold: Convergence threshold, default 1e-2. float. 174 | # -> Array of values for each state 175 | # """ 176 | 177 | # v = np.zeros(n_states) 178 | 179 | # diff = float("inf") 180 | # while diff > threshold: 181 | # diff = 0 182 | # for s in range(n_states): 183 | # max_v = float("-inf") 184 | # for a in range(n_actions): 185 | # tp = transition_probabilities[s, a, :] 186 | # # max_v = max(max_v, sum(reward + np.dot(tp, discount*v))) 187 | # max_v = max(max_v, np.dot(tp, reward + discount*v)) 188 | 189 | # new_diff = abs(v[s] - max_v) 190 | # if new_diff > diff: 191 | # diff = new_diff 192 | # v[s] = max_v 193 | 194 | # return v 195 | 196 | 197 | def find_option_policy(options_states, n_states, n_actions, n_options, options_transition_probabilities, 198 | transition_probabilities, reward_o, reward, discount, 199 | threshold=1e-2, value=None, stochastic=True): 200 | q_values = []#np.zeros((len(options_states), n_states, n_actions)) 201 | if value is None: 202 | option_value = optimal_value_option(options_states, n_options, 203 | options_transition_probabilities, 204 | reward_o, discount, threshold) 205 | 206 | if stochastic: 207 | options_Q = np.zeros((121, n_options)) 208 | for i in range(121): 209 | for j in range(n_options): 210 | p = options_transition_probabilities[i, j, :] 211 | options_Q[i, j] = reward_o[j] + p.dot(discount*option_value) 212 | options_Q -= options_Q.max(axis=1).reshape((121, 1)) # For numerical stability. 213 | options_Q = np.exp(options_Q)/np.exp(options_Q).sum(axis=1).reshape((121, 1)) 214 | return options_Q 215 | 216 | def find_policy(options_states, n_states, n_actions, n_options, options_transition_probabilities, 217 | transition_probabilities, reward_o, reward, discount, 218 | threshold=1e-2, value=None, stochastic=True): 219 | """ 220 | Find the optimal policy. 221 | 222 | n_states: Number of states. int. 223 | n_actions: Number of actions. int. 224 | transition_probabilities: Function taking (state, action, state) to 225 | transition probabilities. 226 | reward: Vector of rewards for each state. 227 | discount: MDP discount factor. float. 
228 | threshold: Convergence threshold, default 1e-2. float. 229 | v: Value function (if known). Default None. 230 | stochastic: Whether the policy should be stochastic. Default True. 231 | -> Action probabilities for each state or action int for each state 232 | (depending on stochasticity). 233 | """ 234 | 235 | q_values = []#np.zeros((len(options_states), n_states, n_actions)) 236 | if value is None: 237 | value = [] 238 | option_value = [] 239 | for option, option_states in enumerate(options_states): 240 | value.append( 241 | optimal_value( 242 | option_states, n_actions, transition_probabilities[option], 243 | reward[option], discount, threshold)) 244 | 245 | if stochastic: 246 | for option, option_states in enumerate(options_states): 247 | q_values.append(np.zeros((len(option_states), n_actions))) 248 | # Get Q using equation 9.2 from Ziebart's thesis. 249 | for idx, i_state in enumerate(option_states): 250 | for j_action in range(n_actions): 251 | transition_p = transition_probabilities[option, i_state, j_action, :] 252 | transition_p = [[ 253 | x for br, x in enumerate(transition_p) if br == opt_state] 254 | for opt_state in option_states] 255 | transition_p = np.asarray(transition_p) 256 | transition_p = np.reshape(transition_p, transition_p.shape[0]) 257 | q_values[option][idx, j_action] = sum(reward[option] + transition_p.dot( 258 | discount * value[option])) 259 | q_values[option] -= q_values[option].max(axis=1).reshape((n_states[option], 1)) 260 | # For numerical stability. 261 | q_values[option] = np.exp( 262 | q_values[option]) / np.exp(q_values[option]).sum(axis=1).reshape( 263 | (n_states[option], 1)) 264 | 265 | # Q = np.zeros((n_states, n_actions)) 266 | # for i in range(n_states): 267 | # for j in range(n_actions): 268 | # p = transition_probabilities[i, j, :] 269 | # Q[i, j] = p.dot(reward + discount*v) 270 | # Q -= Q.max(axis=1).reshape((n_states, 1)) # For numerical stability. 271 | # Q = np.exp(Q)/np.exp(Q).sum(axis=1).reshape((n_states, 1)) 272 | # return Q 273 | 274 | return q_values 275 | 276 | # def _policy(s): 277 | # return max(range(n_actions), 278 | # key=lambda a: sum(transition_probabilities[s, a, k] * 279 | # (reward[k] + discount * v[k]) 280 | # for k in range(n_states))) 281 | # policy = np.array([_policy(s) for s in range(n_states)]) 282 | # return policy 283 | 284 | 285 | if __name__ == '__main__': 286 | # Quick unit test using gridworld. 287 | import mdp.gridworld as gridworld 288 | gw = gridworld.Gridworld(3, 0.3, 0.9) 289 | v = value([gw.optimal_policy_deterministic(s) for s in range(gw.n_states)], 290 | gw.n_states, 291 | gw.transition_probability, 292 | [gw.reward(s) for s in range(gw.n_states)], 293 | gw.discount) 294 | assert np.isclose(v, 295 | [5.7194282, 6.46706692, 6.42589811, 296 | 6.46706692, 7.47058224, 7.96505174, 297 | 6.42589811, 7.96505174, 8.19268666], 1).all() 298 | opt_v = optimal_value(gw.n_states, 299 | gw.n_actions, 300 | gw.transition_probability, 301 | [gw.reward(s) for s in range(gw.n_states)], 302 | gw.discount) 303 | assert np.isclose(v, opt_v).all() 304 | -------------------------------------------------------------------------------- /irl/deep_maxent.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements deep maximum entropy inverse reinforcement learning based on 3 | Ziebart et al., 2008 and Wulfmeier et al., 2015, using symbolic methods with 4 | Theano. 
5 | 6 | Matthew Alger, 2015 7 | matthew.alger@anu.edu.au 8 | """ 9 | 10 | from itertools import product 11 | 12 | import numpy as np 13 | import numpy.random as rn 14 | import theano as th 15 | import theano.tensor as T 16 | 17 | from . import maxent 18 | 19 | FLOAT = th.config.floatX 20 | 21 | def find_svf(n_states, trajectories): 22 | """ 23 | Find the state vistiation frequency from trajectories. 24 | 25 | n_states: Number of states. int. 26 | trajectories: 3D array of state/action pairs. States are ints, actions 27 | are ints. NumPy array with shape (T, L, 2) where T is the number of 28 | trajectories and L is the trajectory length. 29 | -> State visitation frequencies vector with shape (N,). 30 | """ 31 | 32 | svf = np.zeros(n_states) 33 | 34 | for trajectory in trajectories: 35 | for state, _, _ in trajectory: 36 | svf[state] += 1 37 | 38 | svf /= trajectories.shape[0] 39 | 40 | return th.shared(svf, "svf", borrow=True) 41 | 42 | def optimal_value(n_states, n_actions, transition_probabilities, reward, 43 | discount, threshold=1e-2): 44 | """ 45 | Find the optimal value function. 46 | 47 | n_states: Number of states. int. 48 | n_actions: Number of actions. int. 49 | transition_probabilities: Function taking (state, action, state) to 50 | transition probabilities. 51 | reward: Vector of rewards for each state. 52 | discount: MDP discount factor. float. 53 | threshold: Convergence threshold, default 1e-2. float. 54 | -> Array of values for each state 55 | """ 56 | 57 | v = T.zeros(n_states, dtype=FLOAT) 58 | 59 | def update(s, prev_diff, v, reward, tps): 60 | max_v = float("-inf") 61 | v_template = T.zeros_like(v) 62 | for a in range(n_actions): 63 | tp = tps[s, a, :] 64 | max_v = T.largest(max_v, T.dot(tp, reward + discount*v)) 65 | new_diff = abs(v[s] - max_v) 66 | if T.lt(prev_diff, new_diff): 67 | diff = new_diff 68 | else: 69 | diff = prev_diff 70 | return (diff, T.set_subtensor(v_template[s], max_v)), {} 71 | 72 | def until_converged(diff, v): 73 | (diff, vs), _ = th.scan( 74 | fn=update, 75 | outputs_info=[{"initial": diff, "taps": [-1]}, 76 | None], 77 | sequences=[T.arange(n_states)], 78 | non_sequences=[v, reward, transition_probabilities]) 79 | return ((diff[-1], vs.sum(axis=0)), {}, 80 | th.scan_module.until(diff[-1] < threshold)) 81 | 82 | (_, vs), _ = th.scan(fn = until_converged, 83 | outputs_info=[ 84 | # Need to force an inf into the right Theano 85 | # data type and this seems to be the only way that 86 | # works. 87 | {"initial": getattr(np, FLOAT)(float("inf")), 88 | "taps": [-1]}, 89 | {"initial": v, 90 | "taps": [-1]}], 91 | n_steps=1000) 92 | 93 | return vs[-1] 94 | 95 | def find_policy(n_states, n_actions, transition_probabilities, reward, discount, 96 | threshold=1e-2, v=None): 97 | """ 98 | Find the optimal policy. 99 | 100 | n_states: Number of states. int. 101 | n_actions: Number of actions. int. 102 | transition_probabilities: Function taking (state, action, state) to 103 | transition probabilities. 104 | reward: Vector of rewards for each state. 105 | discount: MDP discount factor. float. 106 | threshold: Convergence threshold, default 1e-2. float. 107 | v: Optimal value array (if known). Default None. 108 | -> Action probabilities for each state. 109 | """ 110 | 111 | if v is None: 112 | v = optimal_value(n_states, n_actions, transition_probabilities, reward, 113 | discount, threshold) 114 | 115 | # Get Q using equation 9.2 from Ziebart's thesis. 
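# Q(s, a) = sum_k P(k | s, a) * (r_k + discount * V_k), computed symbolically
# below with a Theano scan over all (state, action) pairs and then normalised
# row-wise with a softmax to give a stochastic policy.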
116 | Q = T.zeros((n_states, n_actions)) 117 | def make_Q(i, j, tps, Q, reward, v): 118 | Q_template = T.zeros_like(Q) 119 | tp = transition_probabilities[i, j, :] 120 | return T.set_subtensor(Q_template[i, j], tp.dot(reward + discount*v)),{} 121 | 122 | prod = np.array(list(product(range(n_states), range(n_actions)))) 123 | state_range = th.shared(prod[:, 0]) 124 | action_range = th.shared(prod[:, 1]) 125 | Qs, _ = th.scan(fn=make_Q, 126 | outputs_info=None, 127 | sequences=[state_range, action_range], 128 | non_sequences=[transition_probabilities, Q, reward, v]) 129 | Q = Qs.sum(axis=0) 130 | Q -= Q.max(axis=1).reshape((n_states, 1)) # For numerical stability. 131 | Q = T.exp(Q)/T.exp(Q).sum(axis=1).reshape((n_states, 1)) 132 | return Q 133 | 134 | def find_expected_svf(n_states, r, n_actions, discount, 135 | transition_probability, trajectories): 136 | """ 137 | Find the expected state visitation frequencies using algorithm 1 from 138 | Ziebart et al. 2008. 139 | 140 | n_states: Number of states N. int. 141 | alpha: Reward. NumPy array with shape (N,). 142 | n_actions: Number of actions A. int. 143 | discount: Discount factor of the MDP. float. 144 | transition_probability: NumPy array mapping (state_i, action, state_k) to 145 | the probability of transitioning from state_i to state_k under action. 146 | Shape (N, A, N). 147 | trajectories: 3D array of state/action pairs. States are ints, actions 148 | are ints. NumPy array with shape (T, L, 2) where T is the number of 149 | trajectories and L is the trajectory length. 150 | -> Expected state visitation frequencies vector with shape (N,). 151 | """ 152 | 153 | n_trajectories = trajectories.shape[0] 154 | trajectory_length = trajectories.shape[1] 155 | 156 | policy = find_policy(n_states, n_actions, 157 | transition_probability, r, discount) 158 | 159 | start_state_count = T.extra_ops.bincount(trajectories[:, 0, 0], 160 | minlength=n_states) 161 | p_start_state = start_state_count.astype(FLOAT)/n_trajectories 162 | 163 | def state_visitation_step(i, j, prev_svf, policy, tps): 164 | """ 165 | The sum of the outputs of a scan over this will be a row of the svf. 166 | """ 167 | 168 | svf = prev_svf[i] * policy[i, j] * tps[i, j, :] 169 | return svf, {} 170 | 171 | prod = np.array(list(product(range(n_states), range(n_actions)))) 172 | state_range = th.shared(prod[:, 0]) 173 | action_range = th.shared(prod[:, 1]) 174 | def state_visitation_row(prev_svf, policy, tps, state_range, action_range): 175 | svf_t, _ = th.scan(fn=state_visitation_step, 176 | sequences=[state_range, action_range], 177 | non_sequences=[prev_svf, policy, tps]) 178 | svf_t = svf_t.sum(axis=0) 179 | return svf_t, {} 180 | 181 | svf, _ = th.scan(fn=state_visitation_row, 182 | outputs_info=[{"initial": p_start_state, "taps": [-1]}], 183 | n_steps=trajectories.shape[1]-1, 184 | non_sequences=[policy, transition_probability, state_range, 185 | action_range]) 186 | 187 | return svf.sum(axis=0) + p_start_state 188 | 189 | def irl(structure, feature_matrix, n_actions, discount, transition_probability, 190 | trajectories, epochs, learning_rate, initialisation="normal", l1=0.1, 191 | l2=0.1): 192 | """ 193 | Find the reward function for the given trajectories. 194 | 195 | structure: Neural network structure tuple, e.g. (10, 3, 3) would be a 196 | 3-layer neural network with 10 inputs. 197 | feature_matrix: Matrix with the nth row representing the nth state. NumPy 198 | array with shape (N, D) where N is the number of states and D is the 199 | dimensionality of the state. 
200 | n_actions: Number of actions A. int. 201 | discount: Discount factor of the MDP. float. 202 | transition_probability: NumPy array mapping (state_i, action, state_k) to 203 | the probability of transitioning from state_i to state_k under action. 204 | Shape (N, A, N). 205 | trajectories: 3D array of state/action pairs. States are ints, actions 206 | are ints. NumPy array with shape (T, L, 2) where T is the number of 207 | trajectories and L is the trajectory length. 208 | epochs: Number of gradient descent steps. int. 209 | learning_rate: Gradient descent learning rate. float. 210 | initialisation: What distribution to use. str in {normal, uniform}. Default 211 | normal. 212 | l1: L1 regularisation. Default 0.1. float. 213 | l2: L2 regularisation. Default 0.1. float. 214 | -> Reward vector with shape (N,). 215 | """ 216 | 217 | n_states, d_states = feature_matrix.shape 218 | transition_probability = th.shared(transition_probability, borrow=True) 219 | trajectories = th.shared(trajectories, borrow=True) 220 | 221 | # Initialise W matrices; b biases. 222 | n_layers = len(structure)-1 223 | weights = [] 224 | hist_w_grads = [] # For AdaGrad. 225 | biases = [] 226 | hist_b_grads = [] # For AdaGrad. 227 | for i in range(n_layers): 228 | # W 229 | shape = (structure[i+1], structure[i]) 230 | if initialisation == "normal": 231 | matrix = th.shared(rn.normal(size=shape), name="W", borrow=True) 232 | else: 233 | matrix = th.shared(rn.uniform(size=shape), name="W", borrow=True) 234 | weights.append(matrix) 235 | hist_w_grads.append(th.shared(np.zeros(shape), name="hdW", borrow=True)) 236 | 237 | # b 238 | shape = (structure[i+1], 1) 239 | if initialisation == "normal": 240 | matrix = th.shared(rn.normal(size=shape), name="b", borrow=True) 241 | else: 242 | matrix = th.shared(rn.uniform(size=shape), name="b", borrow=True) 243 | biases.append(matrix) 244 | hist_b_grads.append(th.shared(np.zeros(shape), name="hdb", borrow=True)) 245 | 246 | # Initialise α weight, β bias. 247 | if initialisation == "normal": 248 | α = th.shared(rn.normal(size=(1, structure[-1])), name="alpha", 249 | borrow=True) 250 | else: 251 | α = th.shared(rn.uniform(size=(1, structure[-1])), name="alpha", 252 | borrow=True) 253 | hist_α_grad = T.zeros(α.shape) # For AdaGrad. 254 | 255 | adagrad_epsilon = 1e-6 # AdaGrad numerical stability. 256 | 257 | #### Theano symbolic setup. #### 258 | 259 | # Symbolic input. 260 | s_feature_matrix = T.matrix("x") 261 | # Feature matrices. 262 | # All dimensions of the form (d_layer, n_states). 263 | φs = [s_feature_matrix.T] 264 | # Forward propagation. 265 | for W, b in zip(weights, biases): 266 | φ = T.nnet.sigmoid(th.compile.ops.Rebroadcast((0, False), (1, True))(b) 267 | + W.dot(φs[-1])) 268 | φs.append(φ) 269 | # φs[1] = φ1 etc. 270 | # Reward. 271 | r = α.dot(φs[-1]).reshape((n_states,)) 272 | # Engineering hack: z-score the reward. 273 | r = (r - r.mean())/r.std() 274 | # Associated feature expectations. 275 | expected_svf = find_expected_svf(n_states, r, 276 | n_actions, discount, 277 | transition_probability, 278 | trajectories) 279 | svf = maxent.find_svf(n_states, trajectories.get_value()) 280 | # Derivatives (backward propagation). 281 | updates = [] 282 | α_grad = φs[-1].dot(svf - expected_svf).T 283 | hist_α_grad += α_grad**2 284 | adj_α_grad = α_grad/(adagrad_epsilon + T.sqrt(hist_α_grad)) 285 | updates.append((α, α + adj_α_grad*learning_rate)) 286 | 287 | def grad_for_state(s, theta, svf_diff, r): 288 | """ 289 | Calculate the gradient with respect to theta for one state. 
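
        In symbols, for a single state s this evaluates to

            (svf[s] - expected_svf[s]) * d r[s] / d theta
                - (l1 * sum(|theta|) + l2 * sum(theta**2)),

        and the enclosing scan sums the result over states before the
        AdaGrad-scaled update is applied.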
290 | """ 291 | 292 | regularisation = abs(theta).sum()*l1 + (theta**2).sum()*l2 293 | return svf_diff[s] * T.grad(r[s], theta) - regularisation, {} 294 | 295 | for i, W in enumerate(weights): 296 | w_grads, _ = th.scan(fn=grad_for_state, 297 | sequences=[T.arange(n_states)], 298 | non_sequences=[W, svf - expected_svf, r]) 299 | w_grad = w_grads.sum(axis=0) 300 | hist_w_grads[i] += w_grad**2 301 | adj_w_grad = w_grad/(adagrad_epsilon + T.sqrt(hist_w_grads[i])) 302 | updates.append((W, W + adj_w_grad*learning_rate)) 303 | for i, b in enumerate(biases): 304 | b_grads, _ = th.scan(fn=grad_for_state, 305 | sequences=[T.arange(n_states)], 306 | non_sequences=[b, svf - expected_svf, r]) 307 | b_grad = b_grads.sum(axis=0) 308 | hist_b_grads[i] += b_grad**2 309 | adj_b_grad = b_grad/(adagrad_epsilon + T.sqrt(hist_b_grads[i])) 310 | updates.append((b, b + adj_b_grad*learning_rate)) 311 | 312 | train = th.function([s_feature_matrix], updates=updates, outputs=r) 313 | run = th.function([s_feature_matrix], outputs=r) 314 | 315 | for e in range(epochs): 316 | reward = train(feature_matrix) 317 | 318 | return reward.reshape((n_states,)) 319 | -------------------------------------------------------------------------------- /hierarchicalrl/sdp_maxent.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements Semi-MDP maximum entropy inverse reinforcement learning (Ziebart et al., 2008) 3 | 4 | Todor Davchev, 2017 5 | t.b.davchev@ed.ac.uk 6 | """ 7 | 8 | from itertools import product 9 | 10 | import numpy as np 11 | import numpy.random as rn 12 | 13 | import sdp_value_iteration as value_iteration 14 | 15 | def irl(options_states, features_matrix, o_feature_matrix, n_actions, n_options, discount, 16 | options_transition_probability, transition_probability, 17 | trajectories, global_trajectories, epochs, learning_rate, int_to_point, options): 18 | """ 19 | Find the reward function for the given trajectories. 20 | 21 | feature_matrix: Matrix with the nth row representing the nth state. NumPy 22 | array with shape (N, D) where N is the number of states and D is the 23 | dimensionality of the state. 24 | n_actions: Number of actions A. int. 25 | discount: Discount factor of the MDP. float. 26 | transition_probability: NumPy array mapping (state_i, action, state_k) to 27 | the probability of transitioning from state_i to state_k under action. 28 | Shape (N, A, N). 29 | trajectories: 3D array of state/action pairs. States are ints, actions 30 | are ints. NumPy array with shape (T, L, 2) where T is the number of 31 | trajectories and L is the trajectory length. 32 | epochs: Number of gradient descent steps. int. 33 | learning_rate: Gradient descent learning rate. float. 34 | -> Reward vector with shape (N,). 35 | """ 36 | 37 | n_states = [np.asarray(i).shape[0] for i in features_matrix] 38 | d_states = [np.asarray(i).shape[1] for i in features_matrix] 39 | on_states = o_feature_matrix.shape[0] 40 | od_states = o_feature_matrix.shape[1] 41 | # n_states, d_states = [features_matrix[i].shape for i in xrange(features_matrix)] 42 | 43 | # Initialise weights. 44 | alpha = [rn.uniform(size=(d_st,)) for d_st in d_states] 45 | o_alpha = rn.uniform(size=(od_states,)) 46 | 47 | # option = 0 48 | # Calculate the feature expectations \tilde{phi}. 49 | 50 | # change the samples to go from option through option etc ... 
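    # Sketch of the update scheme implemented below (descriptive only):
    #     phi_tilde[o]  empirical feature expectations for option o
    #     E[svf | o]    expected state visitation frequencies under the current
    #                   reward, restricted to the states of option o
    #     grad[o] = phi_tilde[o] - Phi[o]^T . E[svf | o]
    #     o_grad  = phi_tilde_options - Phi_options^T . E[option svf]
    # alpha[o] and o_alpha are then moved along these gradients, scaled by
    # learning_rate, as in standard MaxEnt IRL.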
51 |     feature_expectations, options_feature_expectations = find_feature_expectations(
52 |         features_matrix, o_feature_matrix, trajectories, global_trajectories, options_states)
53 | 
54 |     # Gradient ascent on alpha.
55 |     for i in range(epochs):
56 |         # print("i: {}".format(i))
57 |         r = np.asarray([np.asarray(features_matrix[opt]).dot(alpha[opt]) for opt in range(n_options)])
58 |         r_o = o_feature_matrix.dot(o_alpha)
59 |         expected_svf, options_expected_svf = find_expected_svf(
60 |             options_states, on_states, n_states, r_o, r,
61 |             n_actions, n_options, discount, options_transition_probability,
62 |             transition_probability, trajectories, global_trajectories)
63 |         # not for option 0 only but for all options
64 |         modif_expected_svf = [
65 |             [
66 |                 [
67 |                     item for idx, item in enumerate(expected_svf[opt]) if idx == opt_state]
68 |                 for opt_state in options_states[opt]]
69 |             for opt in range(n_options)]
70 |         grad = [feature_expectations[opt] - np.asarray(features_matrix[opt]).T.dot(modif_expected_svf[opt]).reshape((n_states[opt],)) for opt in range(n_options)]
71 |         modif_opt_exp_svf = [[value for idx, value in enumerate(options_expected_svf) if int_to_point(idx) == opt["termination"]][0] for opt in options]
72 |         o_grad = options_feature_expectations - o_feature_matrix.T.dot(modif_opt_exp_svf)
73 | 
74 |         alpha = [alpha[opt] + learning_rate * grad[opt] for opt in range(n_options)]  # element-wise; "alpha += [...]" would extend the list instead of updating the weights
75 |         o_alpha += learning_rate * o_grad
76 | 
77 |     return [np.asarray(features_matrix[opt]).dot(alpha[opt]).reshape((n_states[opt],)) for opt in range(n_options)],\
78 |         o_feature_matrix.dot(o_alpha).reshape((n_options,))
79 | 
80 | def find_svf(n_states, trajectories):
81 |     """
82 |     Find the state visitation frequency from trajectories.
83 | 
84 |     n_states: Number of states. int.
85 |     trajectories: 3D array of state/action pairs. States are ints, actions
86 |         are ints. NumPy array with shape (T, L, 2) where T is the number of
87 |         trajectories and L is the trajectory length.
88 |     -> State visitation frequencies vector with shape (N,).
89 |     """
90 | 
91 |     svf = np.zeros(n_states)
92 | 
93 |     for trajectory in trajectories:
94 |         for state, _, _ in trajectory:
95 |             svf[state] += 1
96 | 
97 |     svf /= trajectories.shape[0]
98 | 
99 |     return svf
100 | 
101 | def find_feature_expectations(feature_matrix, o_feature_matrix, trajectories, global_trajectories, options_states):
102 |     """
103 |     Find the feature expectations for the given trajectories. This is the
104 |     average path feature vector.
105 | 
106 |     feature_matrix: Matrix with the nth row representing the nth state. NumPy
107 |         array with shape (N, D) where N is the number of states and D is the
108 |         dimensionality of the state.
109 |     trajectories: 3D array of state/action pairs. States are ints, actions
110 |         are ints. NumPy array with shape (T, L, 2) where T is the number of
111 |         trajectories and L is the trajectory length.
112 |     -> Feature expectations vector with shape (D,).
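
    In symbols, for each option o this computes

        phi_tilde[o] = (1 / T_o) * sum over trajectories and steps of the
                       feature_matrix[o] row of the visited state,

    where T_o is the number of trajectories recorded for option o; the
    option-level expectation is the analogous average of o_feature_matrix rows
    over the options used in the global trajectories.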
113 |     """
114 |     option_feature_expectations = np.zeros(len(o_feature_matrix))  # how many options there are and which one was used the most
115 |     feature_expectations = [np.zeros(len(feature_matrix[i])) for i in xrange(len(options_states))]
116 |     for br, option_states in enumerate(options_states):
117 |         for trajectory in trajectories[br]:
118 |             # for state, _, _ in trajectory:
119 |             for traj_id in trajectory:
120 |                 feature_expectations[br] += feature_matrix[br][
121 |                     [idx for idx, state in enumerate(option_states) if state == traj_id[0]][0]]
122 | 
123 |         feature_expectations[br] /= trajectories[br].shape[0]
124 | 
125 |     for global_traj in global_trajectories:
126 |         for option_used in global_traj:
127 |             option_feature_expectations += o_feature_matrix[option_used[1]]
128 | 
129 |     option_feature_expectations /= global_trajectories.shape[0]
130 | 
131 |     return feature_expectations, option_feature_expectations
132 | 
133 | def find_expected_svf(options_states, on_states, n_states, r_o, r, n_actions, n_options, discount,
134 |                       options_transition_probability, transition_probability, trajectories,
135 |                       global_trajectories):
136 |     """
137 |     Find the expected state visitation frequencies using algorithm 1 from
138 |     Ziebart et al. 2008.
139 | 
140 |     n_states: Number of states N. int.
141 |     r: Per-option reward arrays; r_o: option-level reward vector with shape (n_options,).
142 |     n_actions: Number of actions A. int.
143 |     discount: Discount factor of the MDP. float.
144 |     transition_probability: NumPy array mapping (state_i, action, state_k) to
145 |         the probability of transitioning from state_i to state_k under action.
146 |         Shape (N, A, N).
147 |     trajectories: 3D array of state/action pairs. States are ints, actions
148 |         are ints. NumPy array with shape (T, L, 2) where T is the number of
149 |         trajectories and L is the trajectory length.
150 |     -> Expected state visitation frequencies vector with shape (N,).
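
    The recursion applied below is algorithm 1 from Ziebart et al. 2008, run
    once at the option level and once within each option:

        D_0(s) = p(s is a start state)
        D_t(k) = sum_i sum_j D_{t-1}(i) * policy(j | i) * P(k | i, j)

    and the returned frequencies are sum_t D_t.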
151 | """ 152 | 153 | n_trajectories = trajectories[0].shape[0] 154 | trajectory_lengths = trajectories[0].shape[1] 155 | 156 | # policy = find_policy(n_states, r, n_actions, discount, 157 | # transition_probability) 158 | 159 | # policy = [[] for _ in range(len(options_states))] 160 | policy = value_iteration.find_policy(options_states, n_states, n_actions, n_options, 161 | options_transition_probability, 162 | transition_probability, r_o, r, discount) 163 | 164 | options_policy = value_iteration.find_option_policy( 165 | options_states, n_states, n_actions, n_options, options_transition_probability, 166 | transition_probability, r_o, r, discount) 167 | 168 | # option-to-option 169 | opt_start_state_count = np.zeros(121) 170 | global_trajectory_length = 0 171 | length = 0 172 | for gl_trajectory in global_trajectories: 173 | count = 0 174 | opt_start_state_count[gl_trajectory[0][0][0]] += 1 175 | for trajectory in gl_trajectory: 176 | count += len(trajectories[0]) 177 | 178 | if count > length: 179 | length = count 180 | 181 | op_start_state = opt_start_state_count/n_trajectories 182 | 183 | opt_expected_svf = np.tile(op_start_state, (length, 1)).T 184 | for t in range(1, length): 185 | opt_expected_svf[:, t] = 0 186 | for i, j, k in product(range(121), range(n_options), range(121)): 187 | opt_expected_svf[k, t] += (opt_expected_svf[i, t-1] * 188 | options_policy[i, j] * # Stochastic policy 189 | options_transition_probability[i, j, k]) 190 | 191 | options_result = opt_expected_svf.sum(axis=1) 192 | 193 | # intra-options 194 | start_state_count = np.zeros((8, 121)) 195 | p_start_state = [] 196 | for option in range(n_options): 197 | for trajectory in trajectories[option]: 198 | start_state_count[option][trajectory[0, 0]] += 1 199 | p_start_state.append(start_state_count[option]/n_trajectories) 200 | result = [] 201 | expected_svf = [np.tile(p_start_state[opt], (trajectory_lengths, 1)).T for opt in range(len(options_states))] 202 | ids = [[ 203 | 56, 45, 44, 46, 47, 48, 204 | 33, 34, 35, 36, 37, 205 | 22, 23, 24, 25, 26, 27, 206 | 11, 12, 13, 14, 15, 207 | 0, 1, 2, 3, 4 208 | ], 209 | [ 210 | 27, 26, 15, 4, 37, 48, 211 | 3, 14, 25, 36, 47, 212 | 2, 13, 24, 35, 46, 213 | 1, 12, 23, 34, 45, 214 | 0, 11, 22, 33, 44, 56 215 | ], 216 | [ 217 | 27, 28, 17, 39, 17, 6, 39, 50, 61, 218 | 62, 51, 40, 29, 18, 7, 219 | 8, 19, 30, 41, 52, 63, 74, 220 | 9, 20, 31, 42, 53, 64, 221 | 10, 21, 32, 43, 54, 65 222 | ], 223 | [ 224 | 74, 63, 62, 61, 64, 65, 225 | 50, 51, 52, 53, 54, 226 | 39, 40, 41, 42, 43, 227 | 32, 31, 30, 29, 28, 27, 228 | 17, 18, 19, 20, 21, 6, 7, 8, 9, 10 229 | ], 230 | [ 231 | 74, 85, 84, 83, 86, 87, 232 | 94, 95, 96, 97, 98, 233 | 109, 108, 107, 106, 105, 104, 234 | 116, 117, 118, 119, 120 235 | ], 236 | [ 237 | 104, 105, 116, 94, 83, 238 | 84, 95, 106, 117, 239 | 118, 107, 96, 85, 74, 240 | 86, 97, 108, 119, 241 | 87, 98, 109, 120 242 | ], 243 | [ 244 | 104, 103, 114, 92, 81, 70, 245 | 69, 80, 91, 102, 113, 246 | 68, 79, 90, 101, 112, 247 | 67, 56, 78, 89, 100, 111, 248 | 66, 77, 88, 99, 110 249 | ], 250 | [ 251 | 56, 67, 66, 68, 69, 70, 252 | 77, 78, 79, 80, 81, 253 | 88, 89, 90, 91, 92, 254 | 99, 100, 101, 102, 103, 104, 255 | 110, 111, 112, 113, 114 256 | ]] 257 | for o in range(len(ids)): 258 | for t in range(1, trajectory_lengths): 259 | expected_svf[o][:, t] = 0 260 | for i, j, k in product(ids[o], range(n_actions),ids[o]): 261 | if i in options_states[o]: 262 | idme = [idx for idx, state in enumerate(options_states[o]) if state == i][0] 263 | # Stochastic policy 264 | 
expected_svf[o][k, t] += (expected_svf[o][i, t-1] * policy[o][idme, j] * 265 | transition_probability[o][i, j, k]) 266 | else: 267 | expected_svf[o][k, t] = 0 268 | 269 | result.append(expected_svf[o].sum(axis=1)) 270 | 271 | return result, options_result 272 | 273 | def softmax(x1, x2): 274 | """ 275 | Soft-maximum calculation, from algorithm 9.2 in Ziebart's PhD thesis. 276 | 277 | x1: float. 278 | x2: float. 279 | -> softmax(x1, x2) 280 | """ 281 | 282 | max_x = max(x1, x2) 283 | min_x = min(x1, x2) 284 | return max_x + np.log(1 + np.exp(min_x - max_x)) 285 | 286 | def find_policy(n_states, r, n_actions, discount, 287 | transition_probability): 288 | """ 289 | Find a policy with linear value iteration. Based on the code accompanying 290 | the Levine et al. GPIRL paper and on Ziebart's PhD thesis (algorithm 9.1). 291 | 292 | n_states: Number of states N. int. 293 | r: Reward. NumPy array with shape (N,). 294 | n_actions: Number of actions A. int. 295 | discount: Discount factor of the MDP. float. 296 | transition_probability: NumPy array mapping (state_i, action, state_k) to 297 | the probability of transitioning from state_i to state_k under action. 298 | Shape (N, A, N). 299 | -> NumPy array of states and the probability of taking each action in that 300 | state, with shape (N, A). 301 | """ 302 | 303 | # V = value_iteration.value(n_states, transition_probability, r, discount) 304 | 305 | # NumPy's dot really dislikes using inf, so I'm making everything finite 306 | # using nan_to_num. 307 | V = np.nan_to_num(np.ones((n_states, 1)) * float("-inf")) 308 | 309 | diff = np.ones((n_states,)) 310 | while (diff > 1e-4).all(): # Iterate until convergence. 311 | new_V = r.copy() 312 | for j in range(n_actions): 313 | for i in range(n_states): 314 | new_V[i] = softmax(new_V[i], r[i] + discount* 315 | np.sum(transition_probability[i, j, k] * V[k] 316 | for k in range(n_states))) 317 | 318 | # # This seems to diverge, so we z-score it (engineering hack). 319 | new_V = (new_V - new_V.mean())/new_V.std() 320 | 321 | diff = abs(V - new_V) 322 | V = new_V 323 | 324 | # We really want Q, not V, so grab that using equation 9.2 from the thesis. 325 | Q = np.zeros((n_states, n_actions)) 326 | for i in range(n_states): 327 | for j in range(n_actions): 328 | p = np.array([transition_probability[i, j, k] 329 | for k in range(n_states)]) 330 | Q[i, j] = p.dot(r + discount*V) 331 | 332 | # Softmax by row to interpret these values as probabilities. 333 | Q -= Q.max(axis=1).reshape((n_states, 1)) # For numerical stability. 334 | Q = np.exp(Q)/np.exp(Q).sum(axis=1).reshape((n_states, 1)) 335 | return Q 336 | 337 | def expected_value_difference(n_states, n_actions, transition_probability, 338 | reward, discount, p_start_state, optimal_value, true_reward): 339 | """ 340 | Calculate the expected value difference, which is a proxy to how good a 341 | recovered reward function is. 342 | 343 | n_states: Number of states. int. 344 | n_actions: Number of actions. int. 345 | transition_probability: NumPy array mapping (state_i, action, state_k) to 346 | the probability of transitioning from state_i to state_k under action. 347 | Shape (N, A, N). 348 | reward: Reward vector mapping state int to reward. Shape (N,). 349 | discount: Discount factor. float. 350 | p_start_state: Probability vector with the ith component as the probability 351 | that the ith state is the start state. Shape (N,). 352 | optimal_value: Value vector for the ground reward with optimal policy. 353 | The ith component is the value of the ith state. 
Shape (N,).
354 |         true_reward: True reward vector. Shape (N,).
355 |     -> Expected value difference. float.
356 |     """
357 | 
358 |     policy = value_iteration.find_policy(n_states, n_actions,
359 |                                          transition_probability, reward, discount)
360 |     value = value_iteration.value(policy.argmax(axis=1), n_states,
361 |                                   transition_probability, true_reward, discount)
362 | 
363 |     evd = optimal_value.dot(p_start_state) - value.dot(p_start_state)
364 |     return evd
365 | 
--------------------------------------------------------------------------------
/examples/experiments.py:
--------------------------------------------------------------------------------
1 | """
2 | Perform the experiments from the report.
3 | 
4 | Matthew Alger, 2015
5 | matthew.alger@anu.edu.au
6 | """
7 | 
8 | from time import time
9 | from sys import stdout
10 | 
11 | import sys
12 | sys.path.append("/home/todor/Documents/workspace/smdp")
13 | 
14 | import numpy as np
15 | import matplotlib.pyplot as plt
16 | 
17 | from irl import maxent
18 | from irl import deep_maxent
19 | from irl import value_iteration
20 | from irl.mdp.gridworld import Gridworld
21 | from irl.mdp.objectworld import Objectworld
22 | 
23 | def test_gw_once(grid_size, feature_map, n_samples, epochs, structure):
24 |     """
25 |     Test MaxEnt and DeepMaxEnt on a gridworld of size grid_size with the
26 |     feature map feature_map, using n_samples sampled paths.
27 | 
28 |     grid_size: Grid size. int.
29 |     feature_map: Which feature map to use. String in {ident, coord, proxi}.
30 |     n_samples: Number of paths to sample.
31 |     epochs: Number of epochs to run MaxEnt with.
32 |     structure: Neural network structure tuple, e.g. (3, 3) would be a
33 |         3-layer neural network with assumed inputs.
34 |     -> Expected value difference for MaxEnt, DeepMaxEnt
35 |     """
36 | 
37 |     # Basic gist of what we're doing here: Get the reward function using our
38 |     # different IRL methods, use those to get a policy, evaluate that policy
39 |     # using the true reward, and then return the difference in expected values.
40 | 
41 |     # Setup parameters.
42 |     wind = 0.3
43 |     discount = 0.9
44 |     learning_rate = 0.01
45 |     trajectory_length = 3*grid_size
46 | 
47 |     # Make the gridworld and associated data.
48 |     gw = Gridworld(grid_size, wind, discount)
49 |     feature_matrix = gw.feature_matrix(feature_map)
50 |     ground_reward = np.array([gw.reward(i) for i in range(gw.n_states)])
51 |     optimal_policy = value_iteration.find_policy(gw.n_states,
52 |                                                  gw.n_actions,
53 |                                                  gw.transition_probability,
54 |                                                  ground_reward,
55 |                                                  discount).argmax(axis=1)
56 |     trajectories = gw.generate_trajectories(n_samples,
57 |                                             trajectory_length,
58 |                                             optimal_policy.take)
59 |     p_start_state = (np.bincount(trajectories[:, 0, 0], minlength=gw.n_states) /
60 |                      trajectories.shape[0])
61 | 
62 |     # True value.
63 |     optimal_V = value_iteration.optimal_value(gw.n_states,
64 |                                               gw.n_actions,
65 |                                               gw.transition_probability,
66 |                                               ground_reward, gw.discount)
67 | 
68 |     # MaxEnt reward; policy; value.
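    # The "MaxEnt" baseline is obtained by calling deep_maxent.irl with the
    # flat structure (feature_matrix.shape[1],), i.e. no hidden layers, so the
    # recovered reward is a linear function of the features (z-scored inside
    # irl).  The quantity reported below is the expected value difference
    #     EVD = optimal_V.dot(p_start_state) - maxent_V.dot(p_start_state)
    # (smaller is better).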
69 | maxent_reward = deep_maxent.irl((feature_matrix.shape[1],), 70 | feature_matrix, 71 | gw.n_actions, 72 | gw.discount, 73 | gw.transition_probability, 74 | trajectories, epochs, learning_rate) 75 | 76 | maxent_policy = value_iteration.find_policy(gw.n_states, 77 | gw.n_actions, 78 | gw.transition_probability, 79 | maxent_reward, 80 | discount).argmax(axis=1) 81 | maxent_V = value_iteration.value(maxent_policy, 82 | gw.n_states, 83 | gw.transition_probability, 84 | ground_reward, 85 | gw.discount) 86 | maxent_EVD = optimal_V.dot(p_start_state) - maxent_V.dot(p_start_state) 87 | 88 | # DeepMaxEnt reward; policy; value. 89 | deep_maxent_reward = deep_maxent.irl((feature_matrix.shape[1],)+structure, 90 | feature_matrix, 91 | gw.n_actions, 92 | gw.discount, 93 | gw.transition_probability, 94 | trajectories, epochs, learning_rate) 95 | deep_maxent_policy = value_iteration.find_policy(gw.n_states, 96 | gw.n_actions, 97 | gw.transition_probability, 98 | deep_maxent_reward, 99 | discount).argmax(axis=1) 100 | deep_maxent_V = value_iteration.value(deep_maxent_policy, 101 | gw.n_states, 102 | gw.transition_probability, 103 | ground_reward, 104 | gw.discount) 105 | deep_maxent_EVD = (optimal_V.dot(p_start_state) - 106 | deep_maxent_V.dot(p_start_state)) 107 | 108 | plt.subplot(3, 3, 1) 109 | plt.pcolor(ground_reward.reshape((grid_size, grid_size))) 110 | plt.title("Groundtruth reward") 111 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 112 | bottom=False, top=False, left=False, right=False, 113 | labelright=False) 114 | plt.subplot(3, 3, 2) 115 | plt.pcolor(maxent_reward.reshape((grid_size, grid_size))) 116 | plt.title("MaxEnt reward") 117 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 118 | bottom=False, top=False, left=False, right=False, 119 | labelright=False) 120 | plt.subplot(3, 3, 3) 121 | plt.pcolor(deep_maxent_reward.reshape((grid_size, grid_size))) 122 | plt.title("DeepMaxEnt reward") 123 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 124 | bottom=False, top=False, left=False, right=False, 125 | labelright=False) 126 | 127 | plt.subplot(3, 3, 4) 128 | plt.pcolor(optimal_policy.reshape((grid_size, grid_size)), vmin=0, vmax=3) 129 | plt.title("Optimal policy") 130 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 131 | bottom=False, top=False, left=False, right=False, 132 | labelright=False) 133 | plt.subplot(3, 3, 5) 134 | plt.pcolor(maxent_policy.reshape((grid_size, grid_size)), vmin=0, vmax=3) 135 | plt.title("MaxEnt policy") 136 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 137 | bottom=False, top=False, left=False, right=False, 138 | labelright=False) 139 | plt.subplot(3, 3, 6) 140 | plt.pcolor(deep_maxent_policy.reshape((grid_size, grid_size)), 141 | vmin=0, vmax=3) 142 | plt.title("DeepMaxEnt policy") 143 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 144 | bottom=False, top=False, left=False, right=False, 145 | labelright=False) 146 | 147 | plt.subplot(3, 3, 7) 148 | plt.pcolor(optimal_V.reshape((grid_size, grid_size))) 149 | plt.title("Optimal value") 150 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 151 | bottom=False, top=False, left=False, right=False, 152 | labelright=False) 153 | plt.subplot(3, 3, 8) 154 | plt.pcolor(maxent_V.reshape((grid_size, grid_size))) 155 | plt.title("MaxEnt value") 156 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 157 | bottom=False, top=False, left=False, right=False, 
158 | labelright=False) 159 | plt.subplot(3, 3, 9) 160 | plt.pcolor(deep_maxent_V.reshape((grid_size, grid_size))) 161 | plt.title("DeepMaxEnt value") 162 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 163 | bottom=False, top=False, left=False, right=False, 164 | labelright=False) 165 | plt.savefig("{}_{}_{}_{}gridworld{}.png".format(grid_size, feature_map, 166 | n_samples, epochs, structure, np.random.randint(10000000))) 167 | 168 | 169 | return maxent_EVD, deep_maxent_EVD 170 | 171 | def test_ow_once(grid_size, n_objects, n_colours, discrete, l1, l2, n_samples, 172 | epochs, structure): 173 | """ 174 | Test MaxEnt and DeepMaxEnt on a ow of size grid_size with the feature 175 | map feature_map with n_samples paths. 176 | 177 | grid_size: Grid size. int. 178 | n_objects: Number of objects. int. 179 | n_colours: Number of colours. int. 180 | discrete: Whether the features should be discrete. bool. 181 | l1: L1 regularisation. float. 182 | l2: L2 regularisation. float. 183 | n_samples: Number of paths to sample. 184 | epochs: Number of epochs to run MaxEnt with. 185 | structure: Neural network structure tuple, e.g. (3, 3) would be a 186 | 3-layer neural network with assumed inputs. 187 | -> Expected value difference for MaxEnt, DeepMaxEnt 188 | """ 189 | 190 | # Basic gist of what we're doing here: Get the reward function using our 191 | # different IRL methods, use those to get a policy, evaluate that policy 192 | # using the true reward, and then return the difference in expected values. 193 | 194 | # Setup parameters. 195 | wind = 0.3 196 | discount = 0.9 197 | learning_rate = 0.01 198 | trajectory_length = 3*grid_size 199 | 200 | # Make the objectworld and associated data. 201 | ow = Objectworld(grid_size, n_objects, n_colours, wind, discount) 202 | feature_matrix = ow.feature_matrix(discrete) 203 | ground_reward = np.array([ow.reward(i) for i in range(ow.n_states)]) 204 | optimal_policy = value_iteration.find_policy(ow.n_states, 205 | ow.n_actions, 206 | ow.transition_probability, 207 | ground_reward, 208 | discount).argmax(axis=1) 209 | trajectories = ow.generate_trajectories(n_samples, 210 | trajectory_length, 211 | optimal_policy.take) 212 | p_start_state = (np.bincount(trajectories[:, 0, 0], minlength=ow.n_states) / 213 | trajectories.shape[0]) 214 | 215 | # True value. 216 | optimal_V = value_iteration.optimal_value(ow.n_states, 217 | ow.n_actions, 218 | ow.transition_probability, 219 | ground_reward, ow.discount) 220 | 221 | # MaxEnt reward; policy; value. 222 | maxent_reward = deep_maxent.irl((feature_matrix.shape[1],), 223 | feature_matrix, 224 | ow.n_actions, 225 | ow.discount, 226 | ow.transition_probability, 227 | trajectories, epochs, learning_rate, 228 | l1=l1, l2=l2) 229 | 230 | maxent_policy = value_iteration.find_policy(ow.n_states, 231 | ow.n_actions, 232 | ow.transition_probability, 233 | maxent_reward, 234 | discount).argmax(axis=1) 235 | maxent_V = value_iteration.value(maxent_policy, 236 | ow.n_states, 237 | ow.transition_probability, 238 | ground_reward, 239 | ow.discount) 240 | maxent_EVD = optimal_V.dot(p_start_state) - maxent_V.dot(p_start_state) 241 | 242 | # DeepMaxEnt reward; policy; value. 243 | deep_learning_rate = 0.005 # For the 32 x 32 experiments. 
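    # (feature_matrix.shape[1],) + structure prepends the input dimensionality,
    # so e.g. structure=(3, 3) gives a network of shape (D, 3, 3): D inputs and
    # two hidden layers of width 3, matching the wording in the docstrings.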
244 | deep_maxent_reward = deep_maxent.irl((feature_matrix.shape[1],)+structure, 245 | feature_matrix, 246 | ow.n_actions, 247 | ow.discount, 248 | ow.transition_probability, 249 | trajectories, epochs, 250 | deep_learning_rate, 251 | l1=l1, l2=l2) 252 | 253 | deep_maxent_policy = value_iteration.find_policy(ow.n_states, 254 | ow.n_actions, 255 | ow.transition_probability, 256 | deep_maxent_reward, 257 | discount).argmax(axis=1) 258 | deep_maxent_V = value_iteration.value(deep_maxent_policy, 259 | ow.n_states, 260 | ow.transition_probability, 261 | ground_reward, 262 | ow.discount) 263 | 264 | deep_maxent_EVD = (optimal_V.dot(p_start_state) - 265 | deep_maxent_V.dot(p_start_state)) 266 | 267 | plt.subplot(3, 3, 1) 268 | plt.pcolor(ground_reward.reshape((grid_size, grid_size))) 269 | plt.title("Groundtruth reward") 270 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 271 | bottom=False, top=False, left=False, right=False, labelright=False) 272 | plt.subplot(3, 3, 2) 273 | plt.pcolor(maxent_reward.reshape((grid_size, grid_size))) 274 | plt.title("MaxEnt reward") 275 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 276 | bottom=False, top=False, left=False, right=False, labelright=False) 277 | plt.subplot(3, 3, 3) 278 | plt.pcolor(deep_maxent_reward.reshape((grid_size, grid_size))) 279 | plt.title("DeepMaxEnt reward") 280 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 281 | bottom=False, top=False, left=False, right=False, labelright=False) 282 | 283 | plt.subplot(3, 3, 4) 284 | plt.pcolor(optimal_policy.reshape((grid_size, grid_size)), vmin=0, vmax=3) 285 | plt.title("Optimal policy") 286 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 287 | bottom=False, top=False, left=False, right=False, labelright=False) 288 | plt.subplot(3, 3, 5) 289 | plt.pcolor(maxent_policy.reshape((grid_size, grid_size)), vmin=0, vmax=3) 290 | plt.title("MaxEnt policy") 291 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 292 | bottom=False, top=False, left=False, right=False, labelright=False) 293 | plt.subplot(3, 3, 6) 294 | plt.pcolor(deep_maxent_policy.reshape((grid_size, grid_size)), 295 | vmin=0, vmax=3) 296 | plt.title("DeepMaxEnt policy") 297 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 298 | bottom=False, top=False, left=False, right=False, labelright=False) 299 | 300 | plt.subplot(3, 3, 7) 301 | plt.pcolor(optimal_V.reshape((grid_size, grid_size))) 302 | plt.title("Optimal value") 303 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 304 | bottom=False, top=False, left=False, right=False, labelright=False) 305 | plt.subplot(3, 3, 8) 306 | plt.pcolor(maxent_V.reshape((grid_size, grid_size))) 307 | plt.title("MaxEnt value") 308 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 309 | bottom=False, top=False, left=False, right=False, labelright=False) 310 | plt.subplot(3, 3, 9) 311 | plt.pcolor(deep_maxent_V.reshape((grid_size, grid_size))) 312 | plt.title("DeepMaxEnt value") 313 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 314 | bottom=False, top=False, left=False, right=False, labelright=False) 315 | plt.savefig("{}_{}_{}_{}_{}_{}_{}_{}_{}_objectworld_{}.png".format( 316 | grid_size, n_objects, n_colours, discrete, n_samples, epochs, structure, 317 | l1, l2, np.random.randint(10000000))) 318 | 319 | return maxent_EVD, deep_maxent_EVD 320 | 321 | def test_gw_over_samples(grid_size, feature_map, epochs, structure, 
n): 322 | """ 323 | Test MaxEnt and DeepMaxEnt on a gridworld of size grid_size with the feature 324 | map feature_map with different numbers of paths. 325 | 326 | grid_size: Grid size. int. 327 | feature_map: Which feature map to use. String in {ident, coord, proxi}. 328 | epochs: MaxEnt iterations. int. 329 | structure: Neural network structure tuple, e.g. (3, 3) would be a 330 | 3-layer neural network with assumed inputs. 331 | n: Iterations. int. 332 | -> (MaxEnt [(n_samples, mean expected value difference, stdev)], 333 | DeepMaxEnt [(n_samples, mean expected value difference, stdev)]), 334 | raw data (maxent_data, deep_maxent_data) 335 | """ 336 | 337 | maxent_data = [] 338 | deep_maxent_data = [] 339 | for n_samples in (32,): 340 | t = time() 341 | maxent_EVDs = [] 342 | deep_maxent_EVDs = [] 343 | for i in range(n): 344 | print("{}: {}/{}".format(n_samples, i+1, n)) 345 | maxent_EVD, deep_maxent_EVD = test_gw_once(grid_size, feature_map, 346 | n_samples, epochs, 347 | structure) 348 | maxent_EVDs.append(maxent_EVD) 349 | deep_maxent_EVDs.append(deep_maxent_EVD) 350 | print(maxent_EVD, deep_maxent_EVD) 351 | stdout.flush() 352 | maxent_data.append((n_samples, np.mean(maxent_EVDs), 353 | np.std(maxent_EVDs))) 354 | deep_maxent_data.append((n_samples, np.mean(deep_maxent_EVDs), 355 | np.std(deep_maxent_EVDs))) 356 | print("{} (took {:.02}s)".format(n_samples, time() - t)) 357 | print("MaxEnt:", maxent_data) 358 | print("DeepMaxEnt:", deep_maxent_data) 359 | return maxent_data, deep_maxent_data 360 | 361 | def test_ow_over_samples(grid_size, n_objects, n_colours, discrete, l1, l2, 362 | epochs, structure, n): 363 | """ 364 | Test MaxEnt and DeepMaxEnt on an objectworld with different numbers of paths. 365 | 366 | grid_size: Grid size. int. 367 | n_objects: Number of objects. int. 368 | n_colours: Number of colours. int. 369 | discrete: Whether the features should be discrete. bool. 370 | feature_map: Which feature map to use. String in {ident, coord, proxi}. 371 | l1: L1 regularisation. float. 372 | l2: L2 regularisation. float. 373 | epochs: MaxEnt iterations. int. 374 | structure: Neural network structure tuple, e.g. (3, 3) would be a 375 | 3-layer neural network with assumed inputs. 376 | n: Iterations. int. 377 | -> (MaxEnt [(n_samples, mean expected value difference, stdev)], 378 | DeepMaxEnt [(n_samples, mean expected value difference, stdev)]), 379 | raw data (maxent_data, deep_maxent_data) 380 | """ 381 | 382 | maxent_data = [] 383 | deep_maxent_data = [] 384 | for n_samples in (32, 16, 8, 4): 385 | t = time() 386 | maxent_EVDs = [] 387 | deep_maxent_EVDs = [] 388 | for i in range(n): 389 | print("{}: {}/{}".format(n_samples, i+1, n)) 390 | maxent_EVD, deep_maxent_EVD = test_ow_once(grid_size, n_objects, 391 | n_colours, discrete, l1, l2, n_samples, epochs, structure) 392 | maxent_EVDs.append(maxent_EVD) 393 | deep_maxent_EVDs.append(deep_maxent_EVD) 394 | print(maxent_EVD, deep_maxent_EVD) 395 | stdout.flush() 396 | maxent_data.append((n_samples, np.mean(maxent_EVDs), 397 | np.median(maxent_EVDs), np.std(maxent_EVDs))) 398 | deep_maxent_data.append((n_samples, np.mean(deep_maxent_EVDs), 399 | np.median(deep_maxent_EVDs), np.std(deep_maxent_EVDs))) 400 | print("{} (took {:.02}s)".format(n_samples, time() - t)) 401 | print("MaxEnt:", maxent_data) 402 | print("DeepMaxEnt:", deep_maxent_data) 403 | return maxent_data, deep_maxent_data 404 | 405 | if __name__ == '__main__': 406 | # Tests the 16 x 16 objectworld. 
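    # (For a quick smoke test, a much smaller configuration such as
    #     test_ow_over_samples(8, 10, 2, False, 0, 0, 20, (3, 3), 1)
    # runs far faster; those parameter values are illustrative only.)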
407 |     print(test_ow_over_samples(16, 25, 2, False, 0, 0, 150, (3, 3), 10))
408 |     # Tests the 32 x 32 objectworld.
409 |     print(test_ow_over_samples(32, 50, 2, False, 0, 0, 250, (3, 3), 5))
--------------------------------------------------------------------------------
/hierarchicalrl/options_grid_world.py:
--------------------------------------------------------------------------------
1 | """
2 | Implements the options gridworld MDP.
3 | 
4 | Todor Davchev, 2017
5 | t.b.davchev@ed.ac.uk
6 | """
7 | 
8 | import numpy as np
9 | import numpy.random as rn
10 | 
11 | 
12 | class Large_Gridworld(object):
13 |     """
14 |     Gridworld MDP.
15 |     """
16 | 
17 |     def __init__(self, grid_size, walls, options, rooms, wind, discount):
18 |         """
19 |         grid_size: Grid size. int.
20 |         wind: Chance of moving randomly. float.
21 |         discount: MDP discount. float.
22 |         -> Gridworld
23 |         """
24 |         self.count = 0
25 |         self.actions = ((1, 0), (0, 1), (-1, 0), (0, -1))
26 |         self.options = options
27 |         self.rooms = rooms
28 |         self.n_actions = len(self.actions)
29 |         self.n_states = grid_size**2
30 |         self.grid_size = grid_size
31 |         self.wind = wind
32 |         self.discount = discount
33 |         self.walls = walls
34 |         self.init_states = [(5, 2), (1, 5), (8, 6), (5, 9)]
35 |         self.term_states = [(5, 2), (1, 5), (8, 6), (5, 9)]
36 |         self.n_options = len(self.options)
37 |         self.k_length = 10
38 |         self.gamma = 0.9
39 |         self.count = 0
40 |         # Preconstruct the transition probability array.
41 |         # self.transition_probability = np.array(
42 |         #     [[[self._transition_probability(i, j, k)
43 |         #        for k in range(self.n_states)]
44 |         #       for j in range(self.n_actions)]
45 |         #      for i in range(self.n_states)])
46 | 
47 |         # I think this is wrong
48 |         # self.reward_o = np.array(
49 |         #     [self._reward_o(option)
50 |         #      for option in self.options[:2]]
51 |         # )
52 | 
53 |         # Preconstruct the transition probability array.
54 |         self.improved_transition_probability = np.array(
55 |             [[[[self._improved_transition_probability(o, i, j, k)
56 |                 for k in range(self.n_states)]
57 |                for j in range(self.n_actions)]
58 |               for i in range(self.n_states)]
59 |              for o in range(self.n_options)])
60 |         # for o in range(self.n_options)])
61 | 
62 |         # Preconstruct the transition probability array.
63 |         # after done, initial states are all within a room
64 |         # should factor in sudden change of option ? - yes
65 |         # self.options_transition_probability = np.array(
66 |         #     [[[self._options_transition_probability(i, j, k)
67 |         #        for k in self.init_states]
68 |         #       for j in range(self.n_options)]
69 |         #      for i in self.init_states])
70 |         states_per_option = [[state for state in self.rooms[opt["room"]]] for opt in self.options]
71 |         self.options_transition_probability = np.zeros((121, 8, 121))
72 |         for opt in self.options:
73 |             print opt["id"]
74 |             if opt["id"] == 4:
75 |                 print "starting option 4"
76 |             za = [
77 |                 [
78 |                     self._options_transition_probability(i, opt["id"], k)
79 |                     for k in states_per_option[opt["id"]]
80 |                 ]
81 |                 for i in states_per_option[opt["id"]]
82 |             ]
83 |             print "computed transition prob"
84 |             for br, state in enumerate(states_per_option[opt["id"]]):
85 |                 for br_2, state_k in enumerate(states_per_option[opt["id"]]):
86 |                     self.options_transition_probability[state][opt["id"]][state_k] = za[br][br_2]
87 | 
88 |             print "exiting option.."
89 |         print "Done."
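        # options_transition_probability has shape
        # (n_states, n_options, n_states), hard-coded here as 121 x 8 x 121.
        # Entry [s, o, s'] is gamma**k, where k is the number of intra-option
        # steps from s to option o's termination state s'; it is zero whenever
        # s' is not that termination state or s lies outside option o's room.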
90 | 91 | # self.options_transition_probability = np.array( 92 | # [[[self._options_transition_probability(i, o, k) 93 | # for k in states_per_option]#[term for term in self.term_states]] 94 | # for o in range(self.n_options)] 95 | # for i in states_per_option]) 96 | 97 | def __str__(self): 98 | return "Gridworld({}, {}, {})".format(self.grid_size, self.wind, 99 | self.discount) 100 | 101 | # def _reward_o(self, option): 102 | # option_states = self.rooms[option["room"]] 103 | # reward_o = [0 for _ in option_states] 104 | # for idx, state in enumerate(option_states): 105 | # k_step = 1 106 | # reward_o[idx] = self.reward(state) 107 | # sx, sy = self.int_to_point(state) 108 | # # current_state = (sx, sy) 109 | # while k_step < self.k_length and (sx, sy) != option["termination"]: 110 | # action = self.actions[self.intra_option_optimal_policy(self.point_to_int((sx, sy)), option["id"])] 111 | 112 | # if ((((sx + action[0], 113 | # sy + action[1]) == (5, 2) and option["id"] == 0) 114 | # or (sx + action[0], 115 | # sy + action[1]) == (1, 5) and option["id"] == 1) or 116 | # (0 <= sx + action[0] < 5 and#self.grid_size and 117 | # 0 <= sy + action[1] < 5)):#self.grid_size): 118 | # sx = sx + action[0] 119 | # sy = sy + action[1] 120 | 121 | # reward_o[idx] += self.reward(self.point_to_int((sx, sy))) * np.power(self.gamma, k_step) 122 | # k_step += 1 123 | # return reward_o 124 | 125 | def feature_vector(self, i, vec_size, feature_map="ident"): 126 | """ 127 | Get the feature vector associated with a state integer. 128 | 129 | i: State int. 130 | feature_map: Which feature map to use (default ident). String in {ident, 131 | coord, proxi}. 132 | -> Feature vector. 133 | """ 134 | 135 | if feature_map == "coord": 136 | f = np.zeros(self.grid_size) 137 | x, y = i % self.grid_size, i // self.grid_size 138 | f[x] += 1 139 | f[y] += 1 140 | return f 141 | if feature_map == "proxi": 142 | f = np.zeros(self.n_states) 143 | x, y = i % self.grid_size, i // self.grid_size 144 | for b in range(self.grid_size): 145 | for a in range(self.grid_size): 146 | dist = abs(x - a) + abs(y - b) 147 | f[self.point_to_int((a, b))] = dist 148 | return f 149 | # Assume identity map. 150 | f = np.zeros(vec_size) 151 | f[i] = 1 152 | return f 153 | 154 | def o_feature_matrix(self, feature_map="ident"): 155 | """ 156 | Get the feature matrix for this gridworld. 157 | 158 | feature_map: Which feature map to use (default ident). String in {ident, 159 | coord, proxi}. 160 | -> NumPy array with shape (n_states, d_states). 161 | """ 162 | features = [] 163 | for n in range(self.n_options): 164 | f = self.feature_vector(n, self.n_options, feature_map) 165 | features.append(f) 166 | return np.array(features) 167 | 168 | def feature_matrix(self, feature_map="ident"): 169 | """ 170 | Get the feature matrix for this gridworld. 171 | 172 | feature_map: Which feature map to use (default ident). String in {ident, 173 | coord, proxi}. 174 | -> NumPy array with shape (n_states, d_states). 
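
        Note that, unlike the flat gridworld, the result is indexed by option
        first: for the default "ident" map, entry o is an identity matrix of
        size len(self.rooms[self.options[o]["room"]]), one row per state in
        that option's room.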
175 | """ 176 | # [ 177 | # np.zeros(len(self.rooms[self.options[o]["room"]])) 178 | # for o in range(self.options)] 179 | features = [ 180 | [np.zeros(len(self.rooms[self.options[o]["room"]])) 181 | for __ in range(len(self.rooms[self.options[o]["room"]]))] 182 | for o in range(len(self.options))] 183 | # features = np.reshape(features, [len(self.options), self.n_states, self.n_states]) 184 | for o in range(len(self.options)): 185 | for n in range(len(features[o])): 186 | # if self.int_to_point(n) not in self.walls: # redundant 187 | # idx = [br for br, room in enumerate(self.rooms) if n in room][0] 188 | # f = self.feature_vector(n, feature_map) 189 | # features[idx][n] = f 190 | f = self.feature_vector(n, len(self.rooms[self.options[o]["room"]]), feature_map) 191 | features[o][n] = f 192 | return np.array(features) 193 | 194 | def opt_to_point(self, i): 195 | """ 196 | Convert an option int into the corresponding coordinate. 197 | 198 | i: option int. 199 | -> (x, y) int tuple. 200 | """ 201 | 202 | return self.options[i]["init_set"] 203 | 204 | def point_to_opt(self, p): 205 | """ 206 | Convert a coordinate into the corresponding state options list. 207 | 208 | p: (x, y) tuple. 209 | -> State int. 210 | """ 211 | 212 | return [x for x in self.options if x["init_set"] == p] 213 | 214 | def int_to_point(self, i): 215 | """ 216 | Convert a state int into the corresponding coordinate. 217 | 218 | i: State int. 219 | -> (x, y) int tuple. 220 | """ 221 | 222 | return (i % self.grid_size, i // self.grid_size) 223 | 224 | def point_to_int(self, p): 225 | """ 226 | Convert a coordinate into the corresponding state int. 227 | 228 | p: (x, y) tuple. 229 | -> State int. 230 | """ 231 | 232 | return p[0] + p[1] * self.grid_size 233 | 234 | def isa_wall(self, i): 235 | """ 236 | Get whether a point is a wall or not. Returns True if wall. 237 | 238 | i: (x, y) int tuple. 239 | -> bool. 240 | """ 241 | 242 | return i in self.walls 243 | 244 | def neighbouring_option_states(self, i, k): 245 | """ 246 | Get whether two options neighbour each other. Also returns true if they 247 | are the same options. 248 | 249 | i: (x, y) int tuple. 250 | k: (x, y) int tuple. 251 | -> bool. 252 | """ 253 | 254 | return len([x for x in self.options if x["termination"] == i and x["init_set"] == k]) > 0 255 | 256 | def neighbouring(self, i, k): 257 | """ 258 | Get whether two points neighbour each other. Also returns true if they 259 | are the same point. 260 | 261 | i: (x, y) int tuple. 262 | k: (x, y) int tuple. 263 | -> bool. 264 | """ 265 | 266 | return abs(i[0] - k[0]) + abs(i[1] - k[1]) <= 1 267 | 268 | def insame_room(self, i, k): 269 | """ 270 | Get whether two points are in the same room. Also returns true if they 271 | are the same point. 272 | 273 | i: int. 274 | k: int. 275 | -> [room id]. 276 | """ 277 | 278 | item_one = np.asarray([br for br, x in enumerate([i in room for room in self.rooms]) if x]) 279 | item_two = [br for br, x in enumerate([k in room for room in self.rooms]) if x] 280 | mask = np.in1d(item_one, item_two) 281 | 282 | return item_one[mask] 283 | 284 | # return [item for item in np.in1d(item_one, item_two) if item] 285 | 286 | # def _options_transition_probability(self, o, i, j, k): 287 | # """ 288 | # Get the probability of transitioning from state i to state k given 289 | # action j. 290 | 291 | # maybe start with option_state, option, option_state 292 | # if possible to get there, if the option state is the actual goal 293 | # assign 1 - wind, otherwise it should be 50% ? 
294 | 295 | # i: Option State int. 296 | # j: Action int. 297 | # k: State int. 298 | # -> p(s_k | s_i, a_j) 299 | # """ 300 | # options_i = self.point_to_opt(i) 301 | # # option_id = [br for br, x in enumerate(self.init_states) if x == i][0] 302 | # option_action = self.options[j] 303 | # # option_kd = [br for br, x in enumerate(self.init_states) if x == k][0] 304 | # options_k = self.point_to_opt(k) 305 | 306 | # if i != option_action["init_set"]: 307 | # if i == k: 308 | # return 1.0 309 | # else: 310 | # return 0.0 311 | 312 | # if i == k: 313 | # return 0.0 314 | 315 | # if i == option_action["init_set"]: 316 | # if k == option_action["termination"]: 317 | # return 1 - self.wind 318 | 319 | # else: 320 | # s = [x for x in options_i if x["termination"] 321 | # != option_action["termination"]] 322 | # for option in s: 323 | # if option["termination"] == k: 324 | # return self.wind / len(s) 325 | 326 | # return 0.0 327 | 328 | def _options_transition_probability(self, i, o, k): 329 | """ 330 | Get the probability of transitioning from state i to state k given 331 | action j. 332 | 333 | maybe start with option_state, option, option_state 334 | if possible to get there, if the option state is the actual goal 335 | assign 1 - wind, otherwise it should be 50% ? 336 | 337 | i: Option State int. 338 | j: Action int. 339 | k: State int. 340 | -> p(s_k | s_i, a_j) 341 | """ 342 | 343 | xi, yi = self.int_to_point(i) 344 | xk, yk = self.int_to_point(k) 345 | 346 | if (xk, yk) != self.options[o]["termination"]: 347 | return 0 348 | 349 | room_no = np.asarray(self.insame_room(i, self.point_to_int((xk, yk)))) 350 | 351 | if self.options[o]["room"] not in room_no: 352 | return 0.0 353 | 354 | k_step = 0 355 | # current_state = (sx, sy) 356 | while (xi, yi) != self.options[o]["termination"]: 357 | action = self.actions[self.intra_option_optimal_policy( 358 | self.point_to_int((xi, yi)), self.options[o]["id"])] 359 | 360 | if (((xi + action[0], 361 | yi + action[1]) == self.options[o]["termination"]) or ( 362 | self.options[o]["min"][0] < xi + action[0] < self.options[o]["max"][0] and 363 | self.options[o]["min"][1] < yi + action[1] < self.options[o]["max"][1]) 364 | ): 365 | xi = xi + action[0] 366 | yi = yi + action[1] 367 | 368 | k_step += 1 369 | if k_step > 8: 370 | print "wtf" 371 | 372 | return np.power(self.gamma, k_step) 373 | 374 | def _improved_transition_probability(self, o, i, j, k): 375 | """ 376 | Get the probability of transitioning from state i to state k given 377 | action j. 378 | 379 | i: State int. 380 | j: Action int. 381 | k: State int. 382 | -> p(s_k | s_i, a_j) 383 | """ 384 | 385 | xi, yi = self.int_to_point(i) 386 | xj, yj = self.actions[j] 387 | xk, yk = self.int_to_point(k) 388 | 389 | room_no = np.asarray(self.insame_room(i, k)) 390 | 391 | if len(room_no) < 1: 392 | return 0.0 393 | 394 | if self.options[o]["room"] not in room_no: 395 | return 0.0 396 | 397 | if not self.neighbouring((xi, yi), (xk, yk)): 398 | return 0.0 399 | 400 | if self.isa_wall((xi, yi)): 401 | return 0.0 402 | 403 | if self.isa_wall((xk, yk)): 404 | return 0.0 405 | 406 | # Is k the intended state to move to? 407 | if (xi + xj, yi + yj) == (xk, yk): 408 | return 1 - self.wind + self.wind / self.n_actions 409 | 410 | # If these are not the same point, then we can move there by wind. 411 | if (xi, yi) != (xk, yk): 412 | return self.wind / self.n_actions 413 | 414 | # If these are the same point, we can only move here by either moving 415 | # off the grid or being blown off the grid. 
Are we on a corner or not? 416 | if (xi, yi) in {(0, 0), (self.grid_size - 1, self.grid_size - 1), 417 | (0, self.grid_size - 1), (self.grid_size - 1, 0), 418 | (4, 0), (6, 0), (6, 5), (4, 4), (0, 4), (0, 6), (4, 6), 419 | (10, 5), (6, 7), (10, 7), (4, 10), (6, 10)}: 420 | # Corner. 421 | # Can move off the edge in two directions. 422 | # Did we intend to move off the grid? 423 | if not ((0 <= xi + xj < self.grid_size and 424 | 0 <= yi + yj < self.grid_size) and 425 | not self.isa_wall((xi + xj, yi + yj))): 426 | # We intended to move off the grid, so we have the regular 427 | # success chance of staying here plus an extra chance of blowing 428 | # onto the *other* off-grid square. 429 | return 1 - self.wind + 2 * self.wind / self.n_actions 430 | else: 431 | # We can blow off the grid in either direction only by wind. 432 | return 2 * self.wind / self.n_actions 433 | elif (xi, yi) in {self.int_to_point(27), self.int_to_point(56), 434 | self.int_to_point(74), self.int_to_point(104)}: 435 | if not ((0 <= xi + xj < self.grid_size and 436 | 0 <= yi + yj < self.grid_size) and 437 | not self.isa_wall((xi + xj, yi + yj))): 438 | 439 | if (xi, yi) in self.init_states: 440 | return 1 - self.wind/self.n_actions 441 | # We intended to move off the grid, so we have the regular 442 | # success chance of staying here plus an extra chance of blowing 443 | # onto the *other* off-grid square. 444 | return 1 - self.wind + 2 * self.wind / self.n_actions 445 | else: 446 | if (xi, yi) in self.init_states: 447 | should_go = np.asarray(self.insame_room( 448 | self.point_to_int((xi, yi)), 449 | self.point_to_int((xi + xj, yi + yj)))) 450 | if len(should_go) > 0: 451 | if should_go[0] == o: 452 | return self.wind - self.wind / self.n_actions 453 | 454 | return 1 - self.wind/self.n_actions 455 | 456 | # We can blow off the grid in either direction only by wind. 457 | return 2 * self.wind / self.n_actions 458 | else: 459 | # Not a corner. Is it an edge? 460 | if (xi not in {0, self.grid_size - 1} and 461 | yi not in {0, self.grid_size - 1} and 462 | (xi, yi) not in { 463 | self.int_to_point(15), self.int_to_point( 464 | 37), self.int_to_point(17), 465 | self.int_to_point(39), self.int_to_point( 466 | 50), self.int_to_point(62), 467 | self.int_to_point(64), self.int_to_point( 468 | 86), self.int_to_point(84), 469 | self.int_to_point(94), self.int_to_point( 470 | 92), self.int_to_point(81), 471 | self.int_to_point(69), self.int_to_point( 472 | 68), self.int_to_point(46), 473 | self.int_to_point(47) 474 | } 475 | ): 476 | # Not an edge. 477 | return 0.0 478 | 479 | # Edge. 480 | # Can only move off the edge in one direction. 481 | # Did we intend to move off the grid? 482 | if not (0 <= xi + xj < self.grid_size and 483 | 0 <= yi + yj < self.grid_size and 484 | not self.isa_wall((xi + xj, yi + yj))): 485 | # We intended to move off the grid, so we have the regular 486 | # success chance of staying here. 487 | return 1 - self.wind + self.wind / self.n_actions 488 | else: 489 | # We can blow off the grid only by wind. 490 | return self.wind / self.n_actions 491 | 492 | def _transition_probability(self, i, j, k): 493 | """ 494 | Get the probability of transitioning from state i to state k given 495 | action j. 496 | 497 | i: State int. 498 | j: Action int. 499 | k: State int. 
500 | -> p(s_k | s_i, a_j) 501 | """ 502 | 503 | xi, yi = self.int_to_point(i) 504 | xj, yj = self.actions[j] 505 | xk, yk = self.int_to_point(k) 506 | 507 | if not self.neighbouring((xi, yi), (xk, yk)): 508 | return 0.0 509 | 510 | if self.isa_wall((xi, yi)): 511 | return 0.0 512 | 513 | if self.isa_wall((xk, yk)): 514 | return 0.0 515 | 516 | # Is k the intended state to move to? 517 | if (xi + xj, yi + yj) == (xk, yk): 518 | return 1 - self.wind + self.wind / self.n_actions 519 | 520 | # If these are not the same point, then we can move there by wind. 521 | if (xi, yi) != (xk, yk): 522 | return self.wind / self.n_actions 523 | 524 | # If these are the same point, we can only move here by either moving 525 | # off the grid or being blown off the grid. Are we on a corner or not? 526 | if (xi, yi) in {(0, 0), (self.grid_size - 1, self.grid_size - 1), 527 | (0, self.grid_size - 1), (self.grid_size - 1, 0), 528 | (4, 0), (6, 0), (6, 5), (4, 4), (0, 4), (0, 6), (4, 6), 529 | (10, 5), (6, 7), (10, 7), (4, 10), (6, 10)}: 530 | # Corner. 531 | # Can move off the edge in two directions. 532 | # Did we intend to move off the grid? 533 | if not ((0 <= xi + xj < self.grid_size and 534 | 0 <= yi + yj < self.grid_size) and 535 | not self.isa_wall((xi + xj, yi + yj))): 536 | # We intended to move off the grid, so we have the regular 537 | # success chance of staying here plus an extra chance of blowing 538 | # onto the *other* off-grid square. 539 | return 1 - self.wind + 2 * self.wind / self.n_actions 540 | else: 541 | # We can blow off the grid in either direction only by wind. 542 | return 2 * self.wind / self.n_actions 543 | elif (xi, yi) in {self.int_to_point(27), self.int_to_point(56), 544 | self.int_to_point(74), self.int_to_point(104)}: 545 | if not ((0 <= xi + xj < self.grid_size and 546 | 0 <= yi + yj < self.grid_size) and 547 | not self.isa_wall((xi + xj, yi + yj))): 548 | # We intended to move off the grid, so we have the regular 549 | # success chance of staying here plus an extra chance of blowing 550 | # onto the *other* off-grid square. 551 | return 1 - self.wind + 2 * self.wind / self.n_actions 552 | else: 553 | # We can blow off the grid in either direction only by wind. 554 | return 2 * self.wind / self.n_actions 555 | else: 556 | # Not a corner. Is it an edge? 557 | if (xi not in {0, self.grid_size - 1} and 558 | yi not in {0, self.grid_size - 1} and 559 | (xi, yi) not in { 560 | self.int_to_point(15), self.int_to_point( 561 | 37), self.int_to_point(17), 562 | self.int_to_point(39), self.int_to_point( 563 | 50), self.int_to_point(62), 564 | self.int_to_point(64), self.int_to_point( 565 | 86), self.int_to_point(84), 566 | self.int_to_point(94), self.int_to_point( 567 | 92), self.int_to_point(81), 568 | self.int_to_point(69), self.int_to_point( 569 | 68), self.int_to_point(46), 570 | self.int_to_point(47) 571 | } 572 | ): 573 | # Not an edge. 574 | return 0.0 575 | 576 | # Edge. 577 | # Can only move off the edge in one direction. 578 | # Did we intend to move off the grid? 579 | if not (0 <= xi + xj < self.grid_size and 580 | 0 <= yi + yj < self.grid_size and 581 | not self.isa_wall((xi + xj, yi + yj))): 582 | # We intended to move off the grid, so we have the regular 583 | # success chance of staying here. 584 | return 1 - self.wind + self.wind / self.n_actions 585 | else: 586 | # We can blow off the grid only by wind. 587 | return self.wind / self.n_actions 588 | 589 | # def reward(self, state_int, option): 590 | # """ 591 | # Reward for being in state state_int. 
592 | 
593 |     # state_int: State integer. int.
594 |     # -> Reward.
595 |     # """
596 |     # if option == 0 and state_int == 27:#self.n_states - 1: # self.point_to_int((8, 6)):
597 |     #     return 1
598 |     # elif option == 1 and state_int == 56:
599 |     #     return 1
600 |     # return 0
601 | 
602 |     def reward(self, state_int):
603 |         """
604 |         Reward for being in state state_int.
605 | 
606 |         state_int: State integer. int.
607 |         -> Reward.
608 |         """
609 |         if state_int == 74:
610 |             return 1
611 | 
612 |         return 0
613 | 
614 |     def opt_reward(self, opt_int):
615 |         """
616 |         Reward for taking option opt_int.
617 | 
618 |         opt_int: Option integer. int.
619 |         -> Reward.
620 |         """
621 |         if opt_int == 2:
622 |             return 1
623 | 
624 |         return 0
625 | 
626 |     def average_reward(self, n_trajectories, trajectory_length, policy):
627 |         """
628 |         Calculate the average total reward obtained by following a given policy
629 |         over n_trajectories trajectories.
630 | 
631 |         policy: Map from state integers to action integers.
632 |         n_trajectories: Number of trajectories. int.
633 |         trajectory_length: Length of an episode. int.
634 |         -> Average reward, standard deviation.
635 |         """
636 | 
637 |         trajectories = self.generate_trajectories(n_trajectories,
638 |                                                   trajectory_length, policy)
639 |         rewards = [[r for _, _, r in trajectory]
640 |                    for trajectory in trajectories]
641 |         rewards = np.array(rewards)
642 | 
643 |         # Add up all the rewards to find the total reward.
644 |         total_reward = rewards.sum(axis=1)
645 | 
646 |         # Return the average reward and standard deviation.
647 |         return total_reward.mean(), total_reward.std()
648 | 
649 |     def option_option_optimal_policy(self, option):
650 |         if option == 0:
651 |             return 2
652 |         if option == 1:
653 |             return 7
654 |         if option == 2:
655 |             return 2
656 |         if option == 3:
657 |             return 2
658 |         if option == 4:
659 |             return 5
660 |         if option == 5:
661 |             return 5
662 |         if option == 6:
663 |             return 7
664 |         if option == 7:
665 |             return 5
666 | 
667 |     def intra_option_optimal_policy(self, state_int, option):
668 |         """
669 |         The optimal policy for this gridworld under the given option.
670 | 
671 |         state_int: What state we are in. int.
672 |         Actions: {"right": 0, "down": 1, "left": 2, "up": 3}
673 |         -> Action int.
674 | """ 675 | sx, sy = self.int_to_point(state_int) 676 | 677 | if option == 0: 678 | if (sx, sy) in [(4, 0), (4, 1), (3, 1)]: 679 | return 1 680 | if (sx, sy) in [(3, 4), (4, 4), (4, 3), (1, 5)]: 681 | return 3 682 | if (sx, sy) in [(0, 0), (1, 0), (2, 0), (3, 0), (0, 1), (1, 1), (2, 1), 683 | (0, 2), (1, 2), (2, 2), (3, 2), (4, 2), (5, 2), 684 | (0, 3), (1, 3), (2, 3), (3, 3), (0, 4), (1, 4), (2, 4)]: 685 | return 0 686 | 687 | elif option == 1: 688 | if (sx, sy) in [(0, 3), (0, 4)]: 689 | return 0 690 | if (sx, sy) in [(5, 2), (4, 1), (3, 3), (4, 3), (2, 4), (3, 4), (4, 4)]: 691 | return 2 692 | if (sx, sy) in [(0, 0), (1, 0), (2, 0), (3, 0), (4, 0), 693 | (0, 1), (1, 1), (2, 1), (3, 1), 694 | (0, 2), (1, 2), (2, 2), (3, 2), (4, 2), 695 | (1, 3), (2, 3), (1, 4), (1, 5)]: 696 | return 1 697 | 698 | elif option == 2: 699 | if (sx, sy) in [(5, 2), (6, 2), (7, 2), (7, 3), (7, 4), (7, 5), (6, 5)]: 700 | return 0 701 | if (sx, sy) in [(9, 2), (10, 2), (10, 3), (10, 4), (10, 5), (9, 4), (9, 5)]: 702 | return 2 703 | if (sx, sy) in [(6, 0), (7, 0), (8, 0), (9, 0), (10, 0), 704 | (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (8, 2), 705 | (8, 3), (8, 4), (8, 5), (8, 6), (6, 3), (6, 4), (9, 3)]: 706 | return 1 707 | 708 | elif option == 3: 709 | if (sx, sy) in [(6, 5), (6, 4), (6, 3), 710 | (7, 5), (7, 4), (7, 3), 711 | (8, 5), (8, 4), (8, 3), (8, 6), 712 | (9, 5), (9, 4), (9, 3), 713 | (10, 5), (10, 4), (10, 3)]: 714 | return 3 715 | if (sx, sy) in [(5, 2), (6, 2), (7, 2), (8, 2), (9, 2), (10, 2)]: 716 | return 2 717 | if (sx, sy) in [(6, 0), (7, 0), (8, 0), (9, 0), (10, 0), 718 | (6, 1), (7, 1), (8, 1), (9, 1), (10, 1)]: 719 | return 1 720 | 721 | elif option == 4: 722 | if (sx, sy) in [(6, 7), (7, 7), (8, 7), (9, 7), (10, 7), 723 | (6, 8), (7, 8), (8, 8), (9, 8), (10, 8), (8, 6)]: 724 | return 1 725 | if (sx, sy) in [(5, 9), (6, 9), (7, 9), (8, 9), (9, 9), (10, 9)]: 726 | return 2 727 | if (sx, sy) in [(6, 10), (7, 10), (8, 10), (9, 10), (10, 10)]: 728 | return 3 729 | 730 | elif option == 5: 731 | if (sx, sy) in [(6, 7), (7, 7), 732 | (6, 8), (7, 8), (6, 9), (5, 9), (7, 9), (6, 10), (7, 10)]: 733 | return 0 734 | if (sx, sy) in [(9, 9), (10, 9), (9, 10), (10, 10), (9, 8), (10, 8), (9, 7), (10, 7)]: 735 | return 2 736 | if (sx, sy) in [(8, 10), (8, 9), (8, 8), (8, 7), (8, 6)]: 737 | return 3 738 | 739 | elif option == 6: 740 | if (sx, sy) in [(0, 6), (0, 7), (0, 8), (0, 9), (0, 10)]: 741 | return 0 742 | if (sx, sy) in [(2, 6), (3, 6), (4, 6), 743 | (2, 7), (3, 7), (4, 7), 744 | (2, 8), (3, 8), (4, 8), 745 | (2, 9), (3, 9), (4, 9), 746 | (2, 10), (3, 10), (4, 10), (5, 9)]: 747 | return 2 748 | if (sx, sy) in [(1, 10), (1, 9), (1, 8), (1, 7), (1, 6), (1, 5)]: 749 | return 3 750 | 751 | elif option == 7: 752 | if (sx, sy) in [(0, 9), (1, 9), (2, 9), (3, 9), (4, 9), (5, 9)]: 753 | return 0 754 | if (sx, sy) in [(1, 5), (1, 6), (1, 7), (1, 8), (0, 6), (0, 7), (0, 8), 755 | (2, 6), (2, 7), (2, 8), (3, 6), (3, 7), (3, 8), 756 | (4, 6), (4, 7), (4, 8)]: 757 | return 1 758 | if (sx, sy) in [(0, 10), (1, 10), (2, 10), (3, 10), (4, 10)]: 759 | return 3 760 | 761 | print state_int 762 | print option 763 | print "!!!!!" 764 | raise ValueError("Unexpected state.") 765 | 766 | def optimal_policy(self, state_int): 767 | """ 768 | The optimal policy for this gridworld. 769 | 770 | state_int: What state we are in. int. 771 | -> Action int. 
772 |         """
773 | 
774 |         sx, sy = self.int_to_point(state_int)
775 | 
776 |         if sx < self.grid_size and sy < self.grid_size:
777 |             return rn.randint(0, 2)
778 |         if sx < self.grid_size - 1:
779 |             return 0
780 |         if sy < self.grid_size - 1:
781 |             return 1
782 |         raise ValueError("Unexpected state.")
783 | 
784 |     def optimal_policy_deterministic(self, state_int):
785 |         """
786 |         Deterministic version of the optimal policy for this gridworld.
787 | 
788 |         state_int: What state we are in. int.
789 |         -> Action int.
790 |         """
791 | 
792 |         sx, sy = self.int_to_point(state_int)
793 |         if sx < sy:
794 |             return 0
795 |         return 1
796 | 
797 |     def generate_option_option_trajectories(self, trajectories, n_trajectories, option_policy, action_policy, random_start=False):
798 |         """
799 |         Generate n_trajectories option-level trajectories by chaining
800 |         intra-option trajectories, following the given policies.
801 |         trajectories: Intra-option demonstration trajectories, indexed by option id.
802 |         n_trajectories: Number of trajectories. int.
803 |         option_policy: Map from option integers to option integers.
804 |         action_policy: Map from (state int, option int) to action integers.
805 |         random_start: Whether to start randomly (default False). bool.
806 |         -> [[(states, option int, accumulated reward float)]]
807 |         """
808 | 
809 |         generated_trajectories = []
810 |         for _ in range(n_trajectories):
811 |             if random_start:
812 |                 sx, sy = rn.randint(self.grid_size), rn.randint(self.grid_size)
813 |             else:
814 |                 sx, sy = 0, 0
815 | 
816 |             trajectory = []
817 |             option = self.options[0]
818 |             path_idx = np.random.choice(len(trajectories[option["id"]]), 1)
819 |             local_path = trajectories[option["id"]][path_idx][0]
820 |             acc_reward = np.sum(local_path[:, 2])
821 |             states = local_path[:, 0]
822 |             state_int = local_path[-1, 0]
823 |             while self.int_to_point(state_int) != option["termination"]:
824 |                 new_point = self.generate_intra_option_trajectories(
825 |                     1, 2, action_policy, option,
826 |                     predefined_start=self.int_to_point(state_int))[0][1]
827 |                 states = np.append(states, new_point[0])
828 |                 state_int = states[-1]
829 | 
830 |             sx, sy = self.int_to_point(states[-1])
831 |             trajectory.append((states, option["id"], acc_reward))
832 |             for _ in range(len(trajectories[0][0])/2 - 1):
833 |                 states = []
834 |                 if rn.random() < self.wind:
835 |                     # save this for generalised variant
836 |                     # _available_options = [opt['id'] for opt in self.options if self.point_to_int((sx, sy)) in self.rooms[opt["room"]]]
837 |                     _available_options = [opt['id'] for opt in self.options if (sx, sy) == opt["init_set"]]
838 |                     option = self.options[np.random.choice(_available_options, 1)[0]]
839 |                 else:
840 |                     # Follow the given policy.
841 |                     option = self.options[option_policy(option["id"])]
842 | 
843 |                 if (sx, sy) != self.options[2]["termination"]:
844 |                     path_idx = np.random.choice(len(trajectories[option["id"]]), 1)
845 |                     local_path = trajectories[option["id"]][path_idx][0]
846 |                     acc_reward = np.sum(local_path[:, 2])
847 |                     states = local_path[:, 0]
848 |                     state_int = local_path[-1, 0]
849 |                     while self.int_to_point(state_int) != option["termination"]:
850 |                         new_point = self.generate_intra_option_trajectories(
851 |                             1, 2, action_policy, option,
852 |                             predefined_start=self.int_to_point(state_int))[0][1]
853 |                         states = np.append(states, new_point[0])
854 |                         acc_reward += new_point[2]
855 |                         state_int = states[-1]
856 | 
857 |                     sx, sy = self.int_to_point(states[-1])
858 |                     trajectory.append((states, option["id"], acc_reward))
859 | 
860 |             generated_trajectories.append(trajectory)
861 | 
862 |         return np.array(generated_trajectories)
863 | 
864 | 
865 |     def generate_intra_option_trajectories(self, n_trajectories, trajectory_length, policy,
866 |                                            option, predefined_start=None, random_start=False):
867 |         """
868 |         Generate n_trajectories intra-option trajectories of length
869 |         trajectory_length, following the given intra-option policy.
870 |         n_trajectories: Number of trajectories. int.
871 |         trajectory_length: Length of an episode. int.
872 |         policy: Map from (state int, option int) to action integers.
873 |         option: Option dict with "id", "termination", "min" and "max" keys.
874 |         predefined_start: Optional (x, y) start point (default None).
875 |         random_start: Whether to start randomly (default False). bool.
876 |         -> [[(state int, action int, reward float)]]
877 |         """
878 |         trajectories = []
879 |         for _ in range(n_trajectories):
880 |             if random_start:
881 |                 sx, sy = rn.randint(self.grid_size), rn.randint(self.grid_size)
882 |             elif predefined_start is not None:
883 |                 sx, sy = predefined_start
884 |             else:
885 |                 if option["id"] == 0 or option["id"] == 7:
886 |                     sx, sy = self.int_to_point(56)
887 |                 elif option["id"] == 2 or option["id"] == 1:
888 |                     sx, sy = self.int_to_point(27)
889 |                 elif option["id"] == 3 or option["id"] == 4:
890 |                     sx, sy = self.int_to_point(74)
891 |                 else:
892 |                     sx, sy = self.int_to_point(104)
893 | 
894 |             trajectory = []
895 |             for _ in range(trajectory_length):
896 |                 if rn.random() < self.wind:
897 |                     action = self.actions[rn.randint(0, 4)]
898 |                 else:
899 |                     # Follow the given policy.
900 |                     action = self.actions[policy(self.point_to_int((sx, sy)), option["id"])]
901 | 
902 |                 if (((sx + action[0],
903 |                       sy + action[1]) == option["termination"]) or (
904 |                         option["min"][0] < sx + action[0] < option["max"][0] and
905 |                         option["min"][1] < sy + action[1] < option["max"][1])
906 |                         ):
907 |                     next_sx = sx + action[0]
908 |                     next_sy = sy + action[1]
909 |                 else:
910 |                     next_sx = sx
911 |                     next_sy = sy
912 | 
913 |                 state_int = self.point_to_int((sx, sy))
914 |                 action_int = self.actions.index(action)
915 |                 next_state_int = self.point_to_int((next_sx, next_sy))
916 |                 reward = self.reward(next_state_int) # do not hardcode option
917 |                 trajectory.append((state_int, action_int, reward))
918 | 
919 |                 sx = next_sx
920 |                 sy = next_sy
921 | 
922 |             trajectories.append(trajectory)
923 | 
924 |         return np.array(trajectories)
925 | 
926 |     def generate_trajectories(self, n_trajectories, trajectory_length, policy,
927 |                               random_start=False):
928 |         """
929 |         Generate n_trajectories trajectories with length trajectory_length,
930 |         following the given policy.
931 | 
932 |         n_trajectories: Number of trajectories. int.
933 |         trajectory_length: Length of an episode. int.
934 |         policy: Map from state integers to action integers.
935 |         random_start: Whether to start randomly (default False). bool.
936 | -> [[(state int, action int, reward float)]] 937 | """ 938 | 939 | trajectories = [] 940 | for _ in range(n_trajectories): 941 | if random_start: 942 | sx, sy = rn.randint(self.grid_size), rn.randint(self.grid_size) 943 | else: 944 | sx, sy = 0, 0 945 | 946 | trajectory = [] 947 | for _ in range(trajectory_length): 948 | if rn.random() < self.wind: 949 | action = self.actions[rn.randint(0, 4)] 950 | else: 951 | # Follow the given policy. 952 | action = self.actions[policy(self.point_to_int((sx, sy)))] 953 | 954 | if (0 <= sx + action[0] < self.grid_size and 955 | 0 <= sy + action[1] < self.grid_size): 956 | next_sx = sx + action[0] 957 | next_sy = sy + action[1] 958 | else: 959 | next_sx = sx 960 | next_sy = sy 961 | 962 | state_int = self.point_to_int((sx, sy)) 963 | action_int = self.actions.index(action) 964 | next_state_int = self.point_to_int((next_sx, next_sy)) 965 | reward = self.reward(next_state_int)# do not hardcode option 966 | trajectory.append((state_int, action_int, reward)) 967 | 968 | sx = next_sx 969 | sy = next_sy 970 | 971 | trajectories.append(trajectory) 972 | 973 | return np.array(trajectories) 974 | --------------------------------------------------------------------------------
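
Note on the transition model above: _transition_probability uses the usual windy-gridworld convention — with probability 1 - wind the chosen action is executed, and with probability wind one of the n_actions actions is taken uniformly at random; any move that would leave the grid or enter a wall keeps the agent in place. The corner/edge branches and the hand-listed doorway cells exist to put that blocked probability mass back on the "stay" transition, so that each row of the transition matrix remains a probability distribution (the property the plain-gridworld tests fuzz-check). The snippet below is a minimal standalone sketch of that bookkeeping for a wall-free grid; the helper name wind_transition_row and its state encoding are illustrative assumptions, not code from this repository.

import numpy as np

# Standalone sketch (not repository code): one row of a wind-perturbed
# transition matrix for a wall-free grid, mirroring the stay-in-place
# bookkeeping used by _transition_probability above.
ACTIONS = [(1, 0), (0, 1), (-1, 0), (0, -1)]  # right, down, left, up

def wind_transition_row(state, action, grid_size, wind):
    """Return p(s' | s, a) over all grid_size**2 states."""
    n_actions = len(ACTIONS)
    sx, sy = state % grid_size, state // grid_size
    row = np.zeros(grid_size ** 2)

    def land(x, y):
        # Moves that would leave the grid collapse back onto the current cell.
        if 0 <= x < grid_size and 0 <= y < grid_size:
            return y * grid_size + x
        return sy * grid_size + sx

    ax, ay = ACTIONS[action]
    # The intended move succeeds unless the wind overrides it...
    row[land(sx + ax, sy + ay)] += 1 - wind
    # ...in which case a uniformly random action is taken instead, so the
    # intended neighbour ends up with 1 - wind + wind / n_actions in total.
    for rx, ry in ACTIONS:
        row[land(sx + rx, sy + ry)] += wind / n_actions
    return row

if __name__ == "__main__":
    rows = np.array([wind_transition_row(s, a, 5, 0.3)
                     for s in range(25) for a in range(4)])
    assert np.allclose(rows.sum(axis=1), 1.0)  # every row is a distribution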