├── irl ├── __init__.py ├── mdp │ ├── __init__.py │ ├── gridworld_test.py │ ├── objectworld.py │ └── gridworld.py ├── value_iteration.py ├── linear_irl.py ├── maxent.py └── deep_maxent.py ├── .gitignore ├── LICENSE ├── examples ├── lp_gridworld.py ├── maxent_gridworld.py ├── lp_large_gridworld.py ├── maxent_objectworld.py ├── deep_maxent_objectworld.py └── experiments.py └── README.md /irl/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /irl/mdp/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Matthew Alger 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /irl/mdp/gridworld_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unit tests for the gridworld MDP. 
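These tests import `gridworld` directly rather than through the `irl.mdp` package, so they are most easily run from inside irl/mdp/, e.g. `python gridworld_test.py`, which invokes the `unittest.main()` call at the bottom of this file.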
3 | 4 | Matthew Alger, 2016 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | import unittest 9 | 10 | import numpy as np 11 | import numpy.random as rn 12 | 13 | import gridworld 14 | 15 | 16 | def make_random_gridworld(): 17 | grid_size = rn.randint(2, 15) 18 | wind = rn.uniform(0.0, 1.0) 19 | discount = rn.uniform(0.0, 1.0) 20 | return gridworld.Gridworld(grid_size, wind, discount) 21 | 22 | 23 | class TestTransitionProbability(unittest.TestCase): 24 | """Tests for Gridworld.transition_probability.""" 25 | 26 | def test_sums_to_one(self): 27 | """Tests that the sum of transition probabilities is approximately 1.""" 28 | # This is a simple fuzz-test. 29 | for _ in range(40): 30 | gw = make_random_gridworld() 31 | self.assertTrue( 32 | np.isclose(gw.transition_probability.sum(axis=2), 1).all(), 33 | 'Probabilities don\'t sum to 1: {}'.format(gw)) 34 | 35 | def test_manual_sums_to_one(self): 36 | """Tests issue #1 on GitHub.""" 37 | gw = gridworld.Gridworld(5, 0.3, 0.2) 38 | self.assertTrue( 39 | np.isclose(gw.transition_probability.sum(axis=2), 1).all()) 40 | 41 | if __name__ == '__main__': 42 | unittest.main() -------------------------------------------------------------------------------- /examples/lp_gridworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run linear programming inverse reinforcement learning on the gridworld MDP. 3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | 11 | import irl.linear_irl as linear_irl 12 | import irl.mdp.gridworld as gridworld 13 | 14 | def main(grid_size, discount): 15 | """ 16 | Run linear programming inverse reinforcement learning on the gridworld MDP. 17 | 18 | Plots the reward function. 19 | 20 | grid_size: Grid size. int. 21 | discount: MDP discount factor. float. 22 | """ 23 | 24 | wind = 0.3 25 | trajectory_length = 3*grid_size 26 | 27 | gw = gridworld.Gridworld(grid_size, wind, discount) 28 | 29 | ground_r = np.array([gw.reward(s) for s in range(gw.n_states)]) 30 | policy = [gw.optimal_policy_deterministic(s) for s in range(gw.n_states)] 31 | r = linear_irl.irl(gw.n_states, gw.n_actions, gw.transition_probability, 32 | policy, gw.discount, 1, 5) 33 | 34 | plt.subplot(1, 2, 1) 35 | plt.pcolor(ground_r.reshape((grid_size, grid_size))) 36 | plt.colorbar() 37 | plt.title("Groundtruth reward") 38 | plt.subplot(1, 2, 2) 39 | plt.pcolor(r.reshape((grid_size, grid_size))) 40 | plt.colorbar() 41 | plt.title("Recovered reward") 42 | plt.show() 43 | 44 | if __name__ == '__main__': 45 | main(5, 0.2) 46 | -------------------------------------------------------------------------------- /examples/maxent_gridworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run maximum entropy inverse reinforcement learning on the gridworld MDP. 3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | 11 | import irl.maxent as maxent 12 | import irl.mdp.gridworld as gridworld 13 | 14 | def main(grid_size, discount, n_trajectories, epochs, learning_rate): 15 | """ 16 | Run maximum entropy inverse reinforcement learning on the gridworld MDP. 17 | 18 | Plots the reward function. 19 | 20 | grid_size: Grid size. int. 21 | discount: MDP discount factor. float. 22 | n_trajectories: Number of sampled trajectories. int. 23 | epochs: Gradient descent iterations. int. 24 | learning_rate: Gradient descent learning rate. float. 
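For example, the __main__ block at the bottom of this file runs main(5, 0.01, 20, 200, 0.01): a 5x5 grid, discount 0.01, 20 trajectories, 200 epochs and a learning rate of 0.01.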
25 | """ 26 | 27 | wind = 0.3 28 | trajectory_length = 3*grid_size 29 | 30 | gw = gridworld.Gridworld(grid_size, wind, discount) 31 | trajectories = gw.generate_trajectories(n_trajectories, 32 | trajectory_length, 33 | gw.optimal_policy) 34 | feature_matrix = gw.feature_matrix() 35 | ground_r = np.array([gw.reward(s) for s in range(gw.n_states)]) 36 | r = maxent.irl(feature_matrix, gw.n_actions, discount, 37 | gw.transition_probability, trajectories, epochs, learning_rate) 38 | 39 | plt.subplot(1, 2, 1) 40 | plt.pcolor(ground_r.reshape((grid_size, grid_size))) 41 | plt.colorbar() 42 | plt.title("Groundtruth reward") 43 | plt.subplot(1, 2, 2) 44 | plt.pcolor(r.reshape((grid_size, grid_size))) 45 | plt.colorbar() 46 | plt.title("Recovered reward") 47 | plt.show() 48 | 49 | if __name__ == '__main__': 50 | main(5, 0.01, 20, 200, 0.01) 51 | -------------------------------------------------------------------------------- /examples/lp_large_gridworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run large state space linear programming inverse reinforcement learning on the 3 | gridworld MDP. 4 | 5 | Matthew Alger, 2015 6 | matthew.alger@anu.edu.au 7 | """ 8 | 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | 12 | import irl.linear_irl as linear_irl 13 | import irl.mdp.gridworld as gridworld 14 | from irl.value_iteration import value 15 | 16 | def main(grid_size, discount): 17 | """ 18 | Run large state space linear programming inverse reinforcement learning on 19 | the gridworld MDP. 20 | 21 | Plots the reward function. 22 | 23 | grid_size: Grid size. int. 24 | discount: MDP discount factor. float. 25 | """ 26 | 27 | wind = 0.3 28 | trajectory_length = 3*grid_size 29 | 30 | gw = gridworld.Gridworld(grid_size, wind, discount) 31 | 32 | ground_r = np.array([gw.reward(s) for s in range(gw.n_states)]) 33 | policy = [gw.optimal_policy_deterministic(s) for s in range(gw.n_states)] 34 | 35 | # Need a value function for each basis function. 36 | feature_matrix = gw.feature_matrix() 37 | values = [] 38 | for dim in range(feature_matrix.shape[1]): 39 | reward = feature_matrix[:, dim] 40 | values.append(value(policy, gw.n_states, gw.transition_probability, 41 | reward, gw.discount)) 42 | values = np.array(values) 43 | 44 | r = linear_irl.large_irl(values, gw.transition_probability, 45 | feature_matrix, gw.n_states, gw.n_actions, policy) 46 | 47 | plt.subplot(1, 2, 1) 48 | plt.pcolor(ground_r.reshape((grid_size, grid_size))) 49 | plt.colorbar() 50 | plt.title("Groundtruth reward") 51 | plt.subplot(1, 2, 2) 52 | plt.pcolor(r.reshape((grid_size, grid_size))) 53 | plt.colorbar() 54 | plt.title("Recovered reward") 55 | plt.show() 56 | 57 | if __name__ == '__main__': 58 | main(10, 0.9) 59 | -------------------------------------------------------------------------------- /examples/maxent_objectworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run maximum entropy inverse reinforcement learning on the objectworld MDP. 3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | 11 | import irl.maxent as maxent 12 | import irl.mdp.objectworld as objectworld 13 | from irl.value_iteration import find_policy 14 | 15 | def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs, 16 | learning_rate): 17 | """ 18 | Run maximum entropy inverse reinforcement learning on the objectworld MDP. 
19 | 20 | Plots the reward function. 21 | 22 | grid_size: Grid size. int. 23 | discount: MDP discount factor. float. 24 | n_objects: Number of objects. int. 25 | n_colours: Number of colours. int. 26 | n_trajectories: Number of sampled trajectories. int. 27 | epochs: Gradient descent iterations. int. 28 | learning_rate: Gradient descent learning rate. float. 29 | """ 30 | 31 | wind = 0.3 32 | trajectory_length = 8 33 | 34 | ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind, 35 | discount) 36 | ground_r = np.array([ow.reward(s) for s in range(ow.n_states)]) 37 | policy = find_policy(ow.n_states, ow.n_actions, ow.transition_probability, 38 | ground_r, ow.discount, stochastic=False) 39 | trajectories = ow.generate_trajectories(n_trajectories, 40 | trajectory_length, 41 | lambda s: policy[s]) 42 | feature_matrix = ow.feature_matrix(discrete=False) 43 | r = maxent.irl(feature_matrix, ow.n_actions, discount, 44 | ow.transition_probability, trajectories, epochs, learning_rate) 45 | 46 | plt.subplot(1, 2, 1) 47 | plt.pcolor(ground_r.reshape((grid_size, grid_size))) 48 | plt.colorbar() 49 | plt.title("Groundtruth reward") 50 | plt.subplot(1, 2, 2) 51 | plt.pcolor(r.reshape((grid_size, grid_size))) 52 | plt.colorbar() 53 | plt.title("Recovered reward") 54 | plt.show() 55 | 56 | if __name__ == '__main__': 57 | main(10, 0.9, 15, 2, 20, 50, 0.01) 58 | -------------------------------------------------------------------------------- /examples/deep_maxent_objectworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run maximum entropy inverse reinforcement learning on the objectworld MDP. 3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | 11 | import irl.deep_maxent as deep_maxent 12 | import irl.mdp.objectworld as objectworld 13 | from irl.value_iteration import find_policy 14 | 15 | def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs, 16 | learning_rate, structure): 17 | """ 18 | Run deep maximum entropy inverse reinforcement learning on the objectworld 19 | MDP. 20 | 21 | Plots the reward function. 22 | 23 | grid_size: Grid size. int. 24 | discount: MDP discount factor. float. 25 | n_objects: Number of objects. int. 26 | n_colours: Number of colours. int. 27 | n_trajectories: Number of sampled trajectories. int. 28 | epochs: Gradient descent iterations. int. 29 | learning_rate: Gradient descent learning rate. float. 30 | structure: Neural network structure. Tuple of hidden layer dimensions, e.g., 31 | () is no neural network (linear maximum entropy) and (3, 4) is two 32 | hidden layers with dimensions 3 and 4. 
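For example, the __main__ block at the bottom of this file passes structure=(3, 3), i.e. two hidden layers of three units each.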
33 | """ 34 | 35 | wind = 0.3 36 | trajectory_length = 8 37 | l1 = l2 = 0 38 | 39 | ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind, 40 | discount) 41 | ground_r = np.array([ow.reward(s) for s in range(ow.n_states)]) 42 | policy = find_policy(ow.n_states, ow.n_actions, ow.transition_probability, 43 | ground_r, ow.discount, stochastic=False) 44 | trajectories = ow.generate_trajectories(n_trajectories, 45 | trajectory_length, 46 | lambda s: policy[s]) 47 | feature_matrix = ow.feature_matrix(discrete=False) 48 | r = deep_maxent.irl((feature_matrix.shape[1],) + structure, feature_matrix, 49 | ow.n_actions, discount, ow.transition_probability, trajectories, epochs, 50 | learning_rate, l1=l1, l2=l2) 51 | 52 | plt.subplot(1, 2, 1) 53 | plt.pcolor(ground_r.reshape((grid_size, grid_size))) 54 | plt.colorbar() 55 | plt.title("Groundtruth reward") 56 | plt.subplot(1, 2, 2) 57 | plt.pcolor(r.reshape((grid_size, grid_size))) 58 | plt.colorbar() 59 | plt.title("Recovered reward") 60 | plt.show() 61 | 62 | if __name__ == '__main__': 63 | main(10, 0.9, 15, 2, 20, 50, 0.01, (3, 3)) 64 | -------------------------------------------------------------------------------- /irl/value_iteration.py: -------------------------------------------------------------------------------- 1 | """ 2 | Find the value function associated with a policy. Based on Sutton & Barto, 1998. 3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | import numpy as np 9 | 10 | def value(policy, n_states, transition_probabilities, reward, discount, 11 | threshold=1e-2): 12 | """ 13 | Find the value function associated with a policy. 14 | 15 | policy: List of action ints for each state. 16 | n_states: Number of states. int. 17 | transition_probabilities: Function taking (state, action, state) to 18 | transition probabilities. 19 | reward: Vector of rewards for each state. 20 | discount: MDP discount factor. float. 21 | threshold: Convergence threshold, default 1e-2. float. 22 | -> Array of values for each state 23 | """ 24 | v = np.zeros(n_states) 25 | 26 | diff = float("inf") 27 | while diff > threshold: 28 | diff = 0 29 | for s in range(n_states): 30 | vs = v[s] 31 | a = policy[s] 32 | v[s] = sum(transition_probabilities[s, a, k] * 33 | (reward[k] + discount * v[k]) 34 | for k in range(n_states)) 35 | diff = max(diff, abs(vs - v[s])) 36 | 37 | return v 38 | 39 | def optimal_value(n_states, n_actions, transition_probabilities, reward, 40 | discount, threshold=1e-2): 41 | """ 42 | Find the optimal value function. 43 | 44 | n_states: Number of states. int. 45 | n_actions: Number of actions. int. 46 | transition_probabilities: Function taking (state, action, state) to 47 | transition probabilities. 48 | reward: Vector of rewards for each state. 49 | discount: MDP discount factor. float. 50 | threshold: Convergence threshold, default 1e-2. float. 51 | -> Array of values for each state 52 | """ 53 | 54 | v = np.zeros(n_states) 55 | 56 | diff = float("inf") 57 | while diff > threshold: 58 | diff = 0 59 | for s in range(n_states): 60 | max_v = float("-inf") 61 | for a in range(n_actions): 62 | tp = transition_probabilities[s, a, :] 63 | max_v = max(max_v, np.dot(tp, reward + discount*v)) 64 | 65 | new_diff = abs(v[s] - max_v) 66 | if new_diff > diff: 67 | diff = new_diff 68 | v[s] = max_v 69 | 70 | return v 71 | 72 | def find_policy(n_states, n_actions, transition_probabilities, reward, discount, 73 | threshold=1e-2, v=None, stochastic=True): 74 | """ 75 | Find the optimal policy. 
76 | 77 | n_states: Number of states. int. 78 | n_actions: Number of actions. int. 79 | transition_probabilities: Function taking (state, action, state) to 80 | transition probabilities. 81 | reward: Vector of rewards for each state. 82 | discount: MDP discount factor. float. 83 | threshold: Convergence threshold, default 1e-2. float. 84 | v: Value function (if known). Default None. 85 | stochastic: Whether the policy should be stochastic. Default True. 86 | -> Action probabilities for each state or action int for each state 87 | (depending on stochasticity). 88 | """ 89 | 90 | if v is None: 91 | v = optimal_value(n_states, n_actions, transition_probabilities, reward, 92 | discount, threshold) 93 | 94 | if stochastic: 95 | # Get Q using equation 9.2 from Ziebart's thesis. 96 | Q = np.zeros((n_states, n_actions)) 97 | for i in range(n_states): 98 | for j in range(n_actions): 99 | p = transition_probabilities[i, j, :] 100 | Q[i, j] = p.dot(reward + discount*v) 101 | Q -= Q.max(axis=1).reshape((n_states, 1)) # For numerical stability. 102 | Q = np.exp(Q)/np.exp(Q).sum(axis=1).reshape((n_states, 1)) 103 | return Q 104 | 105 | def _policy(s): 106 | return max(range(n_actions), 107 | key=lambda a: sum(transition_probabilities[s, a, k] * 108 | (reward[k] + discount * v[k]) 109 | for k in range(n_states))) 110 | policy = np.array([_policy(s) for s in range(n_states)]) 111 | return policy 112 | 113 | if __name__ == '__main__': 114 | # Quick unit test using gridworld. 115 | import mdp.gridworld as gridworld 116 | gw = gridworld.Gridworld(3, 0.3, 0.9) 117 | v = value([gw.optimal_policy_deterministic(s) for s in range(gw.n_states)], 118 | gw.n_states, 119 | gw.transition_probability, 120 | [gw.reward(s) for s in range(gw.n_states)], 121 | gw.discount) 122 | assert np.isclose(v, 123 | [5.7194282, 6.46706692, 6.42589811, 124 | 6.46706692, 7.47058224, 7.96505174, 125 | 6.42589811, 7.96505174, 8.19268666], 1).all() 126 | opt_v = optimal_value(gw.n_states, 127 | gw.n_actions, 128 | gw.transition_probability, 129 | [gw.reward(s) for s in range(gw.n_states)], 130 | gw.discount) 131 | assert np.isclose(v, opt_v).all() 132 | -------------------------------------------------------------------------------- /irl/mdp/objectworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements the objectworld MDP described in Levine et al. 2011. 3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | import math 9 | from itertools import product 10 | 11 | import numpy as np 12 | import numpy.random as rn 13 | 14 | from .gridworld import Gridworld 15 | 16 | class OWObject(object): 17 | """ 18 | Object in objectworld. 19 | """ 20 | 21 | def __init__(self, inner_colour, outer_colour): 22 | """ 23 | inner_colour: Inner colour of object. int. 24 | outer_colour: Outer colour of object. int. 25 | -> OWObject 26 | """ 27 | 28 | self.inner_colour = inner_colour 29 | self.outer_colour = outer_colour 30 | 31 | def __str__(self): 32 | """ 33 | A string representation of this object. 34 | 35 | -> __str__ 36 | """ 37 | 38 | return "<OWObject (In: {}) (Out: {})>".format(self.inner_colour, 39 | self.outer_colour) 40 | 41 | class Objectworld(Gridworld): 42 | """ 43 | Objectworld MDP. 44 | """ 45 | 46 | def __init__(self, grid_size, n_objects, n_colours, wind, discount): 47 | """ 48 | grid_size: Grid size. int. 49 | n_objects: Number of objects in the world. int. 50 | n_colours: Number of colours to colour objects with. int. 51 | wind: Chance of moving randomly. float. 52 | discount: MDP discount.
float. 53 | -> Objectworld 54 | """ 55 | 56 | super().__init__(grid_size, wind, discount) 57 | 58 | self.actions = ((1, 0), (0, 1), (-1, 0), (0, -1), (0, 0)) 59 | self.n_actions = len(self.actions) 60 | self.n_objects = n_objects 61 | self.n_colours = n_colours 62 | 63 | # Generate objects. 64 | self.objects = {} 65 | for _ in range(self.n_objects): 66 | obj = OWObject(rn.randint(self.n_colours), 67 | rn.randint(self.n_colours)) 68 | 69 | while True: 70 | x = rn.randint(self.grid_size) 71 | y = rn.randint(self.grid_size) 72 | 73 | if (x, y) not in self.objects: 74 | break 75 | 76 | self.objects[x, y] = obj 77 | 78 | # Preconstruct the transition probability array. 79 | self.transition_probability = np.array( 80 | [[[self._transition_probability(i, j, k) 81 | for k in range(self.n_states)] 82 | for j in range(self.n_actions)] 83 | for i in range(self.n_states)]) 84 | 85 | def feature_vector(self, i, discrete=True): 86 | """ 87 | Get the feature vector associated with a state integer. 88 | 89 | i: State int. 90 | discrete: Whether the feature vectors should be discrete (default True). 91 | bool. 92 | -> Feature vector. 93 | """ 94 | 95 | sx, sy = self.int_to_point(i) 96 | 97 | nearest_inner = {} # colour: distance 98 | nearest_outer = {} # colour: distance 99 | 100 | for y in range(self.grid_size): 101 | for x in range(self.grid_size): 102 | if (x, y) in self.objects: 103 | dist = math.hypot((x - sx), (y - sy)) 104 | obj = self.objects[x, y] 105 | if obj.inner_colour in nearest_inner: 106 | if dist < nearest_inner[obj.inner_colour]: 107 | nearest_inner[obj.inner_colour] = dist 108 | else: 109 | nearest_inner[obj.inner_colour] = dist 110 | if obj.outer_colour in nearest_outer: 111 | if dist < nearest_outer[obj.outer_colour]: 112 | nearest_outer[obj.outer_colour] = dist 113 | else: 114 | nearest_outer[obj.outer_colour] = dist 115 | 116 | # Need to ensure that all colours are represented. 117 | for c in range(self.n_colours): 118 | if c not in nearest_inner: 119 | nearest_inner[c] = 0 120 | if c not in nearest_outer: 121 | nearest_outer[c] = 0 122 | 123 | if discrete: 124 | state = np.zeros((2*self.n_colours*self.grid_size,)) 125 | i = 0 126 | for c in range(self.n_colours): 127 | for d in range(1, self.grid_size+1): 128 | if nearest_inner[c] < d: 129 | state[i] = 1 130 | i += 1 131 | if nearest_outer[c] < d: 132 | state[i] = 1 133 | i += 1 134 | assert i == 2*self.n_colours*self.grid_size 135 | assert (state >= 0).all() 136 | else: 137 | # Continuous features. 138 | state = np.zeros((2*self.n_colours)) 139 | i = 0 140 | for c in range(self.n_colours): 141 | state[i] = nearest_inner[c] 142 | i += 1 143 | state[i] = nearest_outer[c] 144 | i += 1 145 | 146 | return state 147 | 148 | def feature_matrix(self, discrete=True): 149 | """ 150 | Get the feature matrix for this objectworld. 151 | 152 | discrete: Whether the feature vectors should be discrete (default True). 153 | bool. 154 | -> NumPy array with shape (n_states, n_states). 155 | """ 156 | 157 | return np.array([self.feature_vector(i, discrete) 158 | for i in range(self.n_states)]) 159 | 160 | def reward(self, state_int): 161 | """ 162 | Get the reward for a state int. 163 | 164 | state_int: State int. 
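Following the objectworld construction implemented below, the reward is 1 if the state is within Manhattan distance 3 of an outer-colour-0 object and within distance 2 of an outer-colour-1 object, -1 if it is only within distance 3 of an outer-colour-0 object, and 0 otherwise.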
165 | -> reward float 166 | """ 167 | 168 | x, y = self.int_to_point(state_int) 169 | 170 | near_c0 = False 171 | near_c1 = False 172 | for (dx, dy) in product(range(-3, 4), range(-3, 4)): 173 | if 0 <= x + dx < self.grid_size and 0 <= y + dy < self.grid_size: 174 | if (abs(dx) + abs(dy) <= 3 and 175 | (x+dx, y+dy) in self.objects and 176 | self.objects[x+dx, y+dy].outer_colour == 0): 177 | near_c0 = True 178 | if (abs(dx) + abs(dy) <= 2 and 179 | (x+dx, y+dy) in self.objects and 180 | self.objects[x+dx, y+dy].outer_colour == 1): 181 | near_c1 = True 182 | 183 | if near_c0 and near_c1: 184 | return 1 185 | if near_c0: 186 | return -1 187 | return 0 188 | 189 | def generate_trajectories(self, n_trajectories, trajectory_length, policy): 190 | """ 191 | Generate n_trajectories trajectories with length trajectory_length. 192 | 193 | n_trajectories: Number of trajectories. int. 194 | trajectory_length: Length of an episode. int. 195 | policy: Map from state integers to action integers. 196 | -> [[(state int, action int, reward float)]] 197 | """ 198 | 199 | return super().generate_trajectories(n_trajectories, trajectory_length, 200 | policy, 201 | True) 202 | 203 | def optimal_policy(self, state_int): 204 | raise NotImplementedError( 205 | "Optimal policy is not implemented for Objectworld.") 206 | def optimal_policy_deterministic(self, state_int): 207 | raise NotImplementedError( 208 | "Optimal policy is not implemented for Objectworld.") 209 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Inverse Reinforcement Learning 2 | 3 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.555999.svg)](https://doi.org/10.5281/zenodo.555999) 4 | 5 | Implements selected inverse reinforcement learning (IRL) algorithms as part of COMP3710, supervised by Dr Mayank Daswani and Dr Marcus Hutter. My final report is available [here](https://alger.au/pdfs/irl.pdf) and describes the implemented algorithms. 6 | 7 | If you use this code in your work, you can cite it as follows: 8 | ```bibtex 9 | @misc{alger16, 10 | author = {Matthew Alger}, 11 | title = {Inverse Reinforcement Learning}, 12 | year = 2016, 13 | doi = {10.5281/zenodo.555999}, 14 | url = {https://doi.org/10.5281/zenodo.555999} 15 | } 16 | ``` 17 | 18 | ## Algorithms implemented 19 | 20 | - Linear programming IRL. From Ng & Russell, 2000. Small state space and large state space linear programming IRL. 21 | - Maximum entropy IRL. From Ziebart et al., 2008. 22 | - Deep maximum entropy IRL. From Wulfmeier et al., 2015; original derivation. 23 | 24 | Additionally, the following MDP domains are implemented: 25 | - Gridworld (Sutton, 1998) 26 | - Objectworld (Levine et al., 2011) 27 | 28 | ## Requirements 29 | - NumPy 30 | - SciPy 31 | - CVXOPT 32 | - Theano 33 | - MatPlotLib (for examples) 34 | 35 | ## Module documentation 36 | 37 | Following is a brief list of functions and classes exported by modules. Full documentation is included in the docstrings of each function or class; only functions and classes intended for use outside the module are documented here. 38 | 39 | ### linear_irl 40 | 41 | Implements linear programming inverse reinforcement learning (Ng & Russell, 2000). 42 | 43 | **Functions:** 44 | 45 | - `irl(n_states, n_actions, transition_probability, policy, discount, Rmax, l1)`: Find a reward function with inverse RL. 
46 | - `large_irl(value, transition_probability, feature_matrix, n_states, n_actions, policy)`: Find the reward in a large state space. 47 | 48 | ### maxent 49 | 50 | Implements maximum entropy inverse reinforcement learning (Ziebart et al., 2008). 51 | 52 | **Functions:** 53 | 54 | - `irl(feature_matrix, n_actions, discount, transition_probability, trajectories, epochs, learning_rate)`: Find the reward function for the given trajectories. 55 | - `find_svf(n_states, trajectories)`: Find the state visitation frequency from trajectories. 56 | - `find_feature_expectations(feature_matrix, trajectories)`: Find the feature expectations for the given trajectories. This is the average path feature vector. 57 | - `find_expected_svf(n_states, r, n_actions, discount, transition_probability, trajectories)`: Find the expected state visitation frequencies using algorithm 1 from Ziebart et al. 2008. 58 | - `expected_value_difference(n_states, n_actions, transition_probability, reward, discount, p_start_state, optimal_value, true_reward)`: Calculate the expected value difference, which is a proxy for how good a recovered reward function is. 59 | 60 | ### deep_maxent 61 | 62 | Implements deep maximum entropy inverse reinforcement learning based on Ziebart et al., 2008 and Wulfmeier et al., 2015, using symbolic methods with Theano. 63 | 64 | **Functions:** 65 | 66 | - `irl(structure, feature_matrix, n_actions, discount, transition_probability, trajectories, epochs, learning_rate, initialisation="normal", l1=0.1, l2=0.1)`: Find the reward function for the given trajectories. 67 | - `find_svf(n_states, trajectories)`: Find the state visitation frequency from trajectories. 68 | - `find_expected_svf(n_states, r, n_actions, discount, transition_probability, trajectories)`: Find the expected state visitation frequencies using algorithm 1 from Ziebart et al. 2008. 69 | 70 | ### value_iteration 71 | 72 | Find the value function associated with a policy. Based on Sutton & Barto, 1998. 73 | 74 | **Functions:** 75 | 76 | - `value(policy, n_states, transition_probabilities, reward, discount, threshold=1e-2)`: Find the value function associated with a policy. 77 | - `optimal_value(n_states, n_actions, transition_probabilities, reward, discount, threshold=1e-2)`: Find the optimal value function. 78 | - `find_policy(n_states, n_actions, transition_probabilities, reward, discount, threshold=1e-2, v=None, stochastic=True)`: Find the optimal policy. 79 | 80 | ### mdp 81 | 82 | #### gridworld 83 | 84 | Implements the gridworld MDP. 85 | 86 | **Classes, instance attributes, methods:** 87 | 88 | - `Gridworld(grid_size, wind, discount)`: Gridworld MDP. 89 | - `actions`: Tuple of (dx, dy) actions. 90 | - `n_actions`: Number of actions. int. 91 | - `n_states`: Number of states. int. 92 | - `grid_size`: Size of grid. int. 93 | - `wind`: Chance of moving randomly. float. 94 | - `discount`: MDP discount factor. float. 95 | - `transition_probability`: NumPy array with shape (n_states, n_actions, n_states) where `transition_probability[si, a, sk]` is the probability of transitioning from state si to state sk under action a. 96 | - `feature_vector(i, feature_map="ident")`: Get the feature vector associated with a state integer. 97 | - `feature_matrix(feature_map="ident")`: Get the feature matrix for this gridworld. 98 | - `int_to_point(i)`: Convert a state int into the corresponding coordinate.
99 | - `point_to_int(p)`: Convert a coordinate into the corresponding state int. 100 | - `neighbouring(i, k)`: Get whether two points neighbour each other. Also returns true if they are the same point. 101 | - `reward(state_int)`: Reward for being in state state_int. 102 | - `average_reward(n_trajectories, trajectory_length, policy)`: Calculate the average total reward obtained by following a given policy over n_paths paths. 103 | - `optimal_policy(state_int)`: The optimal policy for this gridworld. 104 | - `optimal_policy_deterministic(state_int)`: Deterministic version of the optimal policy for this gridworld. 105 | - `generate_trajectories(n_trajectories, trajectory_length, policy, random_start=False)`: Generate n_trajectories trajectories with length trajectory_length, following the given policy. 106 | 107 | #### objectworld 108 | 109 | Implements the objectworld MDP described in Levine et al. 2011. 110 | 111 | **Classes, instance attributes, methods:** 112 | 113 | - `OWObject(inner_colour, outer_colour)`: Object in objectworld. 114 | - `inner_colour`: Inner colour of object. int. 115 | - `outer_colour`: Outer colour of object. int. 116 | 117 | - `Objectworld(grid_size, n_objects, n_colours, wind, discount)`: Objectworld MDP. 118 | - `actions`: Tuple of (dx, dy) actions. 119 | - `n_actions`: Number of actions. int. 120 | - `n_states`: Number of states. int. 121 | - `grid_size`: Size of grid. int. 122 | - `n_objects`: Number of objects in the world. int. 123 | - `n_colours`: Number of colours to colour objects with. int. 124 | - `wind`: Chance of moving randomly. float. 125 | - `discount`: MDP discount factor. float. 126 | - `objects`: Set of objects in the world. 127 | - `transition_probability`: NumPy array with shape (n_states, n_actions, n_states) where `transition_probability[si, a, sk]` is the probability of transitioning from state si to state sk under action a. 128 | - `feature_vector(i, discrete=True)`: Get the feature vector associated with a state integer. 129 | - `feature_matrix(discrete=True)`: Get the feature matrix for this gridworld. 130 | - `int_to_point(i)`: Convert a state int into the corresponding coordinate. 131 | - `point_to_int(p)`: Convert a coordinate into the corresponding state int. 132 | - `neighbouring(i, k)`: Get whether two points neighbour each other. Also returns true if they are the same point. 133 | - `reward(state_int)`: Reward for being in state state_int. 134 | - `average_reward(n_trajectories, trajectory_length, policy)`: Calculate the average total reward obtained by following a given policy over n_paths paths. 135 | - `generate_trajectories(n_trajectories, trajectory_length, policy)`: Generate n_trajectories trajectories with length trajectory_length, following the given policy. 136 | -------------------------------------------------------------------------------- /irl/linear_irl.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements LP IRL from Ng & Russell, 2000. 3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | import random 9 | 10 | import numpy as np 11 | from cvxopt import matrix, solvers 12 | 13 | def irl(n_states, n_actions, transition_probability, policy, discount, Rmax, 14 | l1): 15 | """ 16 | Find a reward function with inverse RL as described in Ng & Russell, 2000. 17 | 18 | n_states: Number of states. int. 19 | n_actions: Number of actions. int. 
20 | transition_probability: NumPy array mapping (state_i, action, state_k) to 21 | the probability of transitioning from state_i to state_k under action. 22 | Shape (N, A, N). 23 | policy: Vector mapping state ints to action ints. Shape (N,). 24 | discount: Discount factor. float. 25 | Rmax: Maximum reward. float. 26 | l1: l1 regularisation. float. 27 | -> Reward vector 28 | """ 29 | 30 | A = set(range(n_actions)) # Set of actions to help manage reordering 31 | # actions. 32 | # The transition policy convention is different here to the rest of the code 33 | # for legacy reasons; here, we reorder axes to fix this. We expect the 34 | # new probabilities to be of the shape (A, N, N). 35 | transition_probability = np.transpose(transition_probability, (1, 0, 2)) 36 | 37 | def T(a, s): 38 | """ 39 | Shorthand for a dot product used a lot in the LP formulation. 40 | """ 41 | 42 | return np.dot(transition_probability[policy[s], s] - 43 | transition_probability[a, s], 44 | np.linalg.inv(np.eye(n_states) - 45 | discount*transition_probability[policy[s]])) 46 | 47 | # This entire function just computes the block matrices used for the LP 48 | # formulation of IRL. 49 | 50 | # Minimise c . x. 51 | c = -np.hstack([np.zeros(n_states), np.ones(n_states), 52 | -l1*np.ones(n_states)]) 53 | zero_stack1 = np.zeros((n_states*(n_actions-1), n_states)) 54 | T_stack = np.vstack([ 55 | -T(a, s) 56 | for s in range(n_states) 57 | for a in A - {policy[s]} 58 | ]) 59 | I_stack1 = np.vstack([ 60 | np.eye(1, n_states, s) 61 | for s in range(n_states) 62 | for a in A - {policy[s]} 63 | ]) 64 | I_stack2 = np.eye(n_states) 65 | zero_stack2 = np.zeros((n_states, n_states)) 66 | 67 | D_left = np.vstack([T_stack, T_stack, -I_stack2, I_stack2]) 68 | D_middle = np.vstack([I_stack1, zero_stack1, zero_stack2, zero_stack2]) 69 | D_right = np.vstack([zero_stack1, zero_stack1, -I_stack2, -I_stack2]) 70 | 71 | D = np.hstack([D_left, D_middle, D_right]) 72 | b = np.zeros((n_states*(n_actions-1)*2 + 2*n_states, 1)) 73 | bounds = np.array([(None, None)]*2*n_states + [(-Rmax, Rmax)]*n_states) 74 | 75 | # We still need to bound R. To do this, we just add 76 | # -I R <= Rmax 1 77 | # I R <= Rmax 1 78 | # So to D we need to add -I and I, and to b we need to add Rmax 1 and Rmax 1 79 | D_bounds = np.hstack([ 80 | np.vstack([ 81 | -np.eye(n_states), 82 | np.eye(n_states)]), 83 | np.vstack([ 84 | np.zeros((n_states, n_states)), 85 | np.zeros((n_states, n_states))]), 86 | np.vstack([ 87 | np.zeros((n_states, n_states)), 88 | np.zeros((n_states, n_states))])]) 89 | b_bounds = np.vstack([Rmax*np.ones((n_states, 1))]*2) 90 | D = np.vstack((D, D_bounds)) 91 | b = np.vstack((b, b_bounds)) 92 | A_ub = matrix(D) 93 | b = matrix(b) 94 | c = matrix(c) 95 | results = solvers.lp(c, A_ub, b) 96 | r = np.asarray(results["x"][:n_states], dtype=np.double) 97 | 98 | return r.reshape((n_states,)) 99 | 100 | def v_tensor(value, transition_probability, feature_dimension, n_states, 101 | n_actions, policy): 102 | """ 103 | Finds the v tensor used in large linear IRL. 104 | 105 | value: NumPy matrix for the value function. The (i, j)th component 106 | represents the value of the jth state under the ith basis function. 107 | transition_probability: NumPy array mapping (state_i, action, state_k) to 108 | the probability of transitioning from state_i to state_k under action. 109 | Shape (N, A, N). 110 | feature_dimension: Dimension of the feature matrix. int. 111 | n_states: Number of states sampled. int. 112 | n_actions: Number of actions. int. 
113 | policy: NumPy array mapping state ints to action ints. 114 | -> v helper tensor. 115 | """ 116 | 117 | v = np.zeros((n_states, n_actions-1, feature_dimension)) 118 | for i in range(n_states): 119 | a1 = policy[i] 120 | exp_on_policy = np.dot(transition_probability[i, a1], value.T) 121 | seen_policy_action = False 122 | for j in range(n_actions): 123 | # Skip this if it's the on-policy action. 124 | if a1 == j: 125 | seen_policy_action = True 126 | continue 127 | 128 | exp_off_policy = np.dot(transition_probability[i, j], value.T) 129 | if seen_policy_action: 130 | v[i, j-1] = exp_on_policy - exp_off_policy 131 | else: 132 | v[i, j] = exp_on_policy - exp_off_policy 133 | return v 134 | 135 | def large_irl(value, transition_probability, feature_matrix, n_states, 136 | n_actions, policy): 137 | """ 138 | Find the reward in a large state space. 139 | 140 | value: NumPy matrix for the value function. The (i, j)th component 141 | represents the value of the jth state under the ith basis function. 142 | transition_probability: NumPy array mapping (state_i, action, state_k) to 143 | the probability of transitioning from state_i to state_k under action. 144 | Shape (N, A, N). 145 | feature_matrix: Matrix with the nth row representing the nth state. NumPy 146 | array with shape (N, D) where N is the number of states and D is the 147 | dimensionality of the state. 148 | n_states: Number of states sampled. int. 149 | n_actions: Number of actions. int. 150 | policy: NumPy array mapping state ints to action ints. 151 | -> Reward for each state in states. 152 | """ 153 | 154 | D = feature_matrix.shape[1] 155 | 156 | # First, calculate v, which is just a helper tensor. 157 | v = v_tensor(value, transition_probability, D, n_states, n_actions, policy) 158 | 159 | # Now we can calculate c, G, h, A, and b. 160 | 161 | # x = [z y_i^+ y_i^- a], which is a [N (K-1)*N (K-1)*N D] vector. 162 | x_size = n_states + (n_actions-1)*n_states*2 + D 163 | 164 | # c is a big stack of ones and zeros; there's N ones and the rest is zero. 165 | c = -np.hstack([np.ones(n_states), np.zeros(x_size - n_states)]) 166 | assert c.shape[0] == x_size 167 | 168 | # A is [0 I_j -I_j -v^T_{ij}] and j NOT EQUAL TO policy(i). 169 | # I believe this is accounted for by the structure of v. 170 | A = np.hstack([ 171 | np.zeros((n_states*(n_actions-1), n_states)), 172 | np.eye(n_states*(n_actions-1)), 173 | -np.eye(n_states*(n_actions-1)), 174 | np.vstack([v[i, j].T for i in range(n_states) 175 | for j in range(n_actions-1)])]) 176 | assert A.shape[1] == x_size 177 | 178 | # b is just zeros! 179 | b = np.zeros(A.shape[0]) 180 | 181 | # Break G up into the bottom row and other rows to construct it. 
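# In the x = [z y+ y- a] layout noted above, the blocks assembled below give
# G x <= h with: a <= 1 and -a <= 1 (bounding each feature weight), then
# -y+ <= 0 and -y- <= 0 (keeping both slack blocks non-negative), and finally
# bottom_row, which adds one row per (state, off-policy action) pair tying
# z_l to the corresponding y+ and y- entries.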
182 | bottom_row = np.vstack([ 183 | np.hstack([ 184 | np.ones((n_actions-1, 1)).dot(np.eye(1, n_states, l)), 185 | np.hstack([-np.eye(n_actions-1) if i == l 186 | else np.zeros((n_actions-1, n_actions-1)) 187 | for i in range(n_states)]), 188 | np.hstack([2*np.eye(n_actions-1) if i == l 189 | else np.zeros((n_actions-1, n_actions-1)) 190 | for i in range(n_states)]), 191 | np.zeros((n_actions-1, D))]) 192 | for l in range(n_states)]) 193 | assert bottom_row.shape[1] == x_size 194 | G = np.vstack([ 195 | np.hstack([ 196 | np.zeros((D, n_states)), 197 | np.zeros((D, n_states*(n_actions-1))), 198 | np.zeros((D, n_states*(n_actions-1))), 199 | np.eye(D)]), 200 | np.hstack([ 201 | np.zeros((D, n_states)), 202 | np.zeros((D, n_states*(n_actions-1))), 203 | np.zeros((D, n_states*(n_actions-1))), 204 | -np.eye(D)]), 205 | np.hstack([ 206 | np.zeros((n_states*(n_actions-1), n_states)), 207 | -np.eye(n_states*(n_actions-1)), 208 | np.zeros((n_states*(n_actions-1), n_states*(n_actions-1))), 209 | np.zeros((n_states*(n_actions-1), D))]), 210 | np.hstack([ 211 | np.zeros((n_states*(n_actions-1), n_states)), 212 | np.zeros((n_states*(n_actions-1), n_states*(n_actions-1))), 213 | -np.eye(n_states*(n_actions-1)), 214 | np.zeros((n_states*(n_actions-1), D))]), 215 | bottom_row]) 216 | assert G.shape[1] == x_size 217 | 218 | h = np.vstack([np.ones((D*2, 1)), 219 | np.zeros((n_states*(n_actions-1)*2+bottom_row.shape[0], 1))]) 220 | 221 | from cvxopt import matrix, solvers 222 | c = matrix(c) 223 | G = matrix(G) 224 | h = matrix(h) 225 | A = matrix(A) 226 | b = matrix(b) 227 | results = solvers.lp(c, G, h, A, b) 228 | alpha = np.asarray(results["x"][-D:], dtype=np.double) 229 | return np.dot(feature_matrix, -alpha) 230 | -------------------------------------------------------------------------------- /irl/maxent.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements maximum entropy inverse reinforcement learning (Ziebart et al., 2008) 3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | from itertools import product 9 | 10 | import numpy as np 11 | import numpy.random as rn 12 | 13 | from . import value_iteration 14 | 15 | def irl(feature_matrix, n_actions, discount, transition_probability, 16 | trajectories, epochs, learning_rate): 17 | """ 18 | Find the reward function for the given trajectories. 19 | 20 | feature_matrix: Matrix with the nth row representing the nth state. NumPy 21 | array with shape (N, D) where N is the number of states and D is the 22 | dimensionality of the state. 23 | n_actions: Number of actions A. int. 24 | discount: Discount factor of the MDP. float. 25 | transition_probability: NumPy array mapping (state_i, action, state_k) to 26 | the probability of transitioning from state_i to state_k under action. 27 | Shape (N, A, N). 28 | trajectories: 3D array of state/action pairs. States are ints, actions 29 | are ints. NumPy array with shape (T, L, 2) where T is the number of 30 | trajectories and L is the trajectory length. 31 | epochs: Number of gradient descent steps. int. 32 | learning_rate: Gradient descent learning rate. float. 33 | -> Reward vector with shape (N,). 34 | """ 35 | 36 | n_states, d_states = feature_matrix.shape 37 | 38 | # Initialise weights. 39 | alpha = rn.uniform(size=(d_states,)) 40 | 41 | # Calculate the feature expectations \tilde{phi}. 42 | feature_expectations = find_feature_expectations(feature_matrix, 43 | trajectories) 44 | 45 | # Gradient descent on alpha. 
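# Each epoch: compute the current reward estimate r = feature_matrix . alpha,
# find the expected state visitation frequencies under that reward, and step
# alpha along the maximum entropy log-likelihood gradient, which is the
# empirical feature expectations minus the expected feature counts
# feature_matrix.T . expected_svf (Ziebart et al., 2008).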
46 | for i in range(epochs): 47 | # print("i: {}".format(i)) 48 | r = feature_matrix.dot(alpha) 49 | expected_svf = find_expected_svf(n_states, r, n_actions, discount, 50 | transition_probability, trajectories) 51 | grad = feature_expectations - feature_matrix.T.dot(expected_svf) 52 | 53 | alpha += learning_rate * grad 54 | 55 | return feature_matrix.dot(alpha).reshape((n_states,)) 56 | 57 | def find_svf(n_states, trajectories): 58 | """ 59 | Find the state visitation frequency from trajectories. 60 | 61 | n_states: Number of states. int. 62 | trajectories: 3D array of state/action pairs. States are ints, actions 63 | are ints. NumPy array with shape (T, L, 2) where T is the number of 64 | trajectories and L is the trajectory length. 65 | -> State visitation frequencies vector with shape (N,). 66 | """ 67 | 68 | svf = np.zeros(n_states) 69 | 70 | for trajectory in trajectories: 71 | for state, _, _ in trajectory: 72 | svf[state] += 1 73 | 74 | svf /= trajectories.shape[0] 75 | 76 | return svf 77 | 78 | def find_feature_expectations(feature_matrix, trajectories): 79 | """ 80 | Find the feature expectations for the given trajectories. This is the 81 | average path feature vector. 82 | 83 | feature_matrix: Matrix with the nth row representing the nth state. NumPy 84 | array with shape (N, D) where N is the number of states and D is the 85 | dimensionality of the state. 86 | trajectories: 3D array of state/action pairs. States are ints, actions 87 | are ints. NumPy array with shape (T, L, 2) where T is the number of 88 | trajectories and L is the trajectory length. 89 | -> Feature expectations vector with shape (D,). 90 | """ 91 | 92 | feature_expectations = np.zeros(feature_matrix.shape[1]) 93 | 94 | for trajectory in trajectories: 95 | for state, _, _ in trajectory: 96 | feature_expectations += feature_matrix[state] 97 | 98 | feature_expectations /= trajectories.shape[0] 99 | 100 | return feature_expectations 101 | 102 | def find_expected_svf(n_states, r, n_actions, discount, 103 | transition_probability, trajectories): 104 | """ 105 | Find the expected state visitation frequencies using algorithm 1 from 106 | Ziebart et al. 2008. 107 | 108 | n_states: Number of states N. int. 109 | alpha: Reward. NumPy array with shape (N,). 110 | n_actions: Number of actions A. int. 111 | discount: Discount factor of the MDP. float. 112 | transition_probability: NumPy array mapping (state_i, action, state_k) to 113 | the probability of transitioning from state_i to state_k under action. 114 | Shape (N, A, N). 115 | trajectories: 3D array of state/action pairs. States are ints, actions 116 | are ints. NumPy array with shape (T, L, 2) where T is the number of 117 | trajectories and L is the trajectory length. 118 | -> Expected state visitation frequencies vector with shape (N,). 
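The computation below is a forward pass: the empirical start-state distribution from the trajectories is propagated through the stochastic policy for the current reward and through the transition model for trajectory_length steps, and the resulting per-timestep state distributions are summed.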
119 | """ 120 | 121 | n_trajectories = trajectories.shape[0] 122 | trajectory_length = trajectories.shape[1] 123 | 124 | # policy = find_policy(n_states, r, n_actions, discount, 125 | # transition_probability) 126 | policy = value_iteration.find_policy(n_states, n_actions, 127 | transition_probability, r, discount) 128 | 129 | start_state_count = np.zeros(n_states) 130 | for trajectory in trajectories: 131 | start_state_count[trajectory[0, 0]] += 1 132 | p_start_state = start_state_count/n_trajectories 133 | 134 | expected_svf = np.tile(p_start_state, (trajectory_length, 1)).T 135 | for t in range(1, trajectory_length): 136 | expected_svf[:, t] = 0 137 | for i, j, k in product(range(n_states), range(n_actions), range(n_states)): 138 | expected_svf[k, t] += (expected_svf[i, t-1] * 139 | policy[i, j] * # Stochastic policy 140 | transition_probability[i, j, k]) 141 | 142 | return expected_svf.sum(axis=1) 143 | 144 | def softmax(x1, x2): 145 | """ 146 | Soft-maximum calculation, from algorithm 9.2 in Ziebart's PhD thesis. 147 | 148 | x1: float. 149 | x2: float. 150 | -> softmax(x1, x2) 151 | """ 152 | 153 | max_x = max(x1, x2) 154 | min_x = min(x1, x2) 155 | return max_x + np.log(1 + np.exp(min_x - max_x)) 156 | 157 | def find_policy(n_states, r, n_actions, discount, 158 | transition_probability): 159 | """ 160 | Find a policy with linear value iteration. Based on the code accompanying 161 | the Levine et al. GPIRL paper and on Ziebart's PhD thesis (algorithm 9.1). 162 | 163 | n_states: Number of states N. int. 164 | r: Reward. NumPy array with shape (N,). 165 | n_actions: Number of actions A. int. 166 | discount: Discount factor of the MDP. float. 167 | transition_probability: NumPy array mapping (state_i, action, state_k) to 168 | the probability of transitioning from state_i to state_k under action. 169 | Shape (N, A, N). 170 | -> NumPy array of states and the probability of taking each action in that 171 | state, with shape (N, A). 172 | """ 173 | 174 | # V = value_iteration.value(n_states, transition_probability, r, discount) 175 | 176 | # NumPy's dot really dislikes using inf, so I'm making everything finite 177 | # using nan_to_num. 178 | V = np.nan_to_num(np.ones((n_states, 1)) * float("-inf")) 179 | 180 | diff = np.ones((n_states,)) 181 | while (diff > 1e-4).all(): # Iterate until convergence. 182 | new_V = r.copy() 183 | for j in range(n_actions): 184 | for i in range(n_states): 185 | new_V[i] = softmax(new_V[i], r[i] + discount* 186 | np.sum(transition_probability[i, j, k] * V[k] 187 | for k in range(n_states))) 188 | 189 | # # This seems to diverge, so we z-score it (engineering hack). 190 | new_V = (new_V - new_V.mean())/new_V.std() 191 | 192 | diff = abs(V - new_V) 193 | V = new_V 194 | 195 | # We really want Q, not V, so grab that using equation 9.2 from the thesis. 196 | Q = np.zeros((n_states, n_actions)) 197 | for i in range(n_states): 198 | for j in range(n_actions): 199 | p = np.array([transition_probability[i, j, k] 200 | for k in range(n_states)]) 201 | Q[i, j] = p.dot(r + discount*V) 202 | 203 | # Softmax by row to interpret these values as probabilities. 204 | Q -= Q.max(axis=1).reshape((n_states, 1)) # For numerical stability. 
205 | Q = np.exp(Q)/np.exp(Q).sum(axis=1).reshape((n_states, 1)) 206 | return Q 207 | 208 | def expected_value_difference(n_states, n_actions, transition_probability, 209 | reward, discount, p_start_state, optimal_value, true_reward): 210 | """ 211 | Calculate the expected value difference, which is a proxy to how good a 212 | recovered reward function is. 213 | 214 | n_states: Number of states. int. 215 | n_actions: Number of actions. int. 216 | transition_probability: NumPy array mapping (state_i, action, state_k) to 217 | the probability of transitioning from state_i to state_k under action. 218 | Shape (N, A, N). 219 | reward: Reward vector mapping state int to reward. Shape (N,). 220 | discount: Discount factor. float. 221 | p_start_state: Probability vector with the ith component as the probability 222 | that the ith state is the start state. Shape (N,). 223 | optimal_value: Value vector for the ground reward with optimal policy. 224 | The ith component is the value of the ith state. Shape (N,). 225 | true_reward: True reward vector. Shape (N,). 226 | -> Expected value difference. float. 227 | """ 228 | 229 | policy = value_iteration.find_policy(n_states, n_actions, 230 | transition_probability, reward, discount) 231 | value = value_iteration.value(policy.argmax(axis=1), n_states, 232 | transition_probability, true_reward, discount) 233 | 234 | evd = optimal_value.dot(p_start_state) - value.dot(p_start_state) 235 | return evd 236 | -------------------------------------------------------------------------------- /irl/mdp/gridworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements the gridworld MDP. 3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | import numpy as np 9 | import numpy.random as rn 10 | 11 | class Gridworld(object): 12 | """ 13 | Gridworld MDP. 14 | """ 15 | 16 | def __init__(self, grid_size, wind, discount): 17 | """ 18 | grid_size: Grid size. int. 19 | wind: Chance of moving randomly. float. 20 | discount: MDP discount. float. 21 | -> Gridworld 22 | """ 23 | 24 | self.actions = ((1, 0), (0, 1), (-1, 0), (0, -1)) 25 | self.n_actions = len(self.actions) 26 | self.n_states = grid_size**2 27 | self.grid_size = grid_size 28 | self.wind = wind 29 | self.discount = discount 30 | 31 | # Preconstruct the transition probability array. 32 | self.transition_probability = np.array( 33 | [[[self._transition_probability(i, j, k) 34 | for k in range(self.n_states)] 35 | for j in range(self.n_actions)] 36 | for i in range(self.n_states)]) 37 | 38 | def __str__(self): 39 | return "Gridworld({}, {}, {})".format(self.grid_size, self.wind, 40 | self.discount) 41 | 42 | def feature_vector(self, i, feature_map="ident"): 43 | """ 44 | Get the feature vector associated with a state integer. 45 | 46 | i: State int. 47 | feature_map: Which feature map to use (default ident). String in {ident, 48 | coord, proxi}. 49 | -> Feature vector. 50 | """ 51 | 52 | if feature_map == "coord": 53 | f = np.zeros(self.grid_size) 54 | x, y = i % self.grid_size, i // self.grid_size 55 | f[x] += 1 56 | f[y] += 1 57 | return f 58 | if feature_map == "proxi": 59 | f = np.zeros(self.n_states) 60 | x, y = i % self.grid_size, i // self.grid_size 61 | for b in range(self.grid_size): 62 | for a in range(self.grid_size): 63 | dist = abs(x - a) + abs(y - b) 64 | f[self.point_to_int((a, b))] = dist 65 | return f 66 | # Assume identity map. 
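# (A one-hot indicator over states: component i is 1 and the rest are 0.)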
67 | f = np.zeros(self.n_states) 68 | f[i] = 1 69 | return f 70 | 71 | def feature_matrix(self, feature_map="ident"): 72 | """ 73 | Get the feature matrix for this gridworld. 74 | 75 | feature_map: Which feature map to use (default ident). String in {ident, 76 | coord, proxi}. 77 | -> NumPy array with shape (n_states, d_states). 78 | """ 79 | 80 | features = [] 81 | for n in range(self.n_states): 82 | f = self.feature_vector(n, feature_map) 83 | features.append(f) 84 | return np.array(features) 85 | 86 | def int_to_point(self, i): 87 | """ 88 | Convert a state int into the corresponding coordinate. 89 | 90 | i: State int. 91 | -> (x, y) int tuple. 92 | """ 93 | 94 | return (i % self.grid_size, i // self.grid_size) 95 | 96 | def point_to_int(self, p): 97 | """ 98 | Convert a coordinate into the corresponding state int. 99 | 100 | p: (x, y) tuple. 101 | -> State int. 102 | """ 103 | 104 | return p[0] + p[1]*self.grid_size 105 | 106 | def neighbouring(self, i, k): 107 | """ 108 | Get whether two points neighbour each other. Also returns true if they 109 | are the same point. 110 | 111 | i: (x, y) int tuple. 112 | k: (x, y) int tuple. 113 | -> bool. 114 | """ 115 | 116 | return abs(i[0] - k[0]) + abs(i[1] - k[1]) <= 1 117 | 118 | def _transition_probability(self, i, j, k): 119 | """ 120 | Get the probability of transitioning from state i to state k given 121 | action j. 122 | 123 | i: State int. 124 | j: Action int. 125 | k: State int. 126 | -> p(s_k | s_i, a_j) 127 | """ 128 | 129 | xi, yi = self.int_to_point(i) 130 | xj, yj = self.actions[j] 131 | xk, yk = self.int_to_point(k) 132 | 133 | if not self.neighbouring((xi, yi), (xk, yk)): 134 | return 0.0 135 | 136 | # Is k the intended state to move to? 137 | if (xi + xj, yi + yj) == (xk, yk): 138 | return 1 - self.wind + self.wind/self.n_actions 139 | 140 | # If these are not the same point, then we can move there by wind. 141 | if (xi, yi) != (xk, yk): 142 | return self.wind/self.n_actions 143 | 144 | # If these are the same point, we can only move here by either moving 145 | # off the grid or being blown off the grid. Are we on a corner or not? 146 | if (xi, yi) in {(0, 0), (self.grid_size-1, self.grid_size-1), 147 | (0, self.grid_size-1), (self.grid_size-1, 0)}: 148 | # Corner. 149 | # Can move off the edge in two directions. 150 | # Did we intend to move off the grid? 151 | if not (0 <= xi + xj < self.grid_size and 152 | 0 <= yi + yj < self.grid_size): 153 | # We intended to move off the grid, so we have the regular 154 | # success chance of staying here plus an extra chance of blowing 155 | # onto the *other* off-grid square. 156 | return 1 - self.wind + 2*self.wind/self.n_actions 157 | else: 158 | # We can blow off the grid in either direction only by wind. 159 | return 2*self.wind/self.n_actions 160 | else: 161 | # Not a corner. Is it an edge? 162 | if (xi not in {0, self.grid_size-1} and 163 | yi not in {0, self.grid_size-1}): 164 | # Not an edge. 165 | return 0.0 166 | 167 | # Edge. 168 | # Can only move off the edge in one direction. 169 | # Did we intend to move off the grid? 170 | if not (0 <= xi + xj < self.grid_size and 171 | 0 <= yi + yj < self.grid_size): 172 | # We intended to move off the grid, so we have the regular 173 | # success chance of staying here. 174 | return 1 - self.wind + self.wind/self.n_actions 175 | else: 176 | # We can blow off the grid only by wind. 177 | return self.wind/self.n_actions 178 | 179 | def reward(self, state_int): 180 | """ 181 | Reward for being in state state_int. 
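The reward is 1 in the goal state, the corner cell (grid_size - 1, grid_size - 1) (state int n_states - 1), and 0 everywhere else.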
182 | 183 | state_int: State integer. int. 184 | -> Reward. 185 | """ 186 | 187 | if state_int == self.n_states - 1: 188 | return 1 189 | return 0 190 | 191 | def average_reward(self, n_trajectories, trajectory_length, policy): 192 | """ 193 | Calculate the average total reward obtained by following a given policy 194 | over n_paths paths. 195 | 196 | policy: Map from state integers to action integers. 197 | n_trajectories: Number of trajectories. int. 198 | trajectory_length: Length of an episode. int. 199 | -> Average reward, standard deviation. 200 | """ 201 | 202 | trajectories = self.generate_trajectories(n_trajectories, 203 | trajectory_length, policy) 204 | rewards = [[r for _, _, r in trajectory] for trajectory in trajectories] 205 | rewards = np.array(rewards) 206 | 207 | # Add up all the rewards to find the total reward. 208 | total_reward = rewards.sum(axis=1) 209 | 210 | # Return the average reward and standard deviation. 211 | return total_reward.mean(), total_reward.std() 212 | 213 | def optimal_policy(self, state_int): 214 | """ 215 | The optimal policy for this gridworld. 216 | 217 | state_int: What state we are in. int. 218 | -> Action int. 219 | """ 220 | 221 | sx, sy = self.int_to_point(state_int) 222 | 223 | if sx < self.grid_size and sy < self.grid_size: 224 | return rn.randint(0, 2) 225 | if sx < self.grid_size-1: 226 | return 0 227 | if sy < self.grid_size-1: 228 | return 1 229 | raise ValueError("Unexpected state.") 230 | 231 | def optimal_policy_deterministic(self, state_int): 232 | """ 233 | Deterministic version of the optimal policy for this gridworld. 234 | 235 | state_int: What state we are in. int. 236 | -> Action int. 237 | """ 238 | 239 | sx, sy = self.int_to_point(state_int) 240 | if sx < sy: 241 | return 0 242 | return 1 243 | 244 | def generate_trajectories(self, n_trajectories, trajectory_length, policy, 245 | random_start=False): 246 | """ 247 | Generate n_trajectories trajectories with length trajectory_length, 248 | following the given policy. 249 | 250 | n_trajectories: Number of trajectories. int. 251 | trajectory_length: Length of an episode. int. 252 | policy: Map from state integers to action integers. 253 | random_start: Whether to start randomly (default False). bool. 254 | -> [[(state int, action int, reward float)]] 255 | """ 256 | 257 | trajectories = [] 258 | for _ in range(n_trajectories): 259 | if random_start: 260 | sx, sy = rn.randint(self.grid_size), rn.randint(self.grid_size) 261 | else: 262 | sx, sy = 0, 0 263 | 264 | trajectory = [] 265 | for _ in range(trajectory_length): 266 | if rn.random() < self.wind: 267 | action = self.actions[rn.randint(0, 4)] 268 | else: 269 | # Follow the given policy. 
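# (Combined with the random branch above, the intended action is taken with
# probability 1 - wind + wind/4, which matches _transition_probability.)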
270 | action = self.actions[policy(self.point_to_int((sx, sy)))] 271 | 272 | if (0 <= sx + action[0] < self.grid_size and 273 | 0 <= sy + action[1] < self.grid_size): 274 | next_sx = sx + action[0] 275 | next_sy = sy + action[1] 276 | else: 277 | next_sx = sx 278 | next_sy = sy 279 | 280 | state_int = self.point_to_int((sx, sy)) 281 | action_int = self.actions.index(action) 282 | next_state_int = self.point_to_int((next_sx, next_sy)) 283 | reward = self.reward(next_state_int) 284 | trajectory.append((state_int, action_int, reward)) 285 | 286 | sx = next_sx 287 | sy = next_sy 288 | 289 | trajectories.append(trajectory) 290 | 291 | return np.array(trajectories) 292 | -------------------------------------------------------------------------------- /irl/deep_maxent.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements deep maximum entropy inverse reinforcement learning based on 3 | Ziebart et al., 2008 and Wulfmeier et al., 2015, using symbolic methods with 4 | Theano. 5 | 6 | Matthew Alger, 2015 7 | matthew.alger@anu.edu.au 8 | """ 9 | 10 | from itertools import product 11 | 12 | import numpy as np 13 | import numpy.random as rn 14 | import theano as th 15 | import theano.tensor as T 16 | 17 | from . import maxent 18 | 19 | FLOAT = th.config.floatX 20 | 21 | def find_svf(n_states, trajectories): 22 | """ 23 | Find the state vistiation frequency from trajectories. 24 | 25 | n_states: Number of states. int. 26 | trajectories: 3D array of state/action pairs. States are ints, actions 27 | are ints. NumPy array with shape (T, L, 2) where T is the number of 28 | trajectories and L is the trajectory length. 29 | -> State visitation frequencies vector with shape (N,). 30 | """ 31 | 32 | svf = np.zeros(n_states) 33 | 34 | for trajectory in trajectories: 35 | for state, _, _ in trajectory: 36 | svf[state] += 1 37 | 38 | svf /= trajectories.shape[0] 39 | 40 | return th.shared(svf, "svf", borrow=True) 41 | 42 | def optimal_value(n_states, n_actions, transition_probabilities, reward, 43 | discount, threshold=1e-2): 44 | """ 45 | Find the optimal value function. 46 | 47 | n_states: Number of states. int. 48 | n_actions: Number of actions. int. 49 | transition_probabilities: Function taking (state, action, state) to 50 | transition probabilities. 51 | reward: Vector of rewards for each state. 52 | discount: MDP discount factor. float. 53 | threshold: Convergence threshold, default 1e-2. float. 54 | -> Array of values for each state 55 | """ 56 | 57 | v = T.zeros(n_states, dtype=FLOAT) 58 | 59 | def update(s, prev_diff, v, reward, tps): 60 | max_v = float("-inf") 61 | v_template = T.zeros_like(v) 62 | for a in range(n_actions): 63 | tp = tps[s, a, :] 64 | max_v = T.largest(max_v, T.dot(tp, reward + discount*v)) 65 | new_diff = abs(v[s] - max_v) 66 | if T.lt(prev_diff, new_diff): 67 | diff = new_diff 68 | else: 69 | diff = prev_diff 70 | return (diff, T.set_subtensor(v_template[s], max_v)), {} 71 | 72 | def until_converged(diff, v): 73 | (diff, vs), _ = th.scan( 74 | fn=update, 75 | outputs_info=[{"initial": diff, "taps": [-1]}, 76 | None], 77 | sequences=[T.arange(n_states)], 78 | non_sequences=[v, reward, transition_probabilities]) 79 | return ((diff[-1], vs.sum(axis=0)), {}, 80 | th.scan_module.until(diff[-1] < threshold)) 81 | 82 | (_, vs), _ = th.scan(fn = until_converged, 83 | outputs_info=[ 84 | # Need to force an inf into the right Theano 85 | # data type and this seems to be the only way that 86 | # works. 
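                                     # The +inf initial "previous difference" guarantees the
                                     # first sweep runs; the outer scan stops once the largest
                                     # per-state change falls below threshold, or after 1000 sweeps.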
87 | {"initial": getattr(np, FLOAT)(float("inf")), 88 | "taps": [-1]}, 89 | {"initial": v, 90 | "taps": [-1]}], 91 | n_steps=1000) 92 | 93 | return vs[-1] 94 | 95 | def find_policy(n_states, n_actions, transition_probabilities, reward, discount, 96 | threshold=1e-2, v=None): 97 | """ 98 | Find the optimal policy. 99 | 100 | n_states: Number of states. int. 101 | n_actions: Number of actions. int. 102 | transition_probabilities: Function taking (state, action, state) to 103 | transition probabilities. 104 | reward: Vector of rewards for each state. 105 | discount: MDP discount factor. float. 106 | threshold: Convergence threshold, default 1e-2. float. 107 | v: Optimal value array (if known). Default None. 108 | -> Action probabilities for each state. 109 | """ 110 | 111 | if v is None: 112 | v = optimal_value(n_states, n_actions, transition_probabilities, reward, 113 | discount, threshold) 114 | 115 | # Get Q using equation 9.2 from Ziebart's thesis. 116 | Q = T.zeros((n_states, n_actions)) 117 | def make_Q(i, j, tps, Q, reward, v): 118 | Q_template = T.zeros_like(Q) 119 | tp = transition_probabilities[i, j, :] 120 | return T.set_subtensor(Q_template[i, j], tp.dot(reward + discount*v)),{} 121 | 122 | prod = np.array(list(product(range(n_states), range(n_actions)))) 123 | state_range = th.shared(prod[:, 0]) 124 | action_range = th.shared(prod[:, 1]) 125 | Qs, _ = th.scan(fn=make_Q, 126 | outputs_info=None, 127 | sequences=[state_range, action_range], 128 | non_sequences=[transition_probabilities, Q, reward, v]) 129 | Q = Qs.sum(axis=0) 130 | Q -= Q.max(axis=1).reshape((n_states, 1)) # For numerical stability. 131 | Q = T.exp(Q)/T.exp(Q).sum(axis=1).reshape((n_states, 1)) 132 | return Q 133 | 134 | def find_expected_svf(n_states, r, n_actions, discount, 135 | transition_probability, trajectories): 136 | """ 137 | Find the expected state visitation frequencies using algorithm 1 from 138 | Ziebart et al. 2008. 139 | 140 | n_states: Number of states N. int. 141 | alpha: Reward. NumPy array with shape (N,). 142 | n_actions: Number of actions A. int. 143 | discount: Discount factor of the MDP. float. 144 | transition_probability: NumPy array mapping (state_i, action, state_k) to 145 | the probability of transitioning from state_i to state_k under action. 146 | Shape (N, A, N). 147 | trajectories: 3D array of state/action pairs. States are ints, actions 148 | are ints. NumPy array with shape (T, L, 2) where T is the number of 149 | trajectories and L is the trajectory length. 150 | -> Expected state visitation frequencies vector with shape (N,). 151 | """ 152 | 153 | n_trajectories = trajectories.shape[0] 154 | trajectory_length = trajectories.shape[1] 155 | 156 | policy = find_policy(n_states, n_actions, 157 | transition_probability, r, discount) 158 | 159 | start_state_count = T.extra_ops.bincount(trajectories[:, 0, 0], 160 | minlength=n_states) 161 | p_start_state = start_state_count.astype(FLOAT)/n_trajectories 162 | 163 | def state_visitation_step(i, j, prev_svf, policy, tps): 164 | """ 165 | The sum of the outputs of a scan over this will be a row of the svf. 
166 | """ 167 | 168 | svf = prev_svf[i] * policy[i, j] * tps[i, j, :] 169 | return svf, {} 170 | 171 | prod = np.array(list(product(range(n_states), range(n_actions)))) 172 | state_range = th.shared(prod[:, 0]) 173 | action_range = th.shared(prod[:, 1]) 174 | def state_visitation_row(prev_svf, policy, tps, state_range, action_range): 175 | svf_t, _ = th.scan(fn=state_visitation_step, 176 | sequences=[state_range, action_range], 177 | non_sequences=[prev_svf, policy, tps]) 178 | svf_t = svf_t.sum(axis=0) 179 | return svf_t, {} 180 | 181 | svf, _ = th.scan(fn=state_visitation_row, 182 | outputs_info=[{"initial": p_start_state, "taps": [-1]}], 183 | n_steps=trajectories.shape[1]-1, 184 | non_sequences=[policy, transition_probability, state_range, 185 | action_range]) 186 | 187 | return svf.sum(axis=0) + p_start_state 188 | 189 | def irl(structure, feature_matrix, n_actions, discount, transition_probability, 190 | trajectories, epochs, learning_rate, initialisation="normal", l1=0.1, 191 | l2=0.1): 192 | """ 193 | Find the reward function for the given trajectories. 194 | 195 | structure: Neural network structure tuple, e.g. (10, 3, 3) would be a 196 | 3-layer neural network with 10 inputs. 197 | feature_matrix: Matrix with the nth row representing the nth state. NumPy 198 | array with shape (N, D) where N is the number of states and D is the 199 | dimensionality of the state. 200 | n_actions: Number of actions A. int. 201 | discount: Discount factor of the MDP. float. 202 | transition_probability: NumPy array mapping (state_i, action, state_k) to 203 | the probability of transitioning from state_i to state_k under action. 204 | Shape (N, A, N). 205 | trajectories: 3D array of state/action pairs. States are ints, actions 206 | are ints. NumPy array with shape (T, L, 2) where T is the number of 207 | trajectories and L is the trajectory length. 208 | epochs: Number of gradient descent steps. int. 209 | learning_rate: Gradient descent learning rate. float. 210 | initialisation: What distribution to use. str in {normal, uniform}. Default 211 | normal. 212 | l1: L1 regularisation. Default 0.1. float. 213 | l2: L2 regularisation. Default 0.1. float. 214 | -> Reward vector with shape (N,). 215 | """ 216 | 217 | n_states, d_states = feature_matrix.shape 218 | transition_probability = th.shared(transition_probability, borrow=True) 219 | trajectories = th.shared(trajectories, borrow=True) 220 | 221 | # Initialise W matrices; b biases. 222 | n_layers = len(structure)-1 223 | weights = [] 224 | hist_w_grads = [] # For AdaGrad. 225 | biases = [] 226 | hist_b_grads = [] # For AdaGrad. 227 | for i in range(n_layers): 228 | # W 229 | shape = (structure[i+1], structure[i]) 230 | if initialisation == "normal": 231 | matrix = th.shared(rn.normal(size=shape), name="W", borrow=True) 232 | else: 233 | matrix = th.shared(rn.uniform(size=shape), name="W", borrow=True) 234 | weights.append(matrix) 235 | hist_w_grads.append(th.shared(np.zeros(shape), name="hdW", borrow=True)) 236 | 237 | # b 238 | shape = (structure[i+1], 1) 239 | if initialisation == "normal": 240 | matrix = th.shared(rn.normal(size=shape), name="b", borrow=True) 241 | else: 242 | matrix = th.shared(rn.uniform(size=shape), name="b", borrow=True) 243 | biases.append(matrix) 244 | hist_b_grads.append(th.shared(np.zeros(shape), name="hdb", borrow=True)) 245 | 246 | # Initialise α weight, β bias. 
247 | if initialisation == "normal": 248 | α = th.shared(rn.normal(size=(1, structure[-1])), name="alpha", 249 | borrow=True) 250 | else: 251 | α = th.shared(rn.uniform(size=(1, structure[-1])), name="alpha", 252 | borrow=True) 253 | hist_α_grad = T.zeros(α.shape) # For AdaGrad. 254 | 255 | adagrad_epsilon = 1e-6 # AdaGrad numerical stability. 256 | 257 | #### Theano symbolic setup. #### 258 | 259 | # Symbolic input. 260 | s_feature_matrix = T.matrix("x") 261 | # Feature matrices. 262 | # All dimensions of the form (d_layer, n_states). 263 | φs = [s_feature_matrix.T] 264 | # Forward propagation. 265 | for W, b in zip(weights, biases): 266 | φ = T.nnet.sigmoid(th.compile.ops.Rebroadcast((0, False), (1, True))(b) 267 | + W.dot(φs[-1])) 268 | φs.append(φ) 269 | # φs[1] = φ1 etc. 270 | # Reward. 271 | r = α.dot(φs[-1]).reshape((n_states,)) 272 | # Engineering hack: z-score the reward. 273 | r = (r - r.mean())/r.std() 274 | # Associated feature expectations. 275 | expected_svf = find_expected_svf(n_states, r, 276 | n_actions, discount, 277 | transition_probability, 278 | trajectories) 279 | svf = maxent.find_svf(n_states, trajectories.get_value()) 280 | # Derivatives (backward propagation). 281 | updates = [] 282 | α_grad = φs[-1].dot(svf - expected_svf).T 283 | hist_α_grad += α_grad**2 284 | adj_α_grad = α_grad/(adagrad_epsilon + T.sqrt(hist_α_grad)) 285 | updates.append((α, α + adj_α_grad*learning_rate)) 286 | 287 | def grad_for_state(s, theta, svf_diff, r): 288 | """ 289 | Calculate the gradient with respect to theta for one state. 290 | """ 291 | 292 | regularisation = abs(theta).sum()*l1 + (theta**2).sum()*l2 293 | return svf_diff[s] * T.grad(r[s], theta) - regularisation, {} 294 | 295 | for i, W in enumerate(weights): 296 | w_grads, _ = th.scan(fn=grad_for_state, 297 | sequences=[T.arange(n_states)], 298 | non_sequences=[W, svf - expected_svf, r]) 299 | w_grad = w_grads.sum(axis=0) 300 | hist_w_grads[i] += w_grad**2 301 | adj_w_grad = w_grad/(adagrad_epsilon + T.sqrt(hist_w_grads[i])) 302 | updates.append((W, W + adj_w_grad*learning_rate)) 303 | for i, b in enumerate(biases): 304 | b_grads, _ = th.scan(fn=grad_for_state, 305 | sequences=[T.arange(n_states)], 306 | non_sequences=[b, svf - expected_svf, r]) 307 | b_grad = b_grads.sum(axis=0) 308 | hist_b_grads[i] += b_grad**2 309 | adj_b_grad = b_grad/(adagrad_epsilon + T.sqrt(hist_b_grads[i])) 310 | updates.append((b, b + adj_b_grad*learning_rate)) 311 | 312 | train = th.function([s_feature_matrix], updates=updates, outputs=r) 313 | run = th.function([s_feature_matrix], outputs=r) 314 | 315 | for e in range(epochs): 316 | reward = train(feature_matrix) 317 | 318 | return reward.reshape((n_states,)) 319 | -------------------------------------------------------------------------------- /examples/experiments.py: -------------------------------------------------------------------------------- 1 | """ 2 | Perform the experiments from the report. 3 | 4 | Matthew Alger, 2015 5 | matthew.alger@anu.edu.au 6 | """ 7 | 8 | from time import time 9 | from sys import stdout 10 | 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | 14 | from irl import maxent 15 | from irl import deep_maxent 16 | from irl import value_iteration 17 | from irl.mdp.gridworld import Gridworld 18 | from irl.mdp.objectworld import Objectworld 19 | 20 | def test_gw_once(grid_size, feature_map, n_samples, epochs, structure): 21 | """ 22 | Test MaxEnt and DeepMaxEnt on a gw of size grid_size with the feature 23 | map feature_map with n_samples paths. 
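    Both methods are trained on the same sampled expert trajectories and
    compared via expected value difference (EVD) under the ground-truth reward.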
24 | 
25 |     grid_size: Grid size. int.
26 |     feature_map: Which feature map to use. String in {ident, coord, proxi}.
27 |     n_samples: Number of paths to sample.
28 |     epochs: Number of epochs to run MaxEnt with.
29 |     structure: Neural network structure tuple, e.g. (3, 3) would be a
30 |         3-layer neural network with assumed inputs.
31 |     -> Expected value difference for MaxEnt, DeepMaxEnt
32 |     """
33 | 
34 |     # Basic gist of what we're doing here: Get the reward function using our
35 |     # different IRL methods, use those to get a policy, evaluate that policy
36 |     # using the true reward, and then return the difference in expected values.
37 | 
38 |     # Setup parameters.
39 |     wind = 0.3
40 |     discount = 0.9
41 |     learning_rate = 0.01
42 |     trajectory_length = 3*grid_size
43 | 
44 |     # Make the gridworld and associated data.
45 |     gw = Gridworld(grid_size, wind, discount)
46 |     feature_matrix = gw.feature_matrix(feature_map)
47 |     ground_reward = np.array([gw.reward(i) for i in range(gw.n_states)])
48 |     optimal_policy = value_iteration.find_policy(gw.n_states,
49 |                                                  gw.n_actions,
50 |                                                  gw.transition_probability,
51 |                                                  ground_reward,
52 |                                                  discount).argmax(axis=1)
53 |     trajectories = gw.generate_trajectories(n_samples,
54 |                                             trajectory_length,
55 |                                             optimal_policy.take)
56 |     p_start_state = (np.bincount(trajectories[:, 0, 0], minlength=gw.n_states) /
57 |                      trajectories.shape[0])
58 | 
59 |     # True value.
60 |     optimal_V = value_iteration.optimal_value(gw.n_states,
61 |                                               gw.n_actions,
62 |                                               gw.transition_probability,
63 |                                               ground_reward, gw.discount)
64 | 
65 |     # MaxEnt reward; policy; value.
66 |     maxent_reward = deep_maxent.irl((feature_matrix.shape[1],),
67 |                                     feature_matrix,
68 |                                     gw.n_actions,
69 |                                     gw.discount,
70 |                                     gw.transition_probability,
71 |                                     trajectories, epochs, learning_rate)
72 | 
73 |     maxent_policy = value_iteration.find_policy(gw.n_states,
74 |                                                 gw.n_actions,
75 |                                                 gw.transition_probability,
76 |                                                 maxent_reward,
77 |                                                 discount).argmax(axis=1)
78 |     maxent_V = value_iteration.value(maxent_policy,
79 |                                      gw.n_states,
80 |                                      gw.transition_probability,
81 |                                      ground_reward,
82 |                                      gw.discount)
83 |     maxent_EVD = optimal_V.dot(p_start_state) - maxent_V.dot(p_start_state)
84 | 
85 |     # DeepMaxEnt reward; policy; value.
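    # Same IRL routine, but with the hidden-layer sizes from `structure`
    # appended to the input dimension.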
86 | deep_maxent_reward = deep_maxent.irl((feature_matrix.shape[1],)+structure, 87 | feature_matrix, 88 | gw.n_actions, 89 | gw.discount, 90 | gw.transition_probability, 91 | trajectories, epochs, learning_rate) 92 | deep_maxent_policy = value_iteration.find_policy(gw.n_states, 93 | gw.n_actions, 94 | gw.transition_probability, 95 | deep_maxent_reward, 96 | discount).argmax(axis=1) 97 | deep_maxent_V = value_iteration.value(deep_maxent_policy, 98 | gw.n_states, 99 | gw.transition_probability, 100 | ground_reward, 101 | gw.discount) 102 | deep_maxent_EVD = (optimal_V.dot(p_start_state) - 103 | deep_maxent_V.dot(p_start_state)) 104 | 105 | plt.subplot(3, 3, 1) 106 | plt.pcolor(ground_reward.reshape((grid_size, grid_size))) 107 | plt.title("Groundtruth reward") 108 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 109 | bottom=False, top=False, left=False, right=False, 110 | labelright=False) 111 | plt.subplot(3, 3, 2) 112 | plt.pcolor(maxent_reward.reshape((grid_size, grid_size))) 113 | plt.title("MaxEnt reward") 114 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 115 | bottom=False, top=False, left=False, right=False, 116 | labelright=False) 117 | plt.subplot(3, 3, 3) 118 | plt.pcolor(deep_maxent_reward.reshape((grid_size, grid_size))) 119 | plt.title("DeepMaxEnt reward") 120 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 121 | bottom=False, top=False, left=False, right=False, 122 | labelright=False) 123 | 124 | plt.subplot(3, 3, 4) 125 | plt.pcolor(optimal_policy.reshape((grid_size, grid_size)), vmin=0, vmax=3) 126 | plt.title("Optimal policy") 127 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 128 | bottom=False, top=False, left=False, right=False, 129 | labelright=False) 130 | plt.subplot(3, 3, 5) 131 | plt.pcolor(maxent_policy.reshape((grid_size, grid_size)), vmin=0, vmax=3) 132 | plt.title("MaxEnt policy") 133 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 134 | bottom=False, top=False, left=False, right=False, 135 | labelright=False) 136 | plt.subplot(3, 3, 6) 137 | plt.pcolor(deep_maxent_policy.reshape((grid_size, grid_size)), 138 | vmin=0, vmax=3) 139 | plt.title("DeepMaxEnt policy") 140 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 141 | bottom=False, top=False, left=False, right=False, 142 | labelright=False) 143 | 144 | plt.subplot(3, 3, 7) 145 | plt.pcolor(optimal_V.reshape((grid_size, grid_size))) 146 | plt.title("Optimal value") 147 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 148 | bottom=False, top=False, left=False, right=False, 149 | labelright=False) 150 | plt.subplot(3, 3, 8) 151 | plt.pcolor(maxent_V.reshape((grid_size, grid_size))) 152 | plt.title("MaxEnt value") 153 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 154 | bottom=False, top=False, left=False, right=False, 155 | labelright=False) 156 | plt.subplot(3, 3, 9) 157 | plt.pcolor(deep_maxent_V.reshape((grid_size, grid_size))) 158 | plt.title("DeepMaxEnt value") 159 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 160 | bottom=False, top=False, left=False, right=False, 161 | labelright=False) 162 | plt.savefig("{}_{}_{}_{}gridworld{}.png".format(grid_size, feature_map, 163 | n_samples, epochs, structure, np.random.randint(10000000))) 164 | 165 | 166 | return maxent_EVD, deep_maxent_EVD 167 | 168 | def test_ow_once(grid_size, n_objects, n_colours, discrete, l1, l2, n_samples, 169 | epochs, structure): 170 | """ 171 
| Test MaxEnt and DeepMaxEnt on a ow of size grid_size with the feature 172 | map feature_map with n_samples paths. 173 | 174 | grid_size: Grid size. int. 175 | n_objects: Number of objects. int. 176 | n_colours: Number of colours. int. 177 | discrete: Whether the features should be discrete. bool. 178 | l1: L1 regularisation. float. 179 | l2: L2 regularisation. float. 180 | n_samples: Number of paths to sample. 181 | epochs: Number of epochs to run MaxEnt with. 182 | structure: Neural network structure tuple, e.g. (3, 3) would be a 183 | 3-layer neural network with assumed inputs. 184 | -> Expected value difference for MaxEnt, DeepMaxEnt 185 | """ 186 | 187 | # Basic gist of what we're doing here: Get the reward function using our 188 | # different IRL methods, use those to get a policy, evaluate that policy 189 | # using the true reward, and then return the difference in expected values. 190 | 191 | # Setup parameters. 192 | wind = 0.3 193 | discount = 0.9 194 | learning_rate = 0.01 195 | trajectory_length = 3*grid_size 196 | 197 | # Make the objectworld and associated data. 198 | ow = Objectworld(grid_size, n_objects, n_colours, wind, discount) 199 | feature_matrix = ow.feature_matrix(discrete) 200 | ground_reward = np.array([ow.reward(i) for i in range(ow.n_states)]) 201 | optimal_policy = value_iteration.find_policy(ow.n_states, 202 | ow.n_actions, 203 | ow.transition_probability, 204 | ground_reward, 205 | discount).argmax(axis=1) 206 | trajectories = ow.generate_trajectories(n_samples, 207 | trajectory_length, 208 | optimal_policy.take) 209 | p_start_state = (np.bincount(trajectories[:, 0, 0], minlength=ow.n_states) / 210 | trajectories.shape[0]) 211 | 212 | # True value. 213 | optimal_V = value_iteration.optimal_value(ow.n_states, 214 | ow.n_actions, 215 | ow.transition_probability, 216 | ground_reward, ow.discount) 217 | 218 | # MaxEnt reward; policy; value. 219 | maxent_reward = deep_maxent.irl((feature_matrix.shape[1],), 220 | feature_matrix, 221 | ow.n_actions, 222 | ow.discount, 223 | ow.transition_probability, 224 | trajectories, epochs, learning_rate, 225 | l1=l1, l2=l2) 226 | 227 | maxent_policy = value_iteration.find_policy(ow.n_states, 228 | ow.n_actions, 229 | ow.transition_probability, 230 | maxent_reward, 231 | discount).argmax(axis=1) 232 | maxent_V = value_iteration.value(maxent_policy, 233 | ow.n_states, 234 | ow.transition_probability, 235 | ground_reward, 236 | ow.discount) 237 | maxent_EVD = optimal_V.dot(p_start_state) - maxent_V.dot(p_start_state) 238 | 239 | # DeepMaxEnt reward; policy; value. 240 | deep_learning_rate = 0.005 # For the 32 x 32 experiments. 
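    # This smaller step size is used for the DeepMaxEnt run below at every
    # grid size, not only 32 x 32.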
241 | deep_maxent_reward = deep_maxent.irl((feature_matrix.shape[1],)+structure, 242 | feature_matrix, 243 | ow.n_actions, 244 | ow.discount, 245 | ow.transition_probability, 246 | trajectories, epochs, 247 | deep_learning_rate, 248 | l1=l1, l2=l2) 249 | 250 | deep_maxent_policy = value_iteration.find_policy(ow.n_states, 251 | ow.n_actions, 252 | ow.transition_probability, 253 | deep_maxent_reward, 254 | discount).argmax(axis=1) 255 | deep_maxent_V = value_iteration.value(deep_maxent_policy, 256 | ow.n_states, 257 | ow.transition_probability, 258 | ground_reward, 259 | ow.discount) 260 | 261 | deep_maxent_EVD = (optimal_V.dot(p_start_state) - 262 | deep_maxent_V.dot(p_start_state)) 263 | 264 | plt.subplot(3, 3, 1) 265 | plt.pcolor(ground_reward.reshape((grid_size, grid_size))) 266 | plt.title("Groundtruth reward") 267 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 268 | bottom=False, top=False, left=False, right=False, labelright=False) 269 | plt.subplot(3, 3, 2) 270 | plt.pcolor(maxent_reward.reshape((grid_size, grid_size))) 271 | plt.title("MaxEnt reward") 272 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 273 | bottom=False, top=False, left=False, right=False, labelright=False) 274 | plt.subplot(3, 3, 3) 275 | plt.pcolor(deep_maxent_reward.reshape((grid_size, grid_size))) 276 | plt.title("DeepMaxEnt reward") 277 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 278 | bottom=False, top=False, left=False, right=False, labelright=False) 279 | 280 | plt.subplot(3, 3, 4) 281 | plt.pcolor(optimal_policy.reshape((grid_size, grid_size)), vmin=0, vmax=3) 282 | plt.title("Optimal policy") 283 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 284 | bottom=False, top=False, left=False, right=False, labelright=False) 285 | plt.subplot(3, 3, 5) 286 | plt.pcolor(maxent_policy.reshape((grid_size, grid_size)), vmin=0, vmax=3) 287 | plt.title("MaxEnt policy") 288 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 289 | bottom=False, top=False, left=False, right=False, labelright=False) 290 | plt.subplot(3, 3, 6) 291 | plt.pcolor(deep_maxent_policy.reshape((grid_size, grid_size)), 292 | vmin=0, vmax=3) 293 | plt.title("DeepMaxEnt policy") 294 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 295 | bottom=False, top=False, left=False, right=False, labelright=False) 296 | 297 | plt.subplot(3, 3, 7) 298 | plt.pcolor(optimal_V.reshape((grid_size, grid_size))) 299 | plt.title("Optimal value") 300 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 301 | bottom=False, top=False, left=False, right=False, labelright=False) 302 | plt.subplot(3, 3, 8) 303 | plt.pcolor(maxent_V.reshape((grid_size, grid_size))) 304 | plt.title("MaxEnt value") 305 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 306 | bottom=False, top=False, left=False, right=False, labelright=False) 307 | plt.subplot(3, 3, 9) 308 | plt.pcolor(deep_maxent_V.reshape((grid_size, grid_size))) 309 | plt.title("DeepMaxEnt value") 310 | plt.tick_params(labeltop=False, labelbottom=False, labelleft=False, 311 | bottom=False, top=False, left=False, right=False, labelright=False) 312 | plt.savefig("{}_{}_{}_{}_{}_{}_{}_{}_{}_objectworld_{}.png".format( 313 | grid_size, n_objects, n_colours, discrete, n_samples, epochs, structure, 314 | l1, l2, np.random.randint(10000000))) 315 | 316 | return maxent_EVD, deep_maxent_EVD 317 | 318 | def test_gw_over_samples(grid_size, feature_map, epochs, structure, 
n): 319 | """ 320 | Test MaxEnt and DeepMaxEnt on a gridworld of size grid_size with the feature 321 | map feature_map with different numbers of paths. 322 | 323 | grid_size: Grid size. int. 324 | feature_map: Which feature map to use. String in {ident, coord, proxi}. 325 | epochs: MaxEnt iterations. int. 326 | structure: Neural network structure tuple, e.g. (3, 3) would be a 327 | 3-layer neural network with assumed inputs. 328 | n: Iterations. int. 329 | -> (MaxEnt [(n_samples, mean expected value difference, stdev)], 330 | DeepMaxEnt [(n_samples, mean expected value difference, stdev)]), 331 | raw data (maxent_data, deep_maxent_data) 332 | """ 333 | 334 | maxent_data = [] 335 | deep_maxent_data = [] 336 | for n_samples in (32,): 337 | t = time() 338 | maxent_EVDs = [] 339 | deep_maxent_EVDs = [] 340 | for i in range(n): 341 | print("{}: {}/{}".format(n_samples, i+1, n)) 342 | maxent_EVD, deep_maxent_EVD = test_gw_once(grid_size, feature_map, 343 | n_samples, epochs, 344 | structure) 345 | maxent_EVDs.append(maxent_EVD) 346 | deep_maxent_EVDs.append(deep_maxent_EVD) 347 | print(maxent_EVD, deep_maxent_EVD) 348 | stdout.flush() 349 | maxent_data.append((n_samples, np.mean(maxent_EVDs), 350 | np.std(maxent_EVDs))) 351 | deep_maxent_data.append((n_samples, np.mean(deep_maxent_EVDs), 352 | np.std(deep_maxent_EVDs))) 353 | print("{} (took {:.02}s)".format(n_samples, time() - t)) 354 | print("MaxEnt:", maxent_data) 355 | print("DeepMaxEnt:", deep_maxent_data) 356 | return maxent_data, deep_maxent_data 357 | 358 | def test_ow_over_samples(grid_size, n_objects, n_colours, discrete, l1, l2, 359 | epochs, structure, n): 360 | """ 361 | Test MaxEnt and DeepMaxEnt on an objectworld with different numbers of paths. 362 | 363 | grid_size: Grid size. int. 364 | n_objects: Number of objects. int. 365 | n_colours: Number of colours. int. 366 | discrete: Whether the features should be discrete. bool. 367 | feature_map: Which feature map to use. String in {ident, coord, proxi}. 368 | l1: L1 regularisation. float. 369 | l2: L2 regularisation. float. 370 | epochs: MaxEnt iterations. int. 371 | structure: Neural network structure tuple, e.g. (3, 3) would be a 372 | 3-layer neural network with assumed inputs. 373 | n: Iterations. int. 374 | -> (MaxEnt [(n_samples, mean expected value difference, stdev)], 375 | DeepMaxEnt [(n_samples, mean expected value difference, stdev)]), 376 | raw data (maxent_data, deep_maxent_data) 377 | """ 378 | 379 | maxent_data = [] 380 | deep_maxent_data = [] 381 | for n_samples in (32, 16, 8, 4): 382 | t = time() 383 | maxent_EVDs = [] 384 | deep_maxent_EVDs = [] 385 | for i in range(n): 386 | print("{}: {}/{}".format(n_samples, i+1, n)) 387 | maxent_EVD, deep_maxent_EVD = test_ow_once(grid_size, n_objects, 388 | n_colours, discrete, l1, l2, n_samples, epochs, structure) 389 | maxent_EVDs.append(maxent_EVD) 390 | deep_maxent_EVDs.append(deep_maxent_EVD) 391 | print(maxent_EVD, deep_maxent_EVD) 392 | stdout.flush() 393 | maxent_data.append((n_samples, np.mean(maxent_EVDs), 394 | np.median(maxent_EVDs), np.std(maxent_EVDs))) 395 | deep_maxent_data.append((n_samples, np.mean(deep_maxent_EVDs), 396 | np.median(deep_maxent_EVDs), np.std(deep_maxent_EVDs))) 397 | print("{} (took {:.02}s)".format(n_samples, time() - t)) 398 | print("MaxEnt:", maxent_data) 399 | print("DeepMaxEnt:", deep_maxent_data) 400 | return maxent_data, deep_maxent_data 401 | 402 | if __name__ == '__main__': 403 | # Tests the 16 x 16 objectworld. 
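    # Positional arguments: grid_size, n_objects, n_colours, discrete, l1, l2,
    # epochs, structure, n.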
404 | print(test_ow_over_samples(16, 25, 2, False, 0, 0, 150, (3, 3), 10)) 405 | # Tests the 32 x 32 objectworld. 406 | print(test_ow_over_samples(32, 50, 2, False, 0, 0, 250, (3, 3), 5)) --------------------------------------------------------------------------------
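A minimal usage sketch, following the same pattern as examples/ and experiments.py; the hyperparameter values below are illustrative only, not the settings used in the report.

import numpy as np

from irl import deep_maxent
from irl import value_iteration
from irl.mdp.gridworld import Gridworld

# A small, cheap configuration: 5 x 5 gridworld with 30% wind.
grid_size, wind, discount = 5, 0.3, 0.9
gw = Gridworld(grid_size, wind, discount)

# Expert demonstrations from the optimal policy found by value iteration.
ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])
policy = value_iteration.find_policy(gw.n_states, gw.n_actions,
                                     gw.transition_probability,
                                     ground_r, discount).argmax(axis=1)
trajectories = gw.generate_trajectories(20, 3*grid_size, policy.take)

# DeepMaxEnt IRL with one hidden layer of 3 units on the identity features.
feature_matrix = gw.feature_matrix("ident")
structure = (feature_matrix.shape[1], 3)
r = deep_maxent.irl(structure, feature_matrix, gw.n_actions, discount,
                    gw.transition_probability, trajectories,
                    epochs=150, learning_rate=0.01)
print(r.reshape((grid_size, grid_size)))

Passing (feature_matrix.shape[1],) alone as the structure reduces this to plain MaxEnt, which is how experiments.py produces its MaxEnt baseline.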