├── LICENSE.txt
├── README.md
├── gridworld.py
├── qlearn.py
└── rl.py

/LICENSE.txt:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Kevin Hanselman, Carl Ericson

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Overview

This is the code for [this](https://www.youtube.com/watch?v=PYQAI6Td2wo) video on YouTube by Siraj Raval on sensor networks. We can model a sensor network as a grid, then use the Bellman equation to compute the optimal policy for getting from Router A to Router B as efficiently as possible. The repository contains implementations of MDP value iteration, MDP policy iteration, and Q-learning in a toy grid-world setting.

## Dependencies

* matplotlib
* OpenCV
* numpy

Install missing dependencies using [pip](https://pypi.org/project/pip/).

## Usage

Run `python rl.py` in a terminal.

## Credits

Credits for this code go to [kevlar1818](https://github.com/kevlar1818/grid-world-rl). I've merely created a wrapper to get people started.
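
## Background: the Bellman update

Both MDP solvers in `gridworld.py` repeatedly apply the Bellman backup, `V(s) <- R(s) + gamma * max_a sum_s' T(s' | s, a) * V(s')`. The snippet below is a minimal sketch of that idea on a deterministic 3x4 grid with the same rewards and discount (0.5) that `rl.py` uses; it leaves out the stochastic action model and the obstacle handling that `gridworld.py` implements, so treat it as an illustration rather than the actual solver.

```python
import numpy as np

# Toy 3x4 grid: goal at (0, 3) worth +1, trap at (1, 3) worth -1,
# every other cell has a step cost of -0.1 (same rewards as rl.py).
rewards = np.full((3, 4), -0.1)
rewards[0, 3], rewards[1, 3] = 1.0, -1.0
terminal = np.zeros((3, 4), dtype=bool)
terminal[0, 3] = terminal[1, 3] = True

gamma = 0.5                                   # discount factor
moves = [(-1, 0), (0, 1), (1, 0), (0, -1)]    # up, right, down, left

V = np.zeros_like(rewards)
for _ in range(25):
    V_new = rewards.copy()                    # terminal states keep their reward
    for r in range(3):
        for c in range(4):
            if terminal[r, c]:
                continue
            # Deterministic Bellman backup: best one-step lookahead value,
            # with moves that would leave the grid clipped back inside it.
            best = max(V[min(max(r + dr, 0), 2), min(max(c + dc, 0), 3)]
                       for dr, dc in moves)
            V_new[r, c] = rewards[r, c] + gamma * best
    V = V_new

print(np.round(V, 3))
```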


--------------------------------------------------------------------------------
/gridworld.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
import cv2


class GridWorldMDP:

    # up, right, down, left
    _direction_deltas = [
        (-1, 0),
        (0, 1),
        (1, 0),
        (0, -1),
    ]
    _num_actions = len(_direction_deltas)

    def __init__(self,
                 reward_grid,
                 terminal_mask,
                 obstacle_mask,
                 action_probabilities,
                 no_action_probability):

        self._reward_grid = reward_grid
        self._terminal_mask = terminal_mask
        self._obstacle_mask = obstacle_mask
        self._T = self._create_transition_matrix(
            action_probabilities,
            no_action_probability,
            obstacle_mask
        )

    @property
    def shape(self):
        return self._reward_grid.shape

    @property
    def size(self):
        return self._reward_grid.size

    @property
    def reward_grid(self):
        return self._reward_grid

    def run_value_iterations(self, discount=1.0,
                             iterations=10):
        utility_grids, policy_grids = self._init_utility_policy_storage(iterations)

        utility_grid = np.zeros_like(self._reward_grid)
        for i in range(iterations):
            utility_grid = self._value_iteration(utility_grid=utility_grid,
                                                 discount=discount)
            policy_grids[:, :, i] = self.best_policy(utility_grid)
            utility_grids[:, :, i] = utility_grid
        return policy_grids, utility_grids

    def run_policy_iterations(self, discount=1.0,
                              iterations=10):
        utility_grids, policy_grids = self._init_utility_policy_storage(iterations)

        policy_grid = np.random.randint(0, self._num_actions,
                                        self.shape)
        utility_grid = self._reward_grid.copy()

        for i in range(iterations):
            policy_grid, utility_grid = self._policy_iteration(
                policy_grid=policy_grid,
                utility_grid=utility_grid,
                discount=discount
            )
            policy_grids[:, :, i] = policy_grid
            utility_grids[:, :, i] = utility_grid
        return policy_grids, utility_grids

    def generate_experience(self, current_state_idx, action_idx):
        sr, sc = self.grid_indices_to_coordinates(current_state_idx)
        next_state_probs = self._T[sr, sc, action_idx, :, :].flatten()

        next_state_idx = np.random.choice(np.arange(next_state_probs.size),
                                          p=next_state_probs)

        return (next_state_idx,
                self._reward_grid.flatten()[next_state_idx],
                self._terminal_mask.flatten()[next_state_idx])

    def grid_indices_to_coordinates(self, indices=None):
        if indices is None:
            indices = np.arange(self.size)
        return np.unravel_index(indices, self.shape)

    def grid_coordinates_to_indices(self, coordinates=None):
        # Annoyingly, this doesn't work for negative indices.
        # The mode='wrap' parameter only works on positive indices.
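        # For example, on the 3x4 grid used in rl.py, the start coordinate
        # (2, 0) maps to flat index 2 * 4 + 0 == 8 under row-major ordering.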
        if coordinates is None:
            return np.arange(self.size)
        return np.ravel_multi_index(coordinates, self.shape)

    def best_policy(self, utility_grid):
        M, N = self.shape
        return np.argmax((utility_grid.reshape((1, 1, 1, M, N)) * self._T)
                         .sum(axis=-1).sum(axis=-1), axis=2)

    def _init_utility_policy_storage(self, depth):
        M, N = self.shape
        utility_grids = np.zeros((M, N, depth))
        policy_grids = np.zeros_like(utility_grids)
        return utility_grids, policy_grids

    def _create_transition_matrix(self,
                                  action_probabilities,
                                  no_action_probability,
                                  obstacle_mask):
        M, N = self.shape

        T = np.zeros((M, N, self._num_actions, M, N))

        r0, c0 = self.grid_indices_to_coordinates()

        T[r0, c0, :, r0, c0] += no_action_probability

        for action in range(self._num_actions):
            for offset, P in action_probabilities:
                direction = (action + offset) % self._num_actions

                dr, dc = self._direction_deltas[direction]
                r1 = np.clip(r0 + dr, 0, M - 1)
                c1 = np.clip(c0 + dc, 0, N - 1)

                temp_mask = obstacle_mask[r1, c1].flatten()
                r1[temp_mask] = r0[temp_mask]
                c1[temp_mask] = c0[temp_mask]

                T[r0, c0, action, r1, c1] += P

        terminal_locs = np.where(self._terminal_mask.flatten())[0]
        T[r0[terminal_locs], c0[terminal_locs], :, :, :] = 0
        return T

    def _value_iteration(self, utility_grid, discount=1.0):
        out = np.zeros_like(utility_grid)
        M, N = self.shape
        for i in range(M):
            for j in range(N):
                out[i, j] = self._calculate_utility((i, j),
                                                    discount,
                                                    utility_grid)
        return out

    def _policy_iteration(self, *, utility_grid,
                          policy_grid, discount=1.0):
        r, c = self.grid_indices_to_coordinates()

        M, N = self.shape

        utility_grid = (
            self._reward_grid +
            discount * ((utility_grid.reshape((1, 1, 1, M, N)) * self._T)
                        .sum(axis=-1).sum(axis=-1))[r, c, policy_grid.flatten()]
            .reshape(self.shape)
        )

        utility_grid[self._terminal_mask] = self._reward_grid[self._terminal_mask]

        return self.best_policy(utility_grid), utility_grid

    def _calculate_utility(self, loc, discount, utility_grid):
        if self._terminal_mask[loc]:
            return self._reward_grid[loc]
        row, col = loc
        return np.max(
            discount * np.sum(
                np.sum(self._T[row, col, :, :, :] * utility_grid,
                       axis=-1),
                axis=-1)
        ) + self._reward_grid[loc]

    def plot_policy(self, utility_grid, policy_grid=None):
        if policy_grid is None:
            policy_grid = self.best_policy(utility_grid)
        markers = "^>v<"
        marker_size = 200 // np.max(policy_grid.shape)
        marker_edge_width = marker_size // 10
        marker_fill_color = 'w'

        no_action_mask = self._terminal_mask | self._obstacle_mask

        utility_normalized = (utility_grid - utility_grid.min()) / \
            (utility_grid.max() - utility_grid.min())

        utility_normalized = (255 * utility_normalized).astype(np.uint8)

        utility_rgb = cv2.applyColorMap(utility_normalized, cv2.COLORMAP_JET)
        for i in range(3):
            channel = utility_rgb[:, :, i]
            channel[self._obstacle_mask] = 0

        plt.imshow(utility_rgb[:, :, ::-1], interpolation='none')

        for i, marker in enumerate(markers):
            y, x = np.where((policy_grid == i) & np.logical_not(no_action_mask))
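            # np.where returns (row, col) index arrays; matplotlib expects x
            # first, so the columns are passed as x and the rows as y.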
            plt.plot(x, y, marker, ms=marker_size, mew=marker_edge_width,
                     color=marker_fill_color)

        y, x = np.where(self._terminal_mask)
        plt.plot(x, y, 'o', ms=marker_size, mew=marker_edge_width,
                 color=marker_fill_color)

        tick_step_options = np.array([1, 2, 5, 10, 20, 50, 100])
        tick_step = np.max(policy_grid.shape) / 8
        best_option = np.argmin(np.abs(np.log(tick_step) - np.log(tick_step_options)))
        tick_step = tick_step_options[best_option]
        plt.xticks(np.arange(0, policy_grid.shape[1] - 0.5, tick_step))
        plt.yticks(np.arange(0, policy_grid.shape[0] - 0.5, tick_step))
        plt.xlim([-0.5, policy_grid.shape[1] - 0.5])
        # Keep the y-axis inverted so it matches imshow's row ordering.
        plt.ylim([policy_grid.shape[0] - 0.5, -0.5])

--------------------------------------------------------------------------------
/qlearn.py:
--------------------------------------------------------------------------------
import numpy as np
import random as rand


class QLearner:
    '''A generic implementation of Q-Learning and Dyna-Q'''

    def __init__(self, *,
                 num_states,
                 num_actions,
                 learning_rate,
                 discount_rate=1.0,
                 random_action_prob=0.5,
                 random_action_decay_rate=0.99,
                 dyna_iterations=0):

        self._num_states = num_states
        self._num_actions = num_actions
        self._learning_rate = learning_rate
        self._discount_rate = discount_rate
        self._random_action_prob = random_action_prob
        self._random_action_decay_rate = random_action_decay_rate
        self._dyna_iterations = dyna_iterations

        self._experiences = []

        # Initialize Q to small random values.
        self._Q = np.zeros((num_states, num_actions), dtype=float)
        self._Q += np.random.normal(0, 0.3, self._Q.shape)

    def initialize(self, state):
        '''Set the initial state and return the learner's first action'''
        self._decide_next_action(state)
        self._stored_state = state
        return self._stored_action

    def learn(self, initial_state, experience_func, iterations=100):
        '''Iteratively experience new states and rewards'''
        all_policies = np.zeros((self._num_states, iterations))
        all_utilities = np.zeros_like(all_policies)
        for i in range(iterations):
            done = False
            self.initialize(initial_state)
            for j in range(iterations):
                state, reward, done = experience_func(self._stored_state,
                                                      self._stored_action)
                self.experience(state, reward)
                if done:
                    break

            policy, utility = self.get_policy_and_utility()
            all_policies[:, i] = policy
            all_utilities[:, i] = utility
        return all_policies, all_utilities

    def experience(self, state, reward):
        '''The learner experiences state and receives a reward'''
        self._update_Q(self._stored_state, self._stored_action, state, reward)

        if self._dyna_iterations > 0:
            self._experiences.append(
                (self._stored_state, self._stored_action, state, reward)
            )
            exp_idx = np.random.choice(len(self._experiences),
                                       self._dyna_iterations)
            for i in exp_idx:
                self._update_Q(*self._experiences[i])

        # determine an action and update the current state
        self._decide_next_action(state)
        self._stored_state = state

        self._random_action_prob *= self._random_action_decay_rate

        return self._stored_action

    def get_policy_and_utility(self):
        policy = np.argmax(self._Q, axis=1)
        utility = np.max(self._Q, axis=1)
        return policy, utility

    def _update_Q(self, s, a, s_prime, r):
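        # Standard Q-learning update:
        #   Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a'))
        # where alpha is the learning rate and gamma the discount rate.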
        best_reward = self._Q[s_prime, self._find_best_action(s_prime)]
        self._Q[s, a] *= (1 - self._learning_rate)
        self._Q[s, a] += (self._learning_rate
                          * (r + self._discount_rate * best_reward))

    def _decide_next_action(self, state):
        if rand.random() <= self._random_action_prob:
            self._stored_action = rand.randint(0, self._num_actions - 1)
        else:
            self._stored_action = self._find_best_action(state)

    def _find_best_action(self, state):
        return int(np.argmax(self._Q[state, :]))

--------------------------------------------------------------------------------
/rl.py:
--------------------------------------------------------------------------------
from gridworld import GridWorldMDP
from qlearn import QLearner

import numpy as np
import matplotlib.pyplot as plt


def plot_convergence(utility_grids, policy_grids):
    fig, ax1 = plt.subplots()
    ax2 = ax1.twinx()
    utility_ssd = np.sum(np.square(np.diff(utility_grids)), axis=(0, 1))
    ax1.plot(utility_ssd, 'b.-')
    ax1.set_ylabel('Change in Utility', color='b')

    policy_changes = np.count_nonzero(np.diff(policy_grids), axis=(0, 1))
    ax2.plot(policy_changes, 'r.-')
    ax2.set_ylabel('Change in Best Policy', color='r')


if __name__ == '__main__':
    shape = (3, 4)
    goal = (0, -1)
    trap = (1, -1)
    obstacle = (1, 1)
    start = (2, 0)
    default_reward = -0.1
    goal_reward = 1
    trap_reward = -1

    reward_grid = np.zeros(shape) + default_reward
    reward_grid[goal] = goal_reward
    reward_grid[trap] = trap_reward
    reward_grid[obstacle] = 0

    terminal_mask = np.zeros_like(reward_grid, dtype=bool)
    terminal_mask[goal] = True
    terminal_mask[trap] = True

    obstacle_mask = np.zeros_like(reward_grid, dtype=bool)
    obstacle_mask[obstacle] = True

    gw = GridWorldMDP(reward_grid=reward_grid,
                      obstacle_mask=obstacle_mask,
                      terminal_mask=terminal_mask,
                      action_probabilities=[
                          (-1, 0.1),
                          (0, 0.8),
                          (1, 0.1),
                      ],
                      no_action_probability=0.0)

    mdp_solvers = {'Value Iteration': gw.run_value_iterations,
                   'Policy Iteration': gw.run_policy_iterations}

    for solver_name, solver_fn in mdp_solvers.items():
        print('Final result of {}:'.format(solver_name))
        policy_grids, utility_grids = solver_fn(iterations=25, discount=0.5)
        print(policy_grids[:, :, -1])
        print(utility_grids[:, :, -1])
        plt.figure()
        gw.plot_policy(utility_grids[:, :, -1])
        plot_convergence(utility_grids, policy_grids)
        plt.show()

    ql = QLearner(num_states=(shape[0] * shape[1]),
                  num_actions=4,
                  learning_rate=0.8,
                  discount_rate=0.9,
                  random_action_prob=0.5,
                  random_action_decay_rate=0.99,
                  dyna_iterations=0)

    start_state = gw.grid_coordinates_to_indices(start)

    iterations = 1000
    flat_policies, flat_utilities = ql.learn(start_state,
                                             gw.generate_experience,
                                             iterations=iterations)

    new_shape = (gw.shape[0], gw.shape[1], iterations)
    ql_utility_grids = flat_utilities.reshape(new_shape)
    ql_policy_grids = flat_policies.reshape(new_shape)
    print('Final result of QLearning:')
    print(ql_policy_grids[:, :, -1])
    print(ql_utility_grids[:, :, -1])

    plt.figure()
    gw.plot_policy(ql_utility_grids[:, :, -1], ql_policy_grids[:, :, -1])
    plot_convergence(ql_utility_grids, ql_policy_grids)
    plt.show()
--------------------------------------------------------------------------------