├── LICENSE.txt
├── README.md
├── gridworld.py
├── qlearn.py
└── rl.py

/LICENSE.txt:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Kevin Hanselman, Carl Ericson

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Overview

This is the code for [this](https://www.youtube.com/watch?v=PYQAI6Td2wo) video on YouTube by Siraj Raval on sensor networks. We can model a sensor network as a grid, then use the Bellman equation to compute the optimal policy for getting from Router A to Router B as efficiently as possible. The repository contains implementations of MDP value iteration, MDP policy iteration, and Q-learning in a toy grid-world setting.

## Dependencies

* matplotlib
* OpenCV
* numpy

Install missing dependencies using [pip](https://pypi.org/project/pip/).

## Usage

Run `python rl.py` in a terminal.

## Credits

Credits for this code go to [kevlar1818](https://github.com/kevlar1818/grid-world-rl). I've merely created a wrapper to get people started.
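
## Background: the Bellman update

Both MDP solvers in `gridworld.py` repeatedly apply the Bellman backup, `V(s) <- R(s) + gamma * max_a sum_s' T(s' | s, a) * V(s')`. The snippet below is a minimal sketch of that idea on a deterministic 3x4 grid with the same rewards and discount (0.5) that `rl.py` uses; it leaves out the stochastic action model and the obstacle handling that `gridworld.py` implements, so treat it as an illustration rather than the actual solver.

```python
import numpy as np

# Toy 3x4 grid: goal at (0, 3) worth +1, trap at (1, 3) worth -1,
# every other cell has a step cost of -0.1 (same rewards as rl.py).
rewards = np.full((3, 4), -0.1)
rewards[0, 3], rewards[1, 3] = 1.0, -1.0
terminal = np.zeros((3, 4), dtype=bool)
terminal[0, 3] = terminal[1, 3] = True

gamma = 0.5                                   # discount factor
moves = [(-1, 0), (0, 1), (1, 0), (0, -1)]    # up, right, down, left

V = np.zeros_like(rewards)
for _ in range(25):
    V_new = rewards.copy()                    # terminal states keep their reward
    for r in range(3):
        for c in range(4):
            if terminal[r, c]:
                continue
            # Deterministic Bellman backup: best one-step lookahead value,
            # with moves that would leave the grid clipped back inside it.
            best = max(V[min(max(r + dr, 0), 2), min(max(c + dc, 0), 3)]
                       for dr, dc in moves)
            V_new[r, c] = rewards[r, c] + gamma * best
    V = V_new

print(np.round(V, 3))
```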


--------------------------------------------------------------------------------
/gridworld.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
import cv2


class GridWorldMDP:

    # up, right, down, left
    _direction_deltas = [
        (-1, 0),
        (0, 1),
        (1, 0),
        (0, -1),
    ]
    _num_actions = len(_direction_deltas)

    def __init__(self,
                 reward_grid,
                 terminal_mask,
                 obstacle_mask,
                 action_probabilities,
                 no_action_probability):

        self._reward_grid = reward_grid
        self._terminal_mask = terminal_mask
        self._obstacle_mask = obstacle_mask
        self._T = self._create_transition_matrix(
            action_probabilities,
            no_action_probability,
            obstacle_mask
        )

    @property
    def shape(self):
        return self._reward_grid.shape

    @property
    def size(self):
        return self._reward_grid.size

    @property
    def reward_grid(self):
        return self._reward_grid

    def run_value_iterations(self, discount=1.0,
                             iterations=10):
        utility_grids, policy_grids = self._init_utility_policy_storage(iterations)

        utility_grid = np.zeros_like(self._reward_grid)
        for i in range(iterations):
            utility_grid = self._value_iteration(utility_grid=utility_grid,
                                                 discount=discount)
            policy_grids[:, :, i] = self.best_policy(utility_grid)
            utility_grids[:, :, i] = utility_grid
        return policy_grids, utility_grids

    def run_policy_iterations(self, discount=1.0,
                              iterations=10):
        utility_grids, policy_grids = self._init_utility_policy_storage(iterations)

        policy_grid = np.random.randint(0, self._num_actions,
                                        self.shape)
        utility_grid = self._reward_grid.copy()

        for i in range(iterations):
            policy_grid, utility_grid = self._policy_iteration(
                policy_grid=policy_grid,
                utility_grid=utility_grid,
                discount=discount
            )
            policy_grids[:, :, i] = policy_grid
            utility_grids[:, :, i] = utility_grid
        return policy_grids, utility_grids

    def generate_experience(self, current_state_idx, action_idx):
        sr, sc = self.grid_indices_to_coordinates(current_state_idx)
        next_state_probs = self._T[sr, sc, action_idx, :, :].flatten()

        next_state_idx = np.random.choice(np.arange(next_state_probs.size),
                                          p=next_state_probs)

        return (next_state_idx,
                self._reward_grid.flatten()[next_state_idx],
                self._terminal_mask.flatten()[next_state_idx])

    def grid_indices_to_coordinates(self, indices=None):
        if indices is None:
            indices = np.arange(self.size)
        return np.unravel_index(indices, self.shape)

    def grid_coordinates_to_indices(self, coordinates=None):
        # Annoyingly, this doesn't work for negative indices.
        # The mode='wrap' parameter only works on positive indices.
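        # For example, on the 3x4 grid used in rl.py, the start coordinate
        # (2, 0) maps to flat index 2 * 4 + 0 == 8 under row-major ordering.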
        if coordinates is None:
            return np.arange(self.size)
        return np.ravel_multi_index(coordinates, self.shape)

    def best_policy(self, utility_grid):
        M, N = self.shape
        return np.argmax((utility_grid.reshape((1, 1, 1, M, N)) * self._T)
                         .sum(axis=-1).sum(axis=-1), axis=2)

    def _init_utility_policy_storage(self, depth):
        M, N = self.shape
        utility_grids = np.zeros((M, N, depth))
        policy_grids = np.zeros_like(utility_grids)
        return utility_grids, policy_grids

    def _create_transition_matrix(self,
                                  action_probabilities,
                                  no_action_probability,
                                  obstacle_mask):
        M, N = self.shape

        T = np.zeros((M, N, self._num_actions, M, N))

        r0, c0 = self.grid_indices_to_coordinates()

        T[r0, c0, :, r0, c0] += no_action_probability

        for action in range(self._num_actions):
            for offset, P in action_probabilities:
                direction = (action + offset) % self._num_actions

                dr, dc = self._direction_deltas[direction]
                r1 = np.clip(r0 + dr, 0, M - 1)
                c1 = np.clip(c0 + dc, 0, N - 1)

                temp_mask = obstacle_mask[r1, c1].flatten()
                r1[temp_mask] = r0[temp_mask]
                c1[temp_mask] = c0[temp_mask]

                T[r0, c0, action, r1, c1] += P

        terminal_locs = np.where(self._terminal_mask.flatten())[0]
        T[r0[terminal_locs], c0[terminal_locs], :, :, :] = 0
        return T

    def _value_iteration(self, utility_grid, discount=1.0):
        out = np.zeros_like(utility_grid)
        M, N = self.shape
        for i in range(M):
            for j in range(N):
                out[i, j] = self._calculate_utility((i, j),
                                                    discount,
                                                    utility_grid)
        return out

    def _policy_iteration(self, *, utility_grid,
                          policy_grid, discount=1.0):
        r, c = self.grid_indices_to_coordinates()

        M, N = self.shape

        utility_grid = (
            self._reward_grid +
            discount * ((utility_grid.reshape((1, 1, 1, M, N)) * self._T)
                        .sum(axis=-1).sum(axis=-1))[r, c, policy_grid.flatten()]
            .reshape(self.shape)
        )

        utility_grid[self._terminal_mask] = self._reward_grid[self._terminal_mask]

        return self.best_policy(utility_grid), utility_grid

    def _calculate_utility(self, loc, discount, utility_grid):
        if self._terminal_mask[loc]:
            return self._reward_grid[loc]
        row, col = loc
        return np.max(
            discount * np.sum(
                np.sum(self._T[row, col, :, :, :] * utility_grid,
                       axis=-1),
                axis=-1)
        ) + self._reward_grid[loc]

    def plot_policy(self, utility_grid, policy_grid=None):
        if policy_grid is None:
            policy_grid = self.best_policy(utility_grid)
        markers = "^>v<"
        marker_size = 200 // np.max(policy_grid.shape)
        marker_edge_width = marker_size // 10
        marker_fill_color = 'w'

        no_action_mask = self._terminal_mask | self._obstacle_mask

        utility_normalized = (utility_grid - utility_grid.min()) / \
            (utility_grid.max() - utility_grid.min())

        utility_normalized = (255 * utility_normalized).astype(np.uint8)

        utility_rgb = cv2.applyColorMap(utility_normalized, cv2.COLORMAP_JET)
        for i in range(3):
            channel = utility_rgb[:, :, i]
            channel[self._obstacle_mask] = 0

        plt.imshow(utility_rgb[:, :, ::-1], interpolation='none')

        for i, marker in enumerate(markers):
            y, x = np.where((policy_grid == i) & np.logical_not(no_action_mask))
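            # np.where returns (row, col) index arrays; matplotlib expects x
            # first, so the columns are passed as x and the rows as y.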
            plt.plot(x, y, marker, ms=marker_size, mew=marker_edge_width,
                     color=marker_fill_color)

        y, x = np.where(self._terminal_mask)
        plt.plot(x, y, 'o', ms=marker_size, mew=marker_edge_width,
                 color=marker_fill_color)

        tick_step_options = np.array([1, 2, 5, 10, 20, 50, 100])
        tick_step = np.max(policy_grid.shape) / 8
        best_option = np.argmin(np.abs(np.log(tick_step) - np.log(tick_step_options)))
        tick_step = tick_step_options[best_option]
        plt.xticks(np.arange(0, policy_grid.shape[1] - 0.5, tick_step))
        plt.yticks(np.arange(0, policy_grid.shape[0] - 0.5, tick_step))
        plt.xlim([-0.5, policy_grid.shape[1] - 0.5])
        # Keep the y-axis inverted so it matches imshow's row ordering.
        plt.ylim([policy_grid.shape[0] - 0.5, -0.5])

--------------------------------------------------------------------------------
/qlearn.py:
--------------------------------------------------------------------------------
import numpy as np
import random as rand


class QLearner:
    '''A generic implementation of Q-Learning and Dyna-Q'''

    def __init__(self, *,
                 num_states,
                 num_actions,
                 learning_rate,
                 discount_rate=1.0,
                 random_action_prob=0.5,
                 random_action_decay_rate=0.99,
                 dyna_iterations=0):

        self._num_states = num_states
        self._num_actions = num_actions
        self._learning_rate = learning_rate
        self._discount_rate = discount_rate
        self._random_action_prob = random_action_prob
        self._random_action_decay_rate = random_action_decay_rate
        self._dyna_iterations = dyna_iterations

        self._experiences = []

        # Initialize Q to small random values.
        self._Q = np.zeros((num_states, num_actions), dtype=float)
        self._Q += np.random.normal(0, 0.3, self._Q.shape)

    def initialize(self, state):
        '''Set the initial state and return the learner's first action'''
        self._decide_next_action(state)
        self._stored_state = state
        return self._stored_action

    def learn(self, initial_state, experience_func, iterations=100):
        '''Iteratively experience new states and rewards'''
        all_policies = np.zeros((self._num_states, iterations))
        all_utilities = np.zeros_like(all_policies)
        for i in range(iterations):
            done = False
            self.initialize(initial_state)
            for j in range(iterations):
                state, reward, done = experience_func(self._stored_state,
                                                      self._stored_action)
                self.experience(state, reward)
                if done:
                    break

            policy, utility = self.get_policy_and_utility()
            all_policies[:, i] = policy
            all_utilities[:, i] = utility
        return all_policies, all_utilities

    def experience(self, state, reward):
        '''The learner experiences state and receives a reward'''
        self._update_Q(self._stored_state, self._stored_action, state, reward)

        if self._dyna_iterations > 0:
            self._experiences.append(
                (self._stored_state, self._stored_action, state, reward)
            )
            exp_idx = np.random.choice(len(self._experiences),
                                       self._dyna_iterations)
            for i in exp_idx:
                self._update_Q(*self._experiences[i])

        # determine an action and update the current state
        self._decide_next_action(state)
        self._stored_state = state

        self._random_action_prob *= self._random_action_decay_rate

        return self._stored_action

    def get_policy_and_utility(self):
        policy = np.argmax(self._Q, axis=1)
        utility = np.max(self._Q, axis=1)
        return policy, utility

    def _update_Q(self, s, a, s_prime, r):
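        # Standard Q-learning update:
        #   Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a'))
        # where alpha is the learning rate and gamma the discount rate.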
        best_reward = self._Q[s_prime, self._find_best_action(s_prime)]
        self._Q[s, a] *= (1 - self._learning_rate)
        self._Q[s, a] += (self._learning_rate
                          * (r + self._discount_rate * best_reward))

    def _decide_next_action(self, state):
        if rand.random() <= self._random_action_prob:
            self._stored_action = rand.randint(0, self._num_actions - 1)
        else:
            self._stored_action = self._find_best_action(state)

    def _find_best_action(self, state):
        return int(np.argmax(self._Q[state, :]))

--------------------------------------------------------------------------------
/rl.py:
--------------------------------------------------------------------------------
from gridworld import GridWorldMDP
from qlearn import QLearner

import numpy as np
import matplotlib.pyplot as plt


def plot_convergence(utility_grids, policy_grids):
    fig, ax1 = plt.subplots()
    ax2 = ax1.twinx()
    utility_ssd = np.sum(np.square(np.diff(utility_grids)), axis=(0, 1))
    ax1.plot(utility_ssd, 'b.-')
    ax1.set_ylabel('Change in Utility', color='b')

    policy_changes = np.count_nonzero(np.diff(policy_grids), axis=(0, 1))
    ax2.plot(policy_changes, 'r.-')
    ax2.set_ylabel('Change in Best Policy', color='r')


if __name__ == '__main__':
    shape = (3, 4)
    goal = (0, -1)
    trap = (1, -1)
    obstacle = (1, 1)
    start = (2, 0)
    default_reward = -0.1
    goal_reward = 1
    trap_reward = -1

    reward_grid = np.zeros(shape) + default_reward
    reward_grid[goal] = goal_reward
    reward_grid[trap] = trap_reward
    reward_grid[obstacle] = 0

    terminal_mask = np.zeros_like(reward_grid, dtype=bool)
    terminal_mask[goal] = True
    terminal_mask[trap] = True

    obstacle_mask = np.zeros_like(reward_grid, dtype=bool)
    obstacle_mask[obstacle] = True

    gw = GridWorldMDP(reward_grid=reward_grid,
                      obstacle_mask=obstacle_mask,
                      terminal_mask=terminal_mask,
                      action_probabilities=[
                          (-1, 0.1),
                          (0, 0.8),
                          (1, 0.1),
                      ],
                      no_action_probability=0.0)

    mdp_solvers = {'Value Iteration': gw.run_value_iterations,
                   'Policy Iteration': gw.run_policy_iterations}

    for solver_name, solver_fn in mdp_solvers.items():
        print('Final result of {}:'.format(solver_name))
        policy_grids, utility_grids = solver_fn(iterations=25, discount=0.5)
        print(policy_grids[:, :, -1])
        print(utility_grids[:, :, -1])
        plt.figure()
        gw.plot_policy(utility_grids[:, :, -1])
        plot_convergence(utility_grids, policy_grids)
        plt.show()

    ql = QLearner(num_states=(shape[0] * shape[1]),
                  num_actions=4,
                  learning_rate=0.8,
                  discount_rate=0.9,
                  random_action_prob=0.5,
                  random_action_decay_rate=0.99,
                  dyna_iterations=0)

    start_state = gw.grid_coordinates_to_indices(start)

    iterations = 1000
    flat_policies, flat_utilities = ql.learn(start_state,
                                             gw.generate_experience,
                                             iterations=iterations)

    new_shape = (gw.shape[0], gw.shape[1], iterations)
    ql_utility_grids = flat_utilities.reshape(new_shape)
    ql_policy_grids = flat_policies.reshape(new_shape)
    print('Final result of QLearning:')
    print(ql_policy_grids[:, :, -1])
    print(ql_utility_grids[:, :, -1])

    plt.figure()
    gw.plot_policy(ql_utility_grids[:, :, -1], ql_policy_grids[:, :, -1])
    plot_convergence(ql_utility_grids, ql_policy_grids)
    plt.show()
--------------------------------------------------------------------------------