├── scripts
│   ├── main.py
│   ├── maze_environment.py
│   └── maze_solver.py
├── programming assignment adprl.pdf
├── gundogan_alperen_03694565_report.pdf
├── maze.txt
└── README.md

/programming assignment adprl.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gundoganalperen/adp_rl/HEAD/programming assignment adprl.pdf
--------------------------------------------------------------------------------
/gundogan_alperen_03694565_report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gundoganalperen/adp_rl/HEAD/gundogan_alperen_03694565_report.pdf
--------------------------------------------------------------------------------
/maze.txt:
--------------------------------------------------------------------------------
# This is the definition of a maze
# Lines starting with # must be ignored
# 1: Wall 0: Free
# S: Start G: Goal T: Trap
#
1 1 1 1 1 1 1 1 1 1
1 0 0 0 0 0 1 0 0 1
1 0 1 1 1 0 0 0 0 1
1 0 1 T 1 0 1 0 S 1
1 0 0 0 0 0 1 0 0 1
1 0 1 1 1 1 1 1 1 1
1 0 0 0 0 0 0 0 G 1
1 1 1 1 1 1 1 1 1 1
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# adp_rl
Approximate Dynamic Programming and Reinforcement Learning - Programming Assignment

The purpose of this assignment is to implement a simple maze environment and to learn optimal decisions inside the maze by solving the problem with Dynamic Programming. Value Iteration (VI) and Policy Iteration (PI), i.e. Policy Evaluation followed by Policy Improvement, are implemented and analyzed.

Run the `python main.py /absolute/path/to/maze.txt` command to launch the application.
--------------------------------------------------------------------------------
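The experiment can also be driven from a Python session instead of the main.py entry point. The following is a minimal sketch based on the classes dumped below; the maze path, cost function and discount factor are illustrative choices, not values prescribed by the assignment.

# Hypothetical driver (run from inside scripts/ so the modules can be imported).
from maze_solver import Maze_solver

solver = Maze_solver()
solver.read_file("maze.txt")
solver.maze_env.build_transition_probability_matrix(cost_function=1)
policy, V, iterations = solver.value_iteration(discount_factor=0.9, max_iteration=1000)
print("converged after", iterations, "iterations;",
      "value at the start state:", V[solver.maze_env.start])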
/scripts/main.py:
--------------------------------------------------------------------------------
"""
Approximate Dynamic Programming & Reinforcement Learning - WS 2018
Programming Assignment
Alperen Gundogan

30.01.2019

Command necessary to test the code:
python main.py maze.txt
"""
from __future__ import division
import sys
import os
from maze_environment import Maze_env
from maze_solver import Maze_solver

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

MAX_ITER = 1000

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('Arguments: ', len(sys.argv))
        sys.exit(1)
    script = sys.argv[0]
    input_file = sys.argv[1]

    maze_sol = Maze_solver()
    maze_sol.read_file(input_file)
    cost_functions = [1, 2]
    alpha = [0.99, 0.01, 0.3, 0.5, 0.7, 0.9]

    for g in cost_functions:

        maze_sol.maze_env.build_transition_probability_matrix(cost_function=g)
        for a in alpha:

            policy, v, it = maze_sol.policy_iteration(discount_factor=a, max_iteration=MAX_ITER)
            print("Policy Iteration: " + str(it) + " Cost function: " + str(g) + " Discount factor: " + str(a))
            text = "PI" + ",g" + str(g) + ",a=" + str(a)
            maze_sol.plot_error(maze_sol.gt_PI[g-1], maze_sol.values, text)
            maze_sol.visualize_results(v, policy, text)

            policy, v, it = maze_sol.value_iteration(discount_factor=a, max_iteration=MAX_ITER)
            print("Value Iteration: " + str(it) + " Cost function: " + str(g) + " Discount factor: " + str(a))
            text_v = "VI" + ",g" + str(g) + ",a=" + str(a)

            maze_sol.plot_error(maze_sol.gt_VI[g-1], maze_sol.values, text_v)
            maze_sol.visualize_results(v, policy, text_v)

    plt.show()
--------------------------------------------------------------------------------
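The environment module reproduced next encodes the maze as an MDP: build_transition_probability_matrix() fills a nested dictionary P in which P[s][a] is a list of (probability, next_state, one_step_cost, goal_reached) tuples. A purely illustrative entry is sketched here; the state indices are made up, while the 0.1 slip probability and the unit energy cost correspond to the module constants PRO and COST_ENERGY.

# Hypothetical excerpt of the transition model for a free cell in a 10-column maze,
# built with cost_function=2. Moving UP from state 14 aims at state 4; with
# probability 0.1 each the agent slips to a free neighbour of the intended cell.
P_example = {
    14: {
        0: [                     # action UP
            (0.1, 3, 1, False),  # slipped to the left neighbour of the intended cell
            (0.1, 5, 1, False),  # slipped to the right neighbour of the intended cell
            (0.8, 4, 1, False),  # reached the intended cell
        ],
    },
}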
/scripts/maze_environment.py:
--------------------------------------------------------------------------------
"""
Approximate Dynamic Programming & Reinforcement Learning - WS 2018
Programming Assignment
Alperen Gundogan

30.01.2019

Command necessary to test the code:
python main.py maze.txt
"""
import numpy as np

UP = 0
RIGHT = 1
DOWN = 2
LEFT = 3
IDLE = 4

# Simulation parameters.
# Costs are defined with a positive sign and the solver minimizes the value function;
# reaching the goal yields a negative cost (i.e. a reward).
COST_TRAP = 50
COST_ENERGY = 1
GOAL_REACHED = -1
# Probability of slipping into a cell adjacent to the intended one when an action is executed.
PRO = 0.1
SKIP_GOAL_STATE = 0


class Maze_env(object):

    def __init__(self):
        # Matrix of lists to store the maze.
        self.maze = []
        # Stores the maze with state numbers, e.g. [1 2 3 ..\n 11 12 ... 79].
        self.grid = []
        # Stores the maze in vector form.
        self.maze_vec = []
        # Stores the dimensions of the maze.
        self.shape = []
        # Number of states.
        self.nS = 0
        # Number of actions = {UP, RIGHT, DOWN, LEFT, IDLE}.
        self.nA = 5
        # Number of rows (y direction) of the maze.
        self.max_y = 0
        # Number of columns (x direction) of the maze.
        self.max_x = 0
        # Transition probability matrix.
        self.P = {}
        # Location of the start state.
        self.start = 0
        # Location of the goal state in the maze.
        self.goal = 0
        # Stores all the wall locations in a tuple.
        self.walls = ()
        # Stores all the trap locations in a tuple.
        self.traps = ()
        # Stores the goal location using coordinates, e.g. (x, y) = (0, 1).
        self.goal_loc = ()
        # Stores the start location using coordinates, e.g. (x, y) = (0, 1).
        self.start_loc = ()
        # Stores all the wall locations using coordinates, e.g. (x, y) = (0, 1).
        self.walls_loc = ()
        # Stores all the trap locations using coordinates, e.g. (x, y) = (0, 1).
        self.traps_loc = ()
        # Stores the available (non-wall) states.
        self.ava_states = ()

    def set_environment(self):
        """
        Set the variables of the environment.
        :return:
        """
        self.shape = self.maze.shape
        self.nS = np.prod(self.shape)

        self.max_y = self.shape[0]
        self.max_x = self.shape[1]
        self.grid = np.arange(self.nS).reshape(self.shape)

        self.ava_states = np.where(self.maze != '1')

        itemindex_g = np.where(self.maze == 'G')
        self.goal_loc = itemindex_g
        self.goal = self.grid[itemindex_g[0][0]][itemindex_g[1][0]]

        itemindex_s = np.where(self.maze == 'S')
        self.start_loc = itemindex_s
        self.start = self.grid[itemindex_s[0][0]][itemindex_s[1][0]]

        itemindex_1 = np.where(self.maze == '1')
        self.walls_loc = itemindex_1
        wall_list = []
        for var in range(len(self.walls_loc[0])):
            wall_list.append(self.grid[itemindex_1[0][var]][itemindex_1[1][var]])
        self.walls = tuple(wall_list)

        itemindex_t = np.where(self.maze == 'T')
        self.traps_loc = itemindex_t
        trap_list = []
        for var in range(len(self.traps_loc[0])):
            trap_list.append(self.grid[itemindex_t[0][var]][itemindex_t[1][var]])
        self.traps = tuple(trap_list)

        self.maze_vec = self.maze.flatten()

    def build_transition_probability_matrix(self, cost_function):
        """
        Set the transition probabilities for the given maze and cost function.
        :param cost_function: cost function (1 or 2) used when filling the transition probability matrix.

        :return:
        """
        it = np.nditer(self.grid, flags=['multi_index'])
        while not it.finished:
            s = it.iterindex
            y, x = it.multi_index

            # Skip the transition probabilities for walls.
            if self.maze[y][x] == '1':
                it.iternext()
                continue

            # P[s][a] = (prob, next_state, cost, is_done)
            self.P[s] = {a: [] for a in range(self.nA)}

            is_goal_reached = lambda s: s == self.goal

            # We're stuck in a terminal state.
            if is_goal_reached(s):
                self.P[s][IDLE] = [(1.0, s, 0, True)]
            # Not a terminal state.
            else:
                for a in self.available_actions(s):
                    self.P[s][a] = self.determine_probabilities(s, a, cost_function)

            it.iternext()

    # If the next state is a wall, the probability of moving into it is zero;
    # only available actions are passed in.
    def determine_probabilities(self, state, a, cost_function):
        """
        Determine the transition probabilities for the given state-action pair. Remember that the floor is slippery.
        :param state: Current state number.
        :param a: Action taken in the state.
        :param cost_function: Cost function (1 or 2) used for the one-step costs.
        :return: List of (probability, next_state, cost, done) tuples for the given state and action.
        """
        action_pro = []
        is_goal_reached = lambda s: s == self.goal
        adjacents = 0
        if a == UP:
            ns = state - self.max_x
            # Left adjacent of the next state, because of the slippery floor.
            ns_l = ns - 1
            # Right adjacent of the next state, because of the slippery floor.
            ns_r = ns + 1
            # If the left adjacent of the intended cell is not a wall.
            if self.maze.flat[ns_l] != '1':
                action_pro.append(tuple((PRO, ns_l, self.one_step_cost(state, ns_l, cost_function), is_goal_reached(ns_l))))
                adjacents += 1
            if self.maze.flat[ns_r] != '1':
                action_pro.append(tuple((PRO, ns_r, self.one_step_cost(state, ns_r, cost_function), is_goal_reached(ns_r))))
                adjacents += 1
            action_pro.append(tuple((1 - adjacents*PRO, ns, self.one_step_cost(state, ns, cost_function), is_goal_reached(ns))))
        elif a == RIGHT:
            ns = state + 1
            # Left adjacent of the next state, because of the slippery floor.
            ns_l = ns - self.max_x
            # Right adjacent of the next state, because of the slippery floor.
            ns_r = ns + self.max_x
            # If the left adjacent of the intended cell is not a wall.
            if self.maze.flat[ns_l] != '1':
                action_pro.append(tuple((PRO, ns_l, self.one_step_cost(state, ns_l, cost_function), is_goal_reached(ns_l))))
                adjacents += 1
            if self.maze.flat[ns_r] != '1':
                action_pro.append(tuple((PRO, ns_r, self.one_step_cost(state, ns_r, cost_function), is_goal_reached(ns_r))))
                adjacents += 1
            action_pro.append(tuple((1 - adjacents*PRO, ns, self.one_step_cost(state, ns, cost_function), is_goal_reached(ns))))

        elif a == DOWN:
            ns = state + self.max_x
            # Left adjacent of the next state, because of the slippery floor.
            ns_l = ns + 1
            # Right adjacent of the next state, because of the slippery floor.
            ns_r = ns - 1
            # If the left adjacent of the intended cell is not a wall.
            if self.maze.flat[ns_l] != '1':
                action_pro.append(tuple((PRO, ns_l, self.one_step_cost(state, ns_l, cost_function), is_goal_reached(ns_l))))
                adjacents += 1
            if self.maze.flat[ns_r] != '1':
                action_pro.append(tuple((PRO, ns_r, self.one_step_cost(state, ns_r, cost_function), is_goal_reached(ns_r))))
                adjacents += 1
            action_pro.append(tuple((1 - adjacents*PRO, ns, self.one_step_cost(state, ns, cost_function), is_goal_reached(ns))))

        elif a == LEFT:
            ns = state - 1
            # Left adjacent of the next state, because of the slippery floor.
            ns_l = ns + self.max_x
            # Right adjacent of the next state, because of the slippery floor.
            ns_r = ns - self.max_x
            # If the left adjacent of the intended cell is not a wall.
            if self.maze.flat[ns_l] != '1':
                action_pro.append(tuple((PRO, ns_l, self.one_step_cost(state, ns_l, cost_function), is_goal_reached(ns_l))))
                adjacents += 1
            if self.maze.flat[ns_r] != '1':
                action_pro.append(tuple((PRO, ns_r, self.one_step_cost(state, ns_r, cost_function), is_goal_reached(ns_r))))
                adjacents += 1
            action_pro.append(tuple((1 - adjacents*PRO, ns, self.one_step_cost(state, ns, cost_function), is_goal_reached(ns))))

        # If it is an IDLE action in the state.
        else:
            action_pro.append(tuple((1.0, state, self.one_step_cost(state, state, cost_function), is_goal_reached(state))))

        return action_pro

    def available_actions(self, state):
        """
        Determine the available actions for the given state.
        :param state: Current state in the maze.
        :return: List of actions that are available in that state.
        """
        actions = []
        ns_up = state - self.max_x
        ns_right = state + 1
        ns_down = state + self.max_x
        ns_left = state - 1
        # IDLE is always available; it is the only action once the goal state is reached.
        actions.append(IDLE)
        if self.maze.flat[state] == 'G':
            return actions
        if self.maze.flat[ns_up] != '1':
            actions.append(UP)
        if self.maze.flat[ns_right] != '1':
            actions.append(RIGHT)
        if self.maze.flat[ns_down] != '1':
            actions.append(DOWN)
        if self.maze.flat[ns_left] != '1':
            actions.append(LEFT)

        return actions

    def one_step_cost(self, state, next_state, cost_function):
        """
        Implements the two cost functions. Under cost function 1, only reaching the terminal goal state
        (negative cost) and stepping into the trap are charged; under cost function 2, every action
        additionally costs energy.
        :param state: Current state.
        :param next_state: Next state.
        :param cost_function: Cost function (1 or 2).
        :return: Total cost for the transition.
        """
        # Define the initial cost as zero.
        cost = 0
        # The trap affects both cost functions.
        if self.maze.flat[next_state] == 'T':
            cost = cost + COST_TRAP

        if cost_function == 1:
            if self.maze.flat[next_state] == 'G':
                cost = cost + GOAL_REACHED
            # Other transitions have no cost.

        elif cost_function == 2:
            # Every action costs energy, including the one that reaches the goal.
            cost = cost + COST_ENERGY

        else:
            print("Undefined cost function is selected.")

        return cost
--------------------------------------------------------------------------------
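Because slip outcomes are only generated for neighbouring cells that are free, with the remaining probability mass assigned to the intended cell, every populated P[s][a] entry should form a proper probability distribution. A self-contained sanity check along these lines is sketched below; it is not part of the repository, and the file name and cost function are example choices.

# Hypothetical check (run from inside scripts/).
from maze_solver import Maze_solver

solver = Maze_solver()
solver.read_file("maze.txt")
solver.maze_env.build_transition_probability_matrix(cost_function=2)

for s, actions in solver.maze_env.P.items():
    for a, outcomes in actions.items():
        if outcomes:  # actions that are unavailable in this state stay empty
            total = sum(prob for prob, _, _, _ in outcomes)
            assert abs(total - 1.0) < 1e-9, (s, a, total)
print("all populated state-action pairs sum to 1")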
/scripts/maze_solver.py:
--------------------------------------------------------------------------------
"""
Approximate Dynamic Programming & Reinforcement Learning - WS 2018
Programming Assignment
Alperen Gundogan

30.01.2019

Command necessary to test the code:
python main.py maze.txt
"""
#from __future__ import division
from maze_environment import Maze_env

import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm

# Determines whether the goal state is skipped or not.
# This is implemented just for some tests.
SKIP_GOAL_STATE = 0
# Discount factor whose result is stored as the ground-truth reference.
GROUND_TRUTH = 0.99


class Maze_solver(object):

    def __init__(self):
        self.maze_env = Maze_env()
        self.policy = []
        self.values = []
        # Initialized as empty lists.
        # Hold the ground truth for g1 and g2, respectively.
        self.gt_PI = []
        self.gt_VI = []
        self.algorithm_name = ""

    def read_file(self, path):
        """
        Read the maze definition file.
        :param path:
        :return:
        """
        maze = []
        with open(path) as f:
            for line in f.read().splitlines():
                line = line.replace(" ", "")
                if not line.startswith("#"):
                    maze.append(list(line))
        self.maze_env.maze = maze
        self.maze_env.maze = np.asarray(self.maze_env.maze)
        self.maze_env.set_environment()

    def policy_evaluation(self, discount_factor, max_iteration, theta=1e-25):
        """
        Runs the policy evaluation algorithm.
        :param discount_factor:
        :param max_iteration:
        :param theta:
        :return: The converged value function and the total number of iterations.
        """
        V = np.zeros(self.maze_env.nS)
        it_eval = 0
        for i in range(max_iteration):
            delta = 0
            it_eval += 1
            # For each state, perform a "full backup".
            for s in range(self.maze_env.nS):
                v = 0
                if s in self.maze_env.walls:
                    continue
                # Look at the possible next actions.
                for a, action_prob in enumerate(self.policy[s]):
                    # For each action, look at the possible next states...
                    for prob, next_state, reward, done in self.maze_env.P[s][a]:
                        # Calculate the expected value.
                        v += action_prob * prob * (reward + discount_factor * V[next_state])
                # How much our value function changed (across any state).
                delta = max(delta, np.abs(v - V[s]))
                V[s] = v
            # Stop evaluating once the value function change is below the threshold.
            if delta < theta:
                break
        return np.array(V), it_eval

    def policy_iteration(self, discount_factor, max_iteration):
        """
        Runs the policy iteration algorithm. We first create a uniform random policy and then
        run the policy evaluation and policy improvement steps, respectively.
        :param discount_factor:
        :param max_iteration:
        :return: The optimal policy and value function together with the total number of iterations.
        """
        self.policy = self.create_random_policy()
        self.values = []
        it = 0
        while True:
            # Evaluation of the current policy.
            V, it_eval = self.policy_evaluation(discount_factor, max_iteration)
            self.values.append(V[self.maze_env.start])
            # This will be set to False if we make any change to the policy.
            optimal_policy = True
            it += 1
            for s in range(self.maze_env.nS):
                # The walls are also part of the state space, but we skip them since no action can be taken inside a wall.
                if s in self.maze_env.walls:
                    continue

                chosen_a = np.argmax(self.policy[s])

                A = self.values_of_actions(s, V, discount_factor)

                # Choose the best action, which minimizes the value function.
                best_a = np.argmin(A)

                # Greedily update the policy.
                if chosen_a != best_a:
                    optimal_policy = False
                self.policy[s] = np.eye(self.maze_env.nA)[best_a]

            if optimal_policy or it == max_iteration:
                if discount_factor == GROUND_TRUTH:
                    self.gt_PI.append(V[self.maze_env.start])
                return self.policy, V, it

    def values_of_actions(self, state, V, discount_factor):
        """
        For the given value function, state and discount factor, returns
        the value of each available action in that state.
        :param state:
        :param V: values of the states, array.
        :param discount_factor:
        :return: Value of each available action in that state.
        """
        # Find the value of each action by looking at the successor states.
        A = np.zeros(self.maze_env.nA)
        av_ac = self.maze_env.available_actions(state)
        for a in range(self.maze_env.nA):
            if a in av_ac:
                for prob, next_state, reward, done in self.maze_env.P[state][a]:
                    A[a] += prob * (reward + discount_factor * V[next_state])
            else:
                A[a] = np.inf

        return A
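    # Note on the backup above: for each action a available in state s,
    # values_of_actions() computes
    #     Q(s, a) = sum_{s'} P(s' | s, a) * (g(s, s') + alpha * V(s')),
    # and the policy improvement step as well as value_iteration() below act
    # greedily by minimizing over a,
    #     V(s) <- min_a Q(s, a),
    # i.e. the Bellman optimality backup for a cost-minimizing MDP.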
    def value_iteration(self, discount_factor, max_iteration, theta=1e-25):
        """
        Runs the value iteration algorithm.
        :param discount_factor:
        :param max_iteration:
        :param theta:
        :return: The optimal policy, the value function and the total number of iterations of the algorithm.
        """
        V = np.zeros(self.maze_env.nS)
        self.values = []
        it = 0
        while it != max_iteration:
            # Stopping condition.
            delta = 0
            # Increase the iteration counter.
            it += 1
            # Update each state.
            for s in range(self.maze_env.nS):
                # The walls are also part of the state space, but we skip them since no action can be taken inside a wall.
                if s in self.maze_env.walls:
                    continue
                # Find the value of each action by looking at the successor states.
                A = self.values_of_actions(s, V, discount_factor)
                best_action_value = np.min(A)
                # Calculate delta across all states seen so far.
                delta = max(delta, np.abs(best_action_value - V[s]))
                # Update the value function. Ref: Sutton book, eq. 4.10.
                V[s] = best_action_value
            self.values.append(V[self.maze_env.start])
            # Check if we can stop.
            if delta < theta:
                break

        # Create "the" policy based on the optimal value function.
        policy = np.zeros([self.maze_env.nS, self.maze_env.nA])

        for s in range(self.maze_env.nS):
            # The walls are also part of the state space, but we skip them since no action can be taken inside a wall.
            if s in self.maze_env.walls:
                continue
            # Find the best action for this state using the corresponding action values.
            A = self.values_of_actions(s, V, discount_factor)
            best_action = np.argmin(A)
            # Always take the best action.
            policy[s, best_action] = 1.0

        if discount_factor == GROUND_TRUTH:
            self.gt_VI.append(V[self.maze_env.start])
        return policy, V, it

    def create_random_policy(self):
        """
        This function creates a uniform random policy.
        :return:
        """
        #self.policy = np.zeros([self.maze_env.nS, self.maze_env.nA])
        policy = np.zeros([self.maze_env.nS, self.maze_env.nA])
        it = np.nditer(self.maze_env.grid, flags=['multi_index'])
        while not it.finished:
            s = it.iterindex
            y, x = it.multi_index
            # Skip the walls.
            if self.maze_env.maze[y][x] == '1':
                it.iternext()
                continue
            # Determine the available actions for the given state.
            actions = self.maze_env.available_actions(s)
            for a in actions:
                policy[s][a] = 1.0/len(actions)

            it.iternext()
        return policy

    def show_maze(self):
        """
        Shows the maze, if needed.
        :return:
        """
        plt.grid('on')
        nrows, ncols = self.maze_env.maze.shape
        ax = plt.gca()
        ax.set_xticks(np.arange(0.5, ncols+1, 1))
        ax.set_yticks(np.arange(0.5, nrows+1, 1))
        ax.set_xticklabels([])
        ax.set_yticklabels([])
        canvas = np.ones((nrows, ncols))

        for var in range(len(self.maze_env.goal_loc[0])):
            row = self.maze_env.goal_loc[0][var]
            col = self.maze_env.goal_loc[1][var]
            canvas[row, col] = 0.5
        for var in range(len(self.maze_env.start_loc[0])):
            row = self.maze_env.start_loc[0][var]
            col = self.maze_env.start_loc[1][var]
            canvas[row, col] = 0.3
        for var in range(len(self.maze_env.traps_loc[0])):
            row = self.maze_env.traps_loc[0][var]
            col = self.maze_env.traps_loc[1][var]
            canvas[row, col] = 0.7
        for var in range(len(self.maze_env.walls_loc[0])):
            row = self.maze_env.walls_loc[0][var]
            col = self.maze_env.walls_loc[1][var]
            canvas[row, col] = 0.0

        canvas = np.array(canvas, dtype=float)

        img = plt.imshow(canvas, interpolation='none', cmap='gray')

        return img

    def plot_value_function(self, text):
        """
        Plots the value function at the start position against the number of iterations.
        :param text:
        :return:
        """
        plt.clf()
        plt.plot(self.values)
        plt.ylabel('Values')
        plt.xlabel('Iterations')
        plt.title(text)
        plt.show()

    def plot_error(self, gt, values, title):
        """
        Plots the absolute distance to the ground truth.
        :param gt: Ground-truth value.
        :param values: Values of the start position w.r.t. the number of iterations.
        :param title:
        :return:
        """
        #plt.clf()
        fig = plt.figure()
        errors = []
        #gt = gt * np.ones(len(values))
        #dist = np.linalg.norm(gt - values)
        for i in range(len(values)):
            errors.append(np.sqrt((gt - values[i])**2))

        iterations = range(1, len(values)+1)

        plt.plot(iterations, errors)
        plt.ylabel('Distance to ground truth')
        plt.xlabel('Iterations')
        plt.title(title)
        plt.tight_layout()
        title = title + "_error"
        directory = "plots/"
        #plt.savefig(title+"."+"png", format="png", dpi=1200)
        #plt.savefig(directory+title+"."+"pdf", format="pdf", dpi=1200)
        #plt.show()

    def visualize_results(self, V, p, title, save=False, format="png"):
        """
        This function creates a heatmap and a quiver plot for the given value function and policy.

        :param V: "The" (optimal) value function.
        :param p: "The" (optimal) policy.
        :param title: Title of the plot.
        :param save:
        :param format:
        :return:
        """
        #plt.clf()
        fig, ax = plt.subplots()
        nrows = self.maze_env.max_y
        ncols = self.maze_env.max_x
        V = np.reshape(V, (nrows, ncols))
        p_shaped = np.reshape(np.argmax(p, axis=1), self.maze_env.shape)
        for var in range(len(self.maze_env.walls_loc[0])):
            row = self.maze_env.walls_loc[0][var]
            col = self.maze_env.walls_loc[1][var]
            V[row][col] = np.nan
            p_shaped[row][col] = -1
        # Masked array to hide the walls.
        masked_array = np.ma.array(V, mask=np.isnan(V))
        current_cmap = cm.get_cmap()
        current_cmap.set_bad('black')
        im = ax.imshow(masked_array, cmap=current_cmap)
        plt.colorbar(im, fraction=0.046, pad=0.04)
        plt.title(title)

        y_pos = self.maze_env.ava_states[0]
        x_pos = self.maze_env.ava_states[1]
        x_direct, y_direct = self.helper_quiver_plot(p_shaped)
        ax.quiver(x_pos, y_pos, x_direct, y_direct, scale=20)

        plt.tight_layout()

        #directory = "plots/"
        #plt.savefig(title+"."+"png", format="png", dpi=1200)
        #plt.savefig(directory+title+"."+"pdf", format="pdf", dpi=1200)

    def helper_quiver_plot(self, p_shaped):
        """
        Helper for the quiver plot.
        :param p_shaped: policy array with the shape of the maze.
        :return: x and y arrow components for each available state.
        """
        x_direct = []
        y_direct = []

        for j in range(self.maze_env.max_y):
            for i in range(self.maze_env.max_x):
                # Skip walls.
                if p_shaped[j][i] == -1:
                    continue
                # Up.
                if p_shaped[j][i] == 0:
                    x_direct.append(0)
                    y_direct.append(1)
                # Right.
                elif p_shaped[j][i] == 1:
                    x_direct.append(1)
                    y_direct.append(0)
                # Down.
                elif p_shaped[j][i] == 2:
                    x_direct.append(0)
                    y_direct.append(-1)
                # Left.
                elif p_shaped[j][i] == 3:
                    x_direct.append(-1)
                    y_direct.append(0)
                # Idle.
                elif p_shaped[j][i] == 4:
                    x_direct.append(0)
                    y_direct.append(0)

        return x_direct, y_direct
--------------------------------------------------------------------------------
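As a closing cross-check (not part of the repository), Policy Iteration and Value Iteration can be run back to back; with the dynamics and costs defined above they should agree on the greedy policy for the non-wall states, and their value functions should be close. The discount factor and cost function below are example values.

# Hypothetical comparison script (run from inside scripts/).
import numpy as np
from maze_solver import Maze_solver

solver = Maze_solver()
solver.read_file("maze.txt")
solver.maze_env.build_transition_probability_matrix(cost_function=1)

pi_policy, pi_V, _ = solver.policy_iteration(discount_factor=0.9, max_iteration=1000)
vi_policy, vi_V, _ = solver.value_iteration(discount_factor=0.9, max_iteration=1000)

free = [s for s in range(solver.maze_env.nS) if s not in solver.maze_env.walls]
print("same greedy policy:",
      np.array_equal(np.argmax(pi_policy[free], axis=1), np.argmax(vi_policy[free], axis=1)))
print("max value difference:", np.max(np.abs(pi_V[free] - vi_V[free])))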