├── Learner.py
├── README.md
├── World.py
├── example_grid.jpg
└── q_learning_sample
    └── main.py

/Learner.py:
--------------------------------------------------------------------------------
from World import World
import threading
import time

def init_vars(game):
    discount = 0.3
    actions = game.ACTIONS
    states = []
    Q = {}

    # Create the base for the Q-matrix: one entry per cell, one value per action
    for i in range(game.AXIS_X):
        for j in range(game.AXIS_Y):
            states.append((i, j))

    for state in states:
        temp = {}
        for action in [item[0] for item in actions]:
            temp[action] = 0.1
        Q[state] = temp

    # Object cells start out with their reward as Q-value
    for (i, j, c, w) in game.OBJECTS:
        for action in [item[0] for item in actions]:
            Q[(i, j)][action] = w

    return discount, actions, Q

def do_action(action, game, actions):
    s = game.Player
    r = -game.score
    coords = [i for i in actions if i[0] == action]

    x, y = coords[0][1:]
    game.try_move(x, y)

    s2 = game.Player
    r += game.score
    return s, action, r, s2

def max_Q(s, Q):
    # Return the greedy action and its value for state s
    val = max(list(Q[s].values()))
    pos = [item for item in Q[s].items() if val == item[1]]
    act, val = pos[0]

    return act, val

def inc_Q(s, a, alpha, inc, Q):
    Q[s][a] *= 1 - alpha
    Q[s][a] += alpha * inc

def render_game(game):
    game.render_grid()
    game.board.grid(row=0, column=0)
    base_1 = game.WIDTH * 0.2
    base_2 = game.WIDTH * 0.8

    # Draw the player with the same inset as World.__init__
    game.me = game.board.create_rectangle(game.Player[0]*game.WIDTH + base_1, game.Player[1]*game.WIDTH + base_1,
                                          game.Player[0]*game.WIDTH + base_2, game.Player[1]*game.WIDTH + base_2,
                                          fill="orange", width=1, tag="me")

def run(discount, game, Q, actions):
    time.sleep(1)

    # Render the grid and the player once, before learning starts
    render_game(game)

    alpha = 1
    t = 1
    while True:
        # Pick the greedy action for the current state
        s = game.Player
        max_act, max_val = max_Q(s, Q)
        (s, a, r, s2) = do_action(max_act, game, actions)

        # Update Q
        max_act, max_val = max_Q(s2, Q)
        inc_Q(s, a, alpha, r + discount * max_val, Q)

        # Check if the game has restarted
        t += 1.0
        if game.has_restarted():
            game.restart_game()
            time.sleep(0.01)
            t = 1.0

        # Update the learning rate
        alpha = pow(t, -0.1)

        # MODIFY THIS SLEEP IF THE GAME IS GOING TOO FAST.
        time.sleep(0.3)

def main():
    # Init game -- input: width, x_axis & y_axis
    game = World(100, 5, 5)

    # Init vars
    discount, actions, Q = init_vars(game)

    # Start game
    t = threading.Thread(target=run, args=(discount, game, Q, actions))
    t.daemon = True
    t.start()
    game.start_game()

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Introduction
==========================
The challenge was to build a Q-learning algorithm on top of existing code (referenced in the credits). The Improvements section below describes which improvements I made.

Run
==========================
Run `python3 Learner.py`

Improvements
==========================
I improved the code by changing the following:
- Replaced unnecessary functions/code (lots of code cleaning).
- Optimized by using list comprehensions (loads of for loops were used, which wasn't necessary in all cases).
- Used classes ('World' is now a class, which in my opinion is a cleaner way of coding).
- Merged walls/specials into a single objects variable.
- Walls give a negative reward of -1 to discourage the bot from going near them. I could have used a very high penalty,
  but I believe we want the bot to totally avoid the red square and avoid the walls as much as possible.
  If we tell the player that there is a huge loss for touching a wall, the bot won't go near the walls anymore,
  which is a problem since there are lots of them. We want the bot to know that it shouldn't go near the walls,
  but it should also know that there is a higher loss for going near the red squares.
- After every reset the player position is changed so that the Q-matrix gets initialized quicker. This way we have a higher chance that every cell is visited.
- Added an extra function so that the player/green square is not initialized on an already existing object position.
- Dynamic generation of the matrix (the user can pass in his/her own dimensions).

I thought about generating a random matrix every time the game restarts. However, this wouldn't make much sense,
since you're trying to find the optimal path, and by changing the matrix the optimal path would change. So I decided
not to make that change.

# Example
Below is an example of a randomly generated matrix (10x10). This is, however, not the maximum; the user can pass in any
value he/she wants.
![sample_grid_q_learning](https://github.com/mickvanhulst/q_learning/blob/master/example_grid.jpg)


Summary
==========================
A simple reinforcement learning example, based on the Q-function (the exact update rule is sketched after this list).
- Rules: The agent (orange box) has to reach one of the goals to end the game (green or red cell).
- Rewards: Each step gives a negative reward of -0.04. The red cell gives a negative reward of -5. The green one gives a positive reward of +5. The black walls give a negative reward of -1.
- States: Each cell is a state the agent can be in.
- Actions: There are only 4 actions: up, down, right and left.
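
After every step, `Learner.py` applies the standard Q-learning update through `inc_Q`/`max_Q`. The snippet below is only a minimal sketch of that rule, written against the same dictionary-of-dictionaries Q-matrix the learner uses; the function name `q_update` and its default arguments (which mirror the initial `alpha = 1` and `discount = 0.3` in `Learner.py`) are introduced here purely for illustration.

```python
# One Q-learning step (sketch):
#   Q[s][a] <- (1 - alpha) * Q[s][a] + alpha * (r + discount * max_a' Q[s'][a'])
def q_update(Q, s, a, r, s2, alpha=1.0, discount=0.3):
    best_next = max(Q[s2].values())  # value of the greedy action in the next state
    Q[s][a] = (1 - alpha) * Q[s][a] + alpha * (r + discount * best_next)
```

With the rewards listed above, moving onto the green cell feeds r = +5 into this update, a wall r = -1, the red cell r = -5, and any other step r = -0.04.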

Credits
===========
Credit for the vast majority of the code here goes to [PhilippeMorere](https://github.com/PhilippeMorere).

Credits for being awesome go to @[Sirajology](https://www.youtube.com/sirajology) for enabling us to learn so much about ML!

--------------------------------------------------------------------------------
/World.py:
--------------------------------------------------------------------------------
import numpy as np
from tkinter import *
master = Tk()

class World(object):
    def __init__(self, width, axis_x, axis_y):
        # Static settings
        self.WIDTH = width
        (self.AXIS_X, self.AXIS_Y) = (axis_x, axis_y)
        self.WALK_REWARD = -0.04
        self.ACTIONS = [["up", 0, -1], ["down", 0, 1], ["left", -1, 0], ["right", 1, 0]]
        self.N_OBJECTS = round((axis_x * axis_y) / 4)

        # Init objects, then add the special green square
        self.OBJECTS = self.init_objects(axis_x, axis_y)
        x_green, y_green = self.gen_pos()
        self.OBJECTS.append((x_green, y_green, "green", 5))

        # Init remaining variables
        self.board = Canvas(master, width=self.AXIS_X*self.WIDTH, height=self.AXIS_Y*self.WIDTH)
        self.score = 1
        self.restart = False
        self.Player = self.gen_pos()
        self.me = self.board.create_rectangle(self.Player[0]*self.WIDTH+self.WIDTH*2/10, self.Player[1]*self.WIDTH+self.WIDTH*2/10,
                                              self.Player[0]*self.WIDTH+self.WIDTH*8/10, self.Player[1]*self.WIDTH+self.WIDTH*8/10,
                                              fill="orange", width=1, tag="me")

    def init_objects(self, axis_x, axis_y):
        objects = set([])
        n_of_red_squares = round(self.N_OBJECTS / 5)

        for i in range(self.N_OBJECTS):
            # First add the red squares, then fill the remainder with walls
            if i < n_of_red_squares:
                # Add red squares
                objects.add((np.random.randint(0, axis_x), np.random.randint(0, axis_y), "red", -5))
            else:
                # Add walls
                objects.add((np.random.randint(0, axis_x), np.random.randint(0, axis_y), "black", -1))

        return list(objects)

    def gen_pos(self):
        # Draw random positions until one is found that is not occupied by an object
        while True:
            x = np.random.randint(0, self.AXIS_X)
            y = np.random.randint(0, self.AXIS_Y)
            check_for_rows = [e for e in self.OBJECTS if x == e[0] and y == e[1]]

            if not check_for_rows:
                break
        return (x, y)

    def render_grid(self):
        for i in range(self.AXIS_X):
            for j in range(self.AXIS_Y):
                # Check whether an object occupies the current coordinates
                objects = [e for e in self.OBJECTS if i == e[0] and j == e[1]]

                if objects:
                    self.board.create_rectangle(i*self.WIDTH, j*self.WIDTH, (i+1)*self.WIDTH,
                                                (j+1)*self.WIDTH, fill=objects[0][2], width=1)
                else:
                    self.board.create_rectangle(i*self.WIDTH, j*self.WIDTH, (i+1)*self.WIDTH,
                                                (j+1)*self.WIDTH, fill="white", width=1)

    def start_game(self):
        master.mainloop()

    def try_move(self, dx, dy):
        if self.restart:
            self.restart_game()
        new_x = self.Player[0] + dx
        new_y = self.Player[1] + dy
        self.score += self.WALK_REWARD

        if (new_x >= 0) and (new_x < self.AXIS_X) and (new_y >= 0) and (new_y < self.AXIS_Y):
            self.board.coords(self.me, new_x*self.WIDTH+self.WIDTH*2/10, new_y*self.WIDTH+self.WIDTH*2/10,
                              new_x*self.WIDTH+self.WIDTH*8/10, new_y*self.WIDTH+self.WIDTH*8/10)

            self.Player = (new_x, new_y)

            # Landing on an object (wall, red or green square) pays out its reward and ends the episode
            for (i, j, c, w) in self.OBJECTS:
                if new_x == i and new_y == j:
                    self.score -= self.WALK_REWARD
                    self.score += w
                    if self.score > 0:
                        print("Success! score: ", self.score)
                    else:
                        print("Fail! score: ", self.score)
                    self.restart = True
                    return

    def restart_game(self):
        # Respawn the player at a new free position and reset the score
        self.Player = self.gen_pos()
        self.score = 1
        self.restart = False
        self.board.coords(self.me, self.Player[0]*self.WIDTH+self.WIDTH*2/10,
                          self.Player[1]*self.WIDTH+self.WIDTH*2/10, self.Player[0]*self.WIDTH+self.WIDTH*8/10,
                          self.Player[1]*self.WIDTH+self.WIDTH*8/10)

    def has_restarted(self):
        return self.restart
--------------------------------------------------------------------------------
/example_grid.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mickvanhulst/q_learning/b1986306b8e7c930fd6ee7d583a35d40a0997b33/example_grid.jpg
--------------------------------------------------------------------------------
/q_learning_sample/main.py:
--------------------------------------------------------------------------------
import numpy as np

# R matrix
# -1 entries are 'special objects'; -100 entries were added for instant-error objects
R = np.matrix([[-1, -1, -1, -1, 0, -1],
               [-1, -1, -1, 0, -1, 100],
               [-1, -1, -100, 0, -1, -1],
               [-1, 0, 0, -1, 0, -100],
               [-1, 0, 0, -100, -1, 100],
               [-100, 0, -1, -1, 0, 100]])

# Q matrix
Q = np.matrix(np.zeros([6, 6]))

# Gamma (discount parameter).
gamma = 0.8

# Initial state. (Usually chosen at random.)
initial_state = 1

# This function returns all available actions in the state given as an argument
def available_actions(state):
    current_state_row = R[state,]
    av_act = np.where(current_state_row >= 0)[1]
    return av_act

# Get available actions in the current state
available_act = available_actions(initial_state)

# This function chooses at random which action to perform within the range
# of all the available actions.
def sample_next_action(available_actions_range):
    next_action = int(np.random.choice(available_actions_range))
    return next_action

# Sample next action to be performed
action = sample_next_action(available_act)

# This function updates the Q matrix according to the path selected and the
# Q-learning algorithm
def update(current_state, action, gamma):

    max_index = np.where(Q[action,] == np.max(Q[action,]))[1]

    if max_index.shape[0] > 1:
        max_index = int(np.random.choice(max_index, size=1))
    else:
        max_index = int(max_index)
    max_value = Q[action, max_index]

    # Q-learning formula
    Q[current_state, action] = R[current_state, action] + gamma * max_value

# Update Q matrix
update(initial_state, action, gamma)

# -------------------------------------------------------------------------------
# Training

# Train over 10,000 iterations. (Re-iterate the process above.)
for i in range(10000):
    current_state = np.random.randint(0, int(Q.shape[0]))
    available_act = available_actions(current_state)
    action = sample_next_action(available_act)
    update(current_state, action, gamma)

# Print the "trained" Q matrix, normalized to a 0-100 scale
print("Trained Q matrix:")
print(Q / np.max(Q) * 100)

# -------------------------------------------------------------------------------
# Testing

# Goal state = 5
# Best sequence path starting from 2 -> 2, 3, 1, 5

current_state = 2
steps = [current_state]

while current_state != 5:

    next_step_index = np.where(Q[current_state,] == np.max(Q[current_state,]))[1]

    if next_step_index.shape[0] > 1:
        next_step_index = int(np.random.choice(next_step_index, size=1))
    else:
        next_step_index = int(next_step_index)

    steps.append(next_step_index)
    current_state = next_step_index

# Print selected sequence of steps
print("Selected path:")
print(steps)

# -------------------------------------------------------------------------------
# OUTPUT
# -------------------------------------------------------------------------------
#
# Trained Q matrix:
# [[  0.    0.    0.    0.   80.    0. ]
#  [  0.    0.    0.   64.    0.  100. ]
#  [  0.    0.    0.   64.    0.    0. ]
#  [  0.   80.   51.2   0.   80.    0. ]
#  [  0.   80.   51.2   0.    0.  100. ]
#  [  0.   80.    0.    0.   80.  100. ]]
#
# Selected path:
# [2, 3, 1, 5]
# -------------------------------------------------------------------------------
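
# -------------------------------------------------------------------------------
# Optional helper (not part of the original script): the greedy-path extraction
# from the Testing section can be wrapped in a reusable function. This is a
# minimal sketch that assumes the trained Q matrix above is in scope; the name
# greedy_path is introduced here purely for illustration.
def greedy_path(start_state, goal_state, q_matrix, max_steps=50):
    """Follow the highest-valued action from each state until goal_state is reached."""
    path = [start_state]
    state = start_state
    while state != goal_state and len(path) <= max_steps:
        candidates = np.where(q_matrix[state,] == np.max(q_matrix[state,]))[1]
        # Break ties at random, mirroring the Testing loop above
        state = int(np.random.choice(candidates))
        path.append(state)
    return path

# Example usage (should reproduce the selected path printed above):
# print(greedy_path(2, 5, Q))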