├── Game-TestHuman.py
├── LICENSE
├── README.md
├── Reinforcement Learning (RL) Using Python.ipynb
├── Train_GridSearch.py
└── images
    ├── EnvExp.jpg
    ├── gifs
    │   ├── EnvPlayed.gif
    │   └── envExp.gif
    └── wall.jpg

/Game-TestHuman.py:
--------------------------------------------------------------------------------
1 | from random import randint
2 | from collections import deque
3 | from time import sleep
4 | import pygame
5 | 
6 | pygame.init()
7 | 
8 | class Field:
9 |     def __init__(self, height=10, width=5):
10 |         self.width = width
11 |         self.height = height
12 |         self.body = [[0] * width for _ in range(height)]
13 | 
14 |     def update_field(self, walls, player):
15 |         self.body = [[0] * self.width for _ in range(self.height)]  # clear the field
16 | 
17 |         for wall in walls:
18 |             if not wall.out_of_range:
19 |                 for i in range(wall.y, min(wall.y + wall.height, self.height)):
20 |                     self.body[i][:] = wall.body[i - wall.y][:]
21 | 
22 |         for i in range(player.y, min(player.y + player.height, self.height)):
23 |             for j in range(player.x, min(player.x + player.width, self.width)):
24 |                 self.body[i][j] = player.body[i - player.y][j - player.x]
25 | 
26 | 
27 | class Wall:
28 |     def __init__(self, height=2, width=5, hole_width=2, y=0, speed=1):  # defaults sized to the default 10x5 Field
29 |         self.height = height
30 |         self.width = width
31 |         self.hole_width = hole_width
32 |         self.y = y
33 |         self.speed = speed
34 |         self.body_unit = 1
35 |         self.body = [[self.body_unit] * width for _ in range(height)]
36 |         self.out_of_range = False
37 |         self.create_hole()
38 | 
39 |     def create_hole(self):
40 |         hole_pos = randint(0, self.width - self.hole_width)
41 |         for row in self.body:  # punch the hole through every row of the wall
42 |             row[hole_pos:hole_pos + self.hole_width] = [0] * self.hole_width
43 | 
44 |     def move(self):
45 |         self.y += self.speed
46 |         self.out_of_range = self.y + self.height > field.height  # `field` is the module-level Field created below
47 | 
48 | 
49 | class Player:
50 |     def __init__(self, height=2, max_width=10, width=2, x=0, y=0, speed=1):
51 |         self.height = height
52 |         self.max_width = max_width
53 |         self.width = width
54 |         self.x = x
55 |         self.y = y
56 |         self.speed = speed
57 |         self.body_unit = 2
58 |         self.body = [[self.body_unit] * width for _ in range(height)]
59 | 
60 |     def move(self, direction=0):
61 |         if direction == 1 and self.x - self.speed >= 0:
62 |             self.x -= self.speed
63 |         elif direction == 2 and self.x + self.width + self.speed <= field.width:
64 |             self.x += self.speed
65 | 
66 | 
67 | class Environment:
68 |     def __init__(self):
69 |         self.BLACK = (25, 25, 25)
70 |         self.WHITE = (255, 255, 255)
71 |         self.RED = (255, 80, 80)
72 |         self.field = self.walls = self.player = None
73 |         self.current_state = self.reset()
74 | 
75 |     def reset(self):
76 |         self.field = Field()
77 |         self.walls = deque([Wall()])
78 |         self.player = Player(x=self.field.width // 2 - 1, y=self.field.height - 3)
79 |         return self.field.body
80 | 
81 |     def step(self, action):
82 |         reward = 0
83 |         if action == 1 or action == 2:
84 |             self.player.move(action)
85 | 
86 |         for wall in self.walls:
87 |             wall.move()
88 | 
89 |         # A wall that left the field has been passed: reward the player and spawn a new wall.
90 |         if self.walls[-1].out_of_range:
91 |             reward += 1
92 |             self.walls[-1] = Wall()
93 | 
94 |         self.field.update_field(self.walls, self.player)
95 |         return self.field.body, reward
96 | 
97 |     def render(self, window):
98 |         window.fill(self.WHITE)
99 | 
100 |         for r in range(field.height):
101 |             for c in range(field.width):
102 |                 color = self.WHITE if self.field.body[r][c] == 0 else self.BLACK
103 |                 pygame.draw.rect(window, color, (c * 40, r * 30, 40, 30))
104 | 
105 |         pygame.display.update()
106 | 
107 | 
108 | env = Environment()
109 | field = env.field
110 | 
111 | WINDOW_WIDTH = field.width * 40
112 | WINDOW_HEIGHT = field.height * 30
113 | WINDOW = pygame.display.set_mode((WINDOW_WIDTH, WINDOW_HEIGHT))
114 | 
115 | clock = pygame.time.Clock()
116 | game_over = False
117 | 
118 | while not game_over:
119 |     clock.tick(6)  # a low frame rate keeps the falling walls at a human-playable speed
120 |     action = 0
121 |     for event in pygame.event.get():
122 |         if event.type == pygame.QUIT:
123 |             game_over = True
124 |         elif event.type == pygame.KEYDOWN:
125 |             if event.key == pygame.K_LEFT:
126 |                 action = 1
127 |             elif event.key == pygame.K_RIGHT:
128 |                 action = 2
129 |     env.step(action)  # step every frame so the walls keep falling even without key presses
130 |     env.render(WINDOW)
131 | 
132 | pygame.quit()
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2020 Mohammed A. AL-Maamari
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Deep Reinforcement Learning (RL) Using Python
2 | 
3 | ![](https://cdn-images-1.medium.com/max/2560/1*a9F8vOTfpDEM52eW5SSXAQ.jpeg)
4 | 
5 | Explanation of the game rules | The game played by a human
6 | :-------------------------:|:-------------------------:
7 | ![](images/gifs/envExp.gif) | ![](images/gifs/EnvPlayed.gif)
8 | 
9 | In this tutorial series, we are going through every step of building an expert Reinforcement Learning (RL) agent that is capable of playing games.
10 | 
11 | This series is divided into three parts:
12 | 
13 | * **Part 1**: Designing and Building the Game Environment. In this part, we will build a game environment and customize it so that the RL agent can train on it.
14 | 
15 | * **Part 2**: Build and Train the Deep Q Neural Network (DQN). In this part, we define and build the different layers of the DQN and train it.
16 | 
17 | * **Part 3**: Test and Play the Game.
18 | 
19 | We might also try making another simple game environment and use Q-Learning to create an agent that can play this simple game.
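
For reference, the tabular Q-Learning update behind that simpler agent fits in a few lines. The sketch below is only illustrative: the state/action counts and hyperparameters are placeholders, not values used anywhere in this repository.

```python
import numpy as np

n_states, n_actions = 16, 4           # illustrative sizes for a tiny grid-style game
Q = np.zeros((n_states, n_actions))   # tabular action-value estimates
alpha, gamma = 0.1, 0.99              # learning rate and discount factor

def q_update(state, action, reward, next_state, done):
    """One Q-Learning step: Q(s,a) <- Q(s,a) + alpha * (target - Q(s,a))."""
    target = reward if done else reward + gamma * np.max(Q[next_state])
    Q[state, action] += alpha * (target - Q[state, action])
```

The DQN built in Part 2 estimates the same action values, but with a neural network instead of the table `Q`.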
20 | 
21 | ## The Motivation:
22 | 
23 | One time I was in a YouTube rabbit hole and [THIS VIDEO](https://www.youtube.com/watch?v=k-rWB1jOt9s) was recommended to me. It was about the **sense of self** in human babies. After watching it, a similar question popped into my mind: *“Can I develop a smart agent that is smart enough to have a sense of its body and has the ability to change its features to accomplish a certain task?”*
24 | 
25 | This series is my way of answering this question.
26 | 
27 | 
--------------------------------------------------------------------------------
/Reinforcement Learning (RL) Using Python.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "# Reinforcement Learning With Python | Part 1 | Creating The Environment"
8 |    ]
9 |   },
10 |   {
11 |    "cell_type": "markdown",
12 |    "metadata": {},
13 |    "source": [
14 |     "Explanation of the game rules | The game played by a human\n",
15 |     ":-------------------------:|:-------------------------:\n",
16 |     "![](images/gifs/envExp.gif) | ![](images/gifs/EnvPlayed.gif)"
17 |    ]
18 |   },
19 |   {
20 |    "cell_type": "markdown",
21 |    "metadata": {},
22 |    "source": [
23 |     "In this tutorial series, we are going through every step of building an expert Reinforcement Learning (RL) agent that is capable of playing games.\n",
24 |     "\n",
25 |     "**This series is divided into three parts:**\n",
26 |     "- **Part 1:** Designing and Building the Game Environment. In this part, we will build a game environment and customize it so that the RL agent can train on it.\n",
27 |     "- **Part 2:** Build and Train the Deep Q Neural Network (DQN). In this part, we define and build the different layers of the DQN and train it.\n",
28 |     "- **Part 3:** Test and Play the Game.\n",
29 |     "\n",
30 |     "We might also try making another simple game environment and use Q-Learning to create an agent that can play this simple game."
31 |    ]
32 |   },
33 |   {
34 |    "cell_type": "markdown",
35 |    "metadata": {},
36 |    "source": [
37 |     "## Designing the Environment:\n",
38 |     "\n",
39 |     "For this environment, we want the agent to develop a sense of its body and how to change its body features to avoid losing the game."
40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "### First: The Elements of The Environment:\n", 47 | "The Elements of The Environment | .\n", 48 | ":-------------------------:|:-------------------------:\n", 49 | "" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "#### 1- The Field:\n", 57 | "Contains all the other elements,we represent it in code by class named \"Field\" as follows:" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "class Field:\n", 67 | " def __init__(self, height=10, width=5):\n", 68 | " self.width = width\n", 69 | " self.height = height\n", 70 | " self.body = np.zeros(shape=(self.height, self.width))\n", 71 | " \n", 72 | " def update_field(self,walls, player):\n", 73 | " try:\n", 74 | " # Clear the field:\n", 75 | " self.body = np.zeros(shape=(self.height, self.width))\n", 76 | " # Put the walls on the field:\n", 77 | " for wall in walls:\n", 78 | " if not wall.out_of_range :\n", 79 | " self.body[wall.y:min(wall.y+wall.height,self.height),:] = wall.body\n", 80 | "\n", 81 | " # Put the player on the field:\n", 82 | " self.body[player.y:player.y+player.height,\n", 83 | " player.x:player.x+player.width] += player.body \n", 84 | " except :\n", 85 | " pass" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "**Field attributes:**\n", 93 | "\n", 94 | "* ***width (int)*** : the width of the field (not in pixels)\n", 95 | "\n", 96 | "* ***height (int)*** the height of the field (not in pixels)\n", 97 | "\n", 98 | "* ***body (np.array)*** : holds the array representation of the game elements (player and walls) \n", 99 | "\n", 100 | "This array is passed to the DQN, and also used to draw the interface using pygame.\n", 101 | "

\n", 102 | "**Field methods:**\n", 103 | "\n", 104 | "* ***update_field***(self,walls, player) : updates the field." 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "#### 2- The Walls:" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "class Wall: \n", 121 | " def __init__(self, height = 5, width=100, hole_width = 20,\n", 122 | " y = 0, speed = 1, field = None):\n", 123 | " self.height = height\n", 124 | " self.width = width\n", 125 | " self.hole_width = hole_width\n", 126 | " self.y = y\n", 127 | " self.speed = speed\n", 128 | " self.field = field\n", 129 | " self.body_unit = 1\n", 130 | " self.body = np.ones(shape = (self.height, self.width))*self.body_unit\n", 131 | " self.out_of_range = False\n", 132 | " self.create_hole()\n", 133 | " def create_hole(self):\n", 134 | " hole = np.zeros(shape = (self.height, self.hole_width))\n", 135 | " hole_pos = randint(0,self.width-self.hole_width)\n", 136 | " self.body[ : , hole_pos:hole_pos+self.hole_width] = 0\n", 137 | " def move(self):\n", 138 | " self.y += self.speed\n", 139 | " self.out_of_range = True if ((self.y + self.height) > self.field.height) else False" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "The Wall | .\n", 147 | ":-------------------------:|:-------------------------:\n", 148 | "" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "**Wall attributes:**\n", 156 | "\n", 157 | "|Attribute |Type |Description |\n", 158 | "|------------|-----------|------------------------------------------------------------------------------------|\n", 159 | "|height |int |the wall's height |\n", 160 | "|width |int |the wall's width ( the same value as the field's width) |\n", 161 | "|hole_width |int |the hole's width (max value of hole_width should be field.width or wall.width) |\n", 162 | "|y |int |the vertical coordinate of the wall (y axis) (max value of y should be field.height)|\n", 163 | "|speed |int |speed of the wall (raw/step) |\n", 164 | "|field |Field |the field that contains the wall |\n", 165 | "|body_unit |int ; float|the number used to represent the wall in the array representation (in field.body) |\n", 166 | "|body |np.array |the wall's body |\n", 167 | "|out_of_range|bool |A flag used to delete the wall when it moves out of the field range. 
|" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "**Wall methods:**\n", 175 | "\n", 176 | "* ***create_hole***(self): Creates a hole in the wall that its width = self.hole_width.\n", 177 | "* ***move***(self): Moves the wall vertically (every time it get called the wall moves n steps from downward (n = self.speed))" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "#### 3- The Player :" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "class Player:\n", 194 | " def __init__(self, height = 5, max_width = 10 , width=2,\n", 195 | " x = 0, y = 0, speed = 2):\n", 196 | " self.height = height\n", 197 | " self.max_width = max_width\n", 198 | " self.width = width\n", 199 | " self.x = x\n", 200 | " self.y = y\n", 201 | " self.speed = speed\n", 202 | " self.body_unit = 2\n", 203 | " self.body = np.ones(shape = (self.height, self.width))*self.body_unit\n", 204 | " self.stamina = 20\n", 205 | " self.max_stamina = 20\n", 206 | " def move(self, field, direction = 0 ):\n", 207 | " '''\n", 208 | " Moves the player :\n", 209 | " - No change = 0\n", 210 | " - left, if direction = 1\n", 211 | " - right, if direction = 2\n", 212 | " '''\n", 213 | " val2dir = {0:0 , 1:-1 , 2:1}\n", 214 | " direction = val2dir[direction]\n", 215 | " next_x = (self.x + self.speed*direction)\n", 216 | " if not (next_x + self.width > field.width or next_x < 0):\n", 217 | " self.x += self.speed*direction\n", 218 | " self.stamina -= 1 \n", 219 | " def change_width(self, action = 0):\n", 220 | " '''\n", 221 | " Change the player's width:\n", 222 | " - No change = 0\n", 223 | " - narrow by one unit = 3\n", 224 | " - widen by one unit = 4\n", 225 | " '''\n", 226 | " val2act = {0:0 , 3:-1 , 4:1}\n", 227 | " action = val2act[action]\n", 228 | " new_width = self.width+action\n", 229 | " player_end = self.x + new_width\n", 230 | " if new_width <= self.max_width and new_width > 0 and player_end <= self.max_width:\n", 231 | " self.width = new_width\n", 232 | " self.body = np.ones(shape = (self.height, self.width))*self.body_unit" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "**Player attributes:**\n", 240 | "\n", 241 | "|Attribute |Type |Description |\n", 242 | "|------------|-----------|------------------------------------------------------------------------------------|\n", 243 | "|height |int |player's height |\n", 244 | "|max_width |int |player's maximum width (must be less than field.width) |\n", 245 | "|width |int |player's width (must be less than or equal to max_width and begger than 0) |\n", 246 | "|x |int |player's x coordinate in the field |\n", 247 | "|y |int |player's y coordinate in the field |\n", 248 | "|speed |int |player's speed (how many horizontal units it moves per step) |\n", 249 | "|body_unit |int ; float|the number used to represent the player in the array representation (in field.body) |\n", 250 | "|body |np.array |the player's body |\n", 251 | "|stamina |int ; float|player's energy (stamina) (when a player's energy hits zero the player dies) |\n", 252 | "|max_stamina |int ; float|maximum value for player's stamina |" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "**Player methods:**\n", 260 | "* ***move***(self, field, direction = 0 ): Moves the player :\n", 261 | " - direction = 0 -> No change \n", 
262 | " - direction = 1 -> left \n", 263 | " - direction = 2 -> right\n", 264 | "* ***change_width***(self, action = 0):\n", 265 | " - action = 0 -> No change\n", 266 | " - action = 3 -> narrow by one unit\n", 267 | " - action = 4 -> widen by one unit" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "---" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": {}, 280 | "source": [ 281 | "## The \"Environment\" Class :\n", 282 | "This class facilitates the communication between the environment and the agent, it is designed to work either with an RL agent or with a human player." 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": {}, 288 | "source": [ 289 | "### Main Components Needed by the RL Agent:\n", 290 | "- ***ENVIRONMENT_SHAPE*** attribute : used by the DQN to set the shape of the input layer.\n", 291 | "- ***ACTION_SPACE*** attribute : used by the DQN to set the shape of the output layer.\n", 292 | "- ***PUNISHMENT*** and ***REWARD*** : set the values of both punishment and reward, used to train the agent (we use these values to tell the agent if its previous actions were good or bad).\n", 293 | "- ***reset*** method : to reset the environment.\n", 294 | "- ***step*** method: takes an action as an argument and returns next state, reward, a boolean variable named game_over that is used to tell us if the game is over (the player lost) or not.\n", 295 | "\n", 296 | "It is clear that this environment is not different, it subsumes all the required components and more." 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "class Environment:\n", 306 | " P_HEIGHT = 2 # Height of the player\n", 307 | " F_HEIGHT = 20 # Height of the field\n", 308 | " W_HEIGHT = 2 # Height of the walls\n", 309 | " WIDTH = 10 # Width of the field and the walls\n", 310 | " MIN_H_WIDTH = 2 # Minimum width of the holes\n", 311 | " MAX_H_WIDTH = 6 # Maximum width of the holes\n", 312 | " MIN_P_WIDTH = 2 # Minimum Width of the player\n", 313 | " MAX_P_WIDTH = 6 # Maximum Width of the player\n", 314 | " HEIGHT_MUL = 30 # Height Multiplier (used to draw np.array as blocks in pygame )\n", 315 | " WIDTH_MUL = 40 # Width Multiplier (used to draw np.array as blocks in pygame )\n", 316 | " WINDOW_HEIGHT = (F_HEIGHT+1) * HEIGHT_MUL # Height of the pygame window\n", 317 | " WINDOW_WIDTH = (WIDTH) * WIDTH_MUL # Widh of the pygame window\n", 318 | " \n", 319 | " ENVIRONMENT_SHAPE = (F_HEIGHT,WIDTH,1)\n", 320 | " ACTION_SPACE = [0,1,2,3,4]\n", 321 | " ACTION_SPACE_SIZE = len(ACTION_SPACE)\n", 322 | " PUNISHMENT = -100 # Punishment increment\n", 323 | " REWARD = 10 # Reward increment\n", 324 | " score = 0 # Initial Score\n", 325 | " \n", 326 | " MOVE_WALL_EVERY = 4 # Every how many frames the wall moves.\n", 327 | " MOVE_PLAYER_EVERY = 1 # Every how many frames the player moves.\n", 328 | " frames_counter = 0\n", 329 | "\n", 330 | " def __init__(self):\n", 331 | " # Colors:\n", 332 | " self.BLACK = (25,25,25)\n", 333 | " self.WHITE = (255,255,255)\n", 334 | " self.RED = (255, 80, 80)\n", 335 | " self.BLUE = (80, 80, 255)\n", 336 | " self.field = self.walls = self.player = None\n", 337 | " self.current_state = self.reset()\n", 338 | " self.val2color = {0:self.WHITE, self.walls[0].body_unit:self.BLACK,\n", 339 | " self.player.body_unit:self.BLACK, self.MAX_VAL:self.RED}\n", 340 | " def reset(self):\n", 341 | " self.score = 0\n", 342 | " 
self.frames_counter = 0\n", 343 | " self.game_over = False\n", 344 | " \n", 345 | " self.field = Field(height=self.F_HEIGHT, width=self.WIDTH )\n", 346 | " w1 = Wall( height = self.W_HEIGHT, width=self.WIDTH,\n", 347 | " hole_width = randint(self.MIN_H_WIDTH,self.MAX_H_WIDTH),\n", 348 | " field = self.field)\n", 349 | " self.walls = deque([w1])\n", 350 | " p_width = randint(self.MIN_P_WIDTH,self.MAX_P_WIDTH)\n", 351 | " self.player = Player( height = self.P_HEIGHT, max_width = self.WIDTH,\n", 352 | " width = p_width,\n", 353 | " x = randint(0,self.field.width-p_width),\n", 354 | " y = int(self.field.height*0.7), speed = 1)\n", 355 | " self.MAX_VAL = self.player.body_unit + w1.body_unit\n", 356 | " # Update the field :\n", 357 | " self.field.update_field(self.walls, self.player)\n", 358 | " \n", 359 | " observation = self.field.body/self.MAX_VAL\n", 360 | " return observation\n", 361 | " def print_text(self, WINDOW = None, text_cords = (0,0), center = False,\n", 362 | " text = \"\", color = (0,0,0), size = 32):\n", 363 | " pygame.init()\n", 364 | " font = pygame.font.Font('freesansbold.ttf', size) \n", 365 | " text_to_print = font.render(text, True, color) \n", 366 | " textRect = text_to_print.get_rect()\n", 367 | " if center:\n", 368 | " textRect.center = text_cords\n", 369 | " else:\n", 370 | " textRect.x = text_cords[0]\n", 371 | " textRect.y = text_cords[1]\n", 372 | " WINDOW.blit(text_to_print, textRect)\n", 373 | " \n", 374 | " def step(self, action):\n", 375 | " global score_increased\n", 376 | "\n", 377 | " self.frames_counter += 1\n", 378 | " reward = 0\n", 379 | "\n", 380 | " # If the performed action is (move) then player.move method is called:\n", 381 | " if action in [1,2]:\n", 382 | " self.player.move(direction = action, field = self.field)\n", 383 | " # If the performed action is (change_width) then player.change_width method is called:\n", 384 | " if action in [3,4]:\n", 385 | " self.player.change_width(action = action) \n", 386 | " \n", 387 | " # Move the wall one step (one step every WALL_SPEED frames):\n", 388 | " if self.frames_counter % self.WALL_SPEED == 0:\n", 389 | " # move the wall one step\n", 390 | " self.walls[-1].move()\n", 391 | " # reset the frames counter\n", 392 | " self.frames_counter = 0\n", 393 | " \n", 394 | " # Update the field :\n", 395 | " self.field.update_field(self.walls, self.player)\n", 396 | "\n", 397 | " # If the player passed a wall successfully increase the reward +1\n", 398 | " if ((self.walls[-1].y) == (self.player.y + self.player.height)) and not score_increased :\n", 399 | " reward += self.REWARD\n", 400 | " self.score += self.REWARD\n", 401 | " \n", 402 | " # Increase player's stamina every time it passed a wall successfully \n", 403 | " self.player.stamina = min(self.player.max_stamina, self.player.stamina+10)\n", 404 | " # score_increased : a flag to make sure that reward increases once per wall \n", 405 | " score_increased = True\n", 406 | " \n", 407 | " \n", 408 | " # Lose Conditions : \n", 409 | " # C1 : The player hits a wall\n", 410 | " # C2 : Player's width was far thinner than hole's width\n", 411 | " # C3 : Player fully consumed its stamina (energy)\n", 412 | " lose_conds = [self.MAX_VAL in self.field.body,\n", 413 | " ((self.player.y == self.walls[-1].y) and (self.player.width < (self.walls[-1].hole_width-1))),\n", 414 | " self.player.stamina <=0]\n", 415 | " \n", 416 | "\n", 417 | " # If one lose condition or more happend, the game ends:\n", 418 | " if True in lose_conds:\n", 419 | " self.game_over = True\n", 420 | " reward = 
self.PUNISHMENT\n", 421 | " return self.field.body/self.MAX_VAL, reward, self.game_over\n", 422 | "\n", 423 | " # Check if a wall moved out of the scene:\n", 424 | " if self.walls[-1].out_of_range:\n", 425 | " # Create a new wall\n", 426 | " self.walls[-1] = Wall( height = self.W_HEIGHT, width = self.WIDTH,\n", 427 | " hole_width = randint(self.MIN_H_WIDTH,self.MAX_H_WIDTH),\n", 428 | " field = self.field)\n", 429 | "\n", 430 | " score_increased = False\n", 431 | "\n", 432 | " \n", 433 | " # Return New Observation , reward, game_over(bool)\n", 434 | " return self.field.body/self.MAX_VAL, reward, self.game_over\n", 435 | " \n", 436 | " def render(self, WINDOW = None, human=False):\n", 437 | " if human:\n", 438 | " ################ Check Actions #####################\n", 439 | " action = 0\n", 440 | " events = pygame.event.get()\n", 441 | " for event in events:\n", 442 | " if event.type == pygame.QUIT:\n", 443 | " self.game_over = True\n", 444 | " if event.type == pygame.KEYDOWN:\n", 445 | " if event.key == pygame.K_LEFT:\n", 446 | " action = 1\n", 447 | " if event.key == pygame.K_RIGHT:\n", 448 | " action = 2\n", 449 | "\n", 450 | " if event.key == pygame.K_UP:\n", 451 | " action = 4\n", 452 | " if event.key == pygame.K_DOWN:\n", 453 | " action = 3\n", 454 | " ################## Step ############################ \n", 455 | " _,reward, self.game_over = self.step(action)\n", 456 | " ################ Draw Environment ###################\n", 457 | " WINDOW.fill(self.WHITE)\n", 458 | " self.field.update_field(self.walls, self.player)\n", 459 | " for r in range(self.field.body.shape[0]):\n", 460 | " for c in range(self.field.body.shape[1]):\n", 461 | " pygame.draw.rect(WINDOW,\n", 462 | " self.val2color[self.field.body[r][c]],\n", 463 | " (c*self.WIDTH_MUL, r*self.HEIGHT_MUL, self.WIDTH_MUL, self.HEIGHT_MUL))\n", 464 | "\n", 465 | " self.print_text(WINDOW = WINDOW, text_cords = (self.WINDOW_WIDTH // 2, int(self.WINDOW_HEIGHT*0.1)),\n", 466 | " text = str(self.score), color = self.RED, center = True)\n", 467 | " self.print_text(WINDOW = WINDOW, text_cords = (0, int(self.WINDOW_HEIGHT*0.9)),\n", 468 | " text = str(self.player.stamina), color = self.RED)\n", 469 | " \n", 470 | " pygame.display.update()" 471 | ] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "metadata": {}, 476 | "source": [ 477 | "**Environment attributes:**\n", 478 | "\n", 479 | "|Attribute |Type |Description |\n", 480 | "|------------|-----------|------------------------------------------------------------------------------------|\n", 481 | "|P_HEIGHT |int |Height of the player |\n", 482 | "|F_HEIGHT |int |Height of the field |\n", 483 | "|W_HEIGHT |int |Height of the walls |\n", 484 | "|WIDTH |int |Width of the field and the walls |\n", 485 | "|MIN_H_WIDTH |int |Minimum width of the holes |\n", 486 | "|MAX_H_WIDTH |int |Maximum width of the holes |\n", 487 | "|MIN_P_WIDTH |int |Minimum Width of the player |\n", 488 | "|MAX_P_WIDTH |int |Maximum Width of the player |\n", 489 | "|HEIGHT_MUL |int |Height Multiplier (used to draw np.array as blocks in pygame ) |\n", 490 | "|WIDTH_MUL |int |Width Multiplier (used to draw np.array as blocks in pygame ) |\n", 491 | "|WINDOW_HEIGHT|int |Height of the pygame window |\n", 492 | "|WINDOW_WIDTH|int |Width of the pygame window |\n", 493 | "|ENVIRONMENT_SHAPE|tuple |(field height ; field width ; 1) |\n", 494 | "|ACTION_SPACE|list |list of actions an agent can perform |\n", 495 | "|ACTION_SPACE_SIZE|int |number of actions an agent can perform |\n", 496 | "|PUNISHMENT |int ; 
float|Punishment increment |\n", 497 | "|REWARD |int ; float|Reward increment |\n", 498 | "|score |int ; float|Initial Score |\n", 499 | "|MOVE_WALL_EVERY|int |Every how many frames the wall moves. |\n", 500 | "|MOVE_PLAYER_EVERY|int |Every how many frames the player moves. |\n", 501 | "|frames_counter|int |used to handle the wall speed |\n", 502 | "|field |Field | the field object that holds walls and players |\n", 503 | "|walls |double ended queue of Wall objects|a que of walls |\n", 504 | "|player |Player |the player object |\n", 505 | "|current_state|np.array |holds the current state of the field (the array representation of the game field) |\n", 506 | "|val2color |dictionary |used to color the blocks depending on their values (ex: if you want to color the player RED you will put 'self.player.body_unit:RED' in val2color dictionary)|\n", 507 | "|MAX_VAL |int ; float| used to detect collisions between walls and players (MAX_VAL = self.player.body_unit + self.wall.body_unit) |\n" 508 | ] 509 | }, 510 | { 511 | "cell_type": "markdown", 512 | "metadata": {}, 513 | "source": [ 514 | "***Environment methods:***\n", 515 | "* \\__***init***__(self) : initializes the environment by initializing some attributes and calling the reset method.\n", 516 | "* ***reset***(self) : resets the environment and returns the state of the game field after resetting it.\n", 517 | "* ***print_text***(self, WINDOW = None, text_cords = (0,0), center = False, text = \"\", color = (0,0,0), size = 32): prints a text in a given pygame.display (WINDOW) with the given features.\n", 518 | "---\n", 519 | "**+ step(self, action):**\n", 520 | "\n", 521 | "1. Call the player's move method to move the player.\n", 522 | "2. Call the player's change_width method to move the player.\n", 523 | "3. Move the wall one step.\n", 524 | "4. Update the field.\n", 525 | "5. Check if the player passed a wall successfully. If so, gives the player a reward and increase its stamina.\n", 526 | "6. Check the three losing conditions: the player loses the game if at least one of these three conditions met.\n", 527 | "\n", 528 | "**Losing Conditions**:\n", 529 | "\n", 530 | "|Condition |Explanation|Code |\n", 531 | "|------------|-----------|------------------------------------------------------------------------------------|\n", 532 | "|C1 |The player hits a wall|self.MAX_VAL in self.field.body |\n", 533 | "|C2 |Player's width was far thinner than hole's width|((self.player.y == self.walls[-1].y) and (self.player.width < (self.walls[-1].hole_width-1)))|\n", 534 | "|C3 |Player fully consumed its stamina (energy)|self.player.stamina <=0 |\n", 535 | "\n", 536 | "\n", 537 | "when a player loses, the value of returned reward will equal PUNISHMENT, and the indicator of the game state (game_over) changes from false to true.\n", 538 | "\n", 539 | "7. Check if the current wall hits the bottom of the field, when that happens, the out of range wall is replaced by a new wall.\n", 540 | "8. Return next_state normalized, reward, game_over\n", 541 | "---\n", 542 | "**+render**(self, WINDOW = None, human=False):\n", 543 | "\n", 544 | "**Arguments:**\n", 545 | "* ***WINDOW*** (pygame.display): the pygame.display that the game will be rendered on.\n", 546 | "* ***human*** (bool): If a human will play the game, this argument is set to True, in this case pygame catch pressed keyboard keys to get the action that will be performed.\n", 547 | "\n", 548 | "**Explanation of render method line by line:**\n", 549 | "1. Check if the player is a human. 
If so, get the pressed key and translate it to the corresponding action (ex: if the right arrow is pressed then set action = 2, that means move the player on step to the right), then call step method to perform the chosen action.\n", 550 | "2. Update the field then start drawing the walls and the player as blocks.\n", 551 | "3. Print the score and the player's stamina.\n", 552 | "4. Finally, update the display to show the rendered screen." 553 | ] 554 | }, 555 | { 556 | "cell_type": "markdown", 557 | "metadata": {}, 558 | "source": [ 559 | "## Finally : Put it all together\n", 560 | "Now we are going to use everything we explained and play the game:\n", 561 | "\n", 562 | "The following code repeats the game until the player wins by getting a score higher than or equals winning_score, or quits the game." 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": null, 568 | "metadata": {}, 569 | "outputs": [], 570 | "source": [ 571 | "# Make an environment object\n", 572 | "env = Environment()\n", 573 | "# Change wall speed to 3 (one step every 3 frames)\n", 574 | "env.WALL_SPEED = 3\n", 575 | "\n", 576 | "# Initialize some variables \n", 577 | "WINDOW = pygame.display.set_mode((env.WINDOW_WIDTH, env.WINDOW_HEIGHT))\n", 578 | "clock = pygame.time.Clock()\n", 579 | "win = False\n", 580 | "winning_score = 100\n", 581 | "\n", 582 | "# Repeaat the game untill the player win (got a score of winning_score) or quits the game.\n", 583 | "while not win:\n", 584 | " score_increased = False\n", 585 | " game_over = False\n", 586 | " _ = env.reset()\n", 587 | " pygame.display.set_caption(\"Game\")\n", 588 | " while not game_over:\n", 589 | " clock.tick(27)\n", 590 | " env.render(WINDOW = WINDOW, human=True)\n", 591 | " game_over = env.game_over\n", 592 | " #####################################################\n", 593 | " sleep(0.5)\n", 594 | " WINDOW.fill(env.WHITE)\n", 595 | " if env.score >= winning_score:\n", 596 | " win = True\n", 597 | " env.print_text(WINDOW = WINDOW, text_cords = (env.WINDOW_WIDTH // 2, env.WINDOW_HEIGHT// 2),\n", 598 | " text = f\"You Win - Score : {env.score}\", color = env.RED, center = True)\n", 599 | " else:\n", 600 | " env.print_text(WINDOW = WINDOW, text_cords = (env.WINDOW_WIDTH // 2, env.WINDOW_HEIGHT// 2),\n", 601 | " text = f\"Game Over - Score : {env.score}\", color = env.RED, center = True)\n", 602 | " pygame.display.update()" 603 | ] 604 | }, 605 | { 606 | "cell_type": "markdown", 607 | "metadata": {}, 608 | "source": [ 609 | "You can get the full code [HERE](https://github.com/ModMaamari/reinforcement-learning-using-python)" 610 | ] 611 | } 612 | ], 613 | "metadata": { 614 | "kernelspec": { 615 | "display_name": "Python 3", 616 | "language": "python", 617 | "name": "python3" 618 | }, 619 | "language_info": { 620 | "codemirror_mode": { 621 | "name": "ipython", 622 | "version": 3 623 | }, 624 | "file_extension": ".py", 625 | "mimetype": "text/x-python", 626 | "name": "python", 627 | "nbconvert_exporter": "python", 628 | "pygments_lexer": "ipython3", 629 | "version": "3.7.5" 630 | } 631 | }, 632 | "nbformat": 4, 633 | "nbformat_minor": 2 634 | } 635 | -------------------------------------------------------------------------------- /Train_GridSearch.py: -------------------------------------------------------------------------------- 1 | from random import randint, choice 2 | from collections import deque 3 | from time import sleep 4 | import pygame, time 5 | import numpy as np 6 | import pandas as pd 7 | 8 | from keras.layers import Dense, Dropout, 
Conv2D, MaxPooling2D, Activation, Flatten 9 | from keras.layers import Input, BatchNormalization, GlobalMaxPooling2D 10 | from keras.callbacks import TensorBoard, ModelCheckpoint 11 | import keras.backend.tensorflow_backend as backend 12 | from keras.models import Sequential, Model 13 | from keras.models import load_model 14 | from keras.optimizers import Adam 15 | import tensorflow as tf 16 | from tqdm import tqdm 17 | import random 18 | import os 19 | 20 | # For more repetitive results 21 | random.seed(1) 22 | np.random.seed(1) 23 | tf.random.set_seed(1) 24 | 25 | PATH = "" 26 | # Create models folder 27 | if not os.path.isdir(f'{PATH}models'): 28 | os.makedirs(f'{PATH}models') 29 | # Create results folder 30 | if not os.path.isdir(f'{PATH}results'): 31 | os.makedirs(f'{PATH}results') 32 | 33 | pygame.init() 34 | 35 | TstartTime = time.time() 36 | 37 | 38 | ###################################################################################### 39 | class Field: 40 | def __init__(self, height=10, width=5): 41 | self.width = width 42 | self.height = height 43 | self.body = np.zeros(shape=(self.height, self.width)) 44 | def update_field(self,walls, player): 45 | try: 46 | # Clear the field: 47 | self.body = np.zeros(shape=(self.height, self.width)) 48 | # Put the walls on the field: 49 | for wall in walls: 50 | if not wall.out_of_range : 51 | self.body[wall.y:min(wall.y+wall.height,self.height),:] = wall.body 52 | 53 | # Put the player on the field: 54 | self.body[player.y:player.y+player.height, 55 | player.x:player.x+player.width] += player.body 56 | except : 57 | pass 58 | ###################################################################################### 59 | class Wall: 60 | def __init__(self, height = 5, width=100, hole_width = 20, 61 | y = 0, speed = 1, field = None): 62 | self.height = height 63 | self.width = width 64 | self.hole_width = hole_width 65 | self.y = y 66 | self.speed = speed 67 | self.field = field 68 | self.body_unit = 1 69 | self.body = np.ones(shape = (self.height, self.width))*self.body_unit 70 | self.out_of_range = False 71 | self.create_hole() 72 | def create_hole(self): 73 | hole = np.zeros(shape = (self.height, self.hole_width)) 74 | hole_pos = randint(0,self.width-self.hole_width) 75 | self.body[ : , hole_pos:hole_pos+self.hole_width] = 0 76 | def move(self): 77 | self.y += self.speed 78 | self.out_of_range = True if ((self.y + self.height) > self.field.height) else False 79 | ###################################################################################### 80 | class Player: 81 | def __init__(self, height = 5, max_width = 10 , width=2, 82 | x = 0, y = 0, speed = 2): 83 | self.height = height 84 | self.max_width = max_width 85 | self.width = width 86 | self.x = x 87 | self.y = y 88 | self.speed = speed 89 | self.body_unit = 2 90 | self.body = np.ones(shape = (self.height, self.width))*self.body_unit 91 | self.stamina = 20 92 | self.max_stamina = 20 93 | def move(self, field, direction = 0 ): 94 | ''' 95 | Moves the player : 96 | - No change = 0 97 | - left, if direction = 1 98 | - right, if direction = 2 99 | ''' 100 | val2dir = {0:0 , 1:-1 , 2:1} 101 | direction = val2dir[direction] 102 | next_x = (self.x + self.speed*direction) 103 | if not (next_x + self.width > field.width or next_x < 0): 104 | self.x += self.speed*direction 105 | self.stamina -= 1 106 | def change_width(self, action = 0): 107 | ''' 108 | Change the player's width: 109 | - No change = 0 110 | - narrow by one unit = 3 111 | - widen by one unit = 4 112 | ''' 113 | val2act = {0:0 , 
3:-1 , 4:1} 114 | action = val2act[action] 115 | new_width = self.width+action 116 | player_end = self.x + new_width 117 | if new_width <= self.max_width and new_width > 0 and player_end <= self.max_width: 118 | self.width = new_width 119 | self.body = np.ones(shape = (self.height, self.width))*self.body_unit 120 | ###################################################################################### 121 | class Environment: 122 | P_HEIGHT = 2 # Height of the player 123 | F_HEIGHT = 20 # Height of the field 124 | W_HEIGHT = 2 # Height of the walls 125 | WIDTH = 10 # Width of the field and the walls 126 | MIN_H_WIDTH = 2 # Minimum width of the holes 127 | MAX_H_WIDTH = 6 # Maximum width of the holes 128 | MIN_P_WIDTH = 2 # Minimum Width of the player 129 | MAX_P_WIDTH = 6 # Maximum Width of the player 130 | HEIGHT_MUL = 30 # Height Multiplier (used to draw np.array as blocks in pygame ) 131 | WIDTH_MUL = 40 # Width Multiplier (used to draw np.array as blocks in pygame ) 132 | WINDOW_HEIGHT = (F_HEIGHT+1) * HEIGHT_MUL # Height of the pygame window 133 | WINDOW_WIDTH = (WIDTH) * WIDTH_MUL # Widh of the pygame window 134 | 135 | ENVIRONMENT_SHAPE = (F_HEIGHT,WIDTH,1) 136 | ACTION_SPACE = [0,1,2,3,4] 137 | ACTION_SPACE_SIZE = len(ACTION_SPACE) 138 | PUNISHMENT = -100 # Punishment increment 139 | REWARD = 10 # Reward increment 140 | score = 0 # Initial Score 141 | 142 | MOVE_WALL_EVERY = 4 # Every how many frames the wall moves. 143 | MOVE_PLAYER_EVERY = 1 # Every how many frames the player moves. 144 | frames_counter = 0 145 | 146 | def __init__(self): 147 | # Colors: 148 | self.BLACK = (25,25,25) 149 | self.WHITE = (255,255,255) 150 | self.RED = (255, 80, 80) 151 | self.BLUE = (80, 80, 255) 152 | self.field = self.walls = self.player = None 153 | self.current_state = self.reset() 154 | self.val2color = {0:self.WHITE, self.walls[0].body_unit:self.BLACK, 155 | self.player.body_unit:self.BLACK, self.MAX_VAL:self.RED} 156 | def reset(self): 157 | self.score = 0 158 | self.frames_counter = 0 159 | self.game_over = False 160 | 161 | self.field = Field(height=self.F_HEIGHT, width=self.WIDTH ) 162 | w1 = Wall( height = self.W_HEIGHT, width=self.WIDTH, 163 | hole_width = randint(self.MIN_H_WIDTH,self.MAX_H_WIDTH), 164 | field = self.field) 165 | self.walls = deque([w1]) 166 | p_width = randint(self.MIN_P_WIDTH,self.MAX_P_WIDTH) 167 | self.player = Player( height = self.P_HEIGHT, max_width = self.WIDTH, 168 | width = p_width, 169 | x = randint(0,self.field.width-p_width), 170 | y = int(self.field.height*0.7), speed = 1) 171 | self.MAX_VAL = self.player.body_unit + w1.body_unit 172 | # Update the field : 173 | self.field.update_field(self.walls, self.player) 174 | 175 | observation = self.field.body/self.MAX_VAL 176 | return observation 177 | def print_text(self, WINDOW = None, text_cords = (0,0), center = False, 178 | text = "", color = (0,0,0), size = 32): 179 | pygame.init() 180 | font = pygame.font.Font('freesansbold.ttf', size) 181 | text_to_print = font.render(text, True, color) 182 | textRect = text_to_print.get_rect() 183 | if center: 184 | textRect.center = text_cords 185 | else: 186 | textRect.x = text_cords[0] 187 | textRect.y = text_cords[1] 188 | WINDOW.blit(text_to_print, textRect) 189 | 190 | def step(self, action): 191 | global score_increased 192 | 193 | self.frames_counter += 1 194 | reward = 0 195 | 196 | # If the performed action is (move) then player.move method is called: 197 | if action in [1,2]: 198 | self.player.move(direction = action, field = self.field) 199 | # If the 
performed action is (change_width) then player.change_width method is called: 200 | if action in [3,4]: 201 | self.player.change_width(action = action) 202 | 203 | # Move the wall one step (one step every MOVE_WALL_EVERY frames): 204 | if self.frames_counter % self.MOVE_WALL_EVERY == 0: 205 | # move the wall one step 206 | self.walls[-1].move() 207 | # reset the frames counter 208 | self.frames_counter = 0 209 | 210 | # Update the field : 211 | self.field.update_field(self.walls, self.player) 212 | 213 | # If the player passed a wall successfully increase the reward +1 214 | if ((self.walls[-1].y) == (self.player.y + self.player.height)) and not score_increased : 215 | reward += self.REWARD 216 | self.score += self.REWARD 217 | 218 | # Increase player's stamina every time it passed a wall successfully 219 | self.player.stamina = min(self.player.max_stamina, self.player.stamina+10) 220 | # score_increased : a flag to make sure that reward increases once per wall 221 | score_increased = True 222 | 223 | 224 | # Lose Conditions : 225 | # C1 : The player hits a wall 226 | # C2 : Player's width was far thinner than hole's width 227 | # C3 : Player fully consumed its stamina (energy) 228 | lose_conds = [self.MAX_VAL in self.field.body, 229 | ((self.player.y == self.walls[-1].y) and (self.player.width < (self.walls[-1].hole_width-1))), 230 | self.player.stamina <=0] 231 | 232 | 233 | # If one lose condition or more happend, the game ends: 234 | if True in lose_conds: 235 | self.game_over = True 236 | reward = self.PUNISHMENT 237 | return self.field.body/self.MAX_VAL, reward, self.game_over 238 | 239 | # Check if a wall moved out of the scene: 240 | if self.walls[-1].out_of_range: 241 | # Create a new wall 242 | self.walls[-1] = Wall( height = self.W_HEIGHT, width = self.WIDTH, 243 | hole_width = randint(self.MIN_H_WIDTH,self.MAX_H_WIDTH), 244 | field = self.field) 245 | 246 | score_increased = False 247 | 248 | 249 | # Return New Observation , reward, game_over(bool) 250 | return self.field.body/self.MAX_VAL, reward, self.game_over 251 | 252 | def render(self, WINDOW = None, human=False): 253 | if human: 254 | ################ Check Actions ##################### 255 | action = 0 256 | events = pygame.event.get() 257 | for event in events: 258 | if event.type == pygame.QUIT: 259 | self.game_over = True 260 | if event.type == pygame.KEYDOWN: 261 | if event.key == pygame.K_LEFT: 262 | action = 1 263 | if event.key == pygame.K_RIGHT: 264 | action = 2 265 | 266 | if event.key == pygame.K_UP: 267 | action = 4 268 | if event.key == pygame.K_DOWN: 269 | action = 3 270 | ################## Step ############################ 271 | _,reward, self.game_over = self.step(action) 272 | ################ Draw Environment ################### 273 | WINDOW.fill(self.WHITE) 274 | self.field.update_field(self.walls, self.player) 275 | for r in range(self.field.body.shape[0]): 276 | for c in range(self.field.body.shape[1]): 277 | pygame.draw.rect(WINDOW, 278 | self.val2color[self.field.body[r][c]], 279 | (c*self.WIDTH_MUL, r*self.HEIGHT_MUL, self.WIDTH_MUL, self.HEIGHT_MUL)) 280 | 281 | self.print_text(WINDOW = WINDOW, text_cords = (self.WINDOW_WIDTH // 2, int(self.WINDOW_HEIGHT*0.1)), 282 | text = str(self.score), color = self.RED, center = True) 283 | self.print_text(WINDOW = WINDOW, text_cords = (0, int(self.WINDOW_HEIGHT*0.9)), 284 | text = str(self.player.stamina), color = self.RED) 285 | 286 | pygame.display.update() 287 | ###################################################################################### 288 | 
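# ----------------------------------------------------------------------------------
# Optional sanity check (not part of the original script): a minimal sketch showing
# how the Environment API above is consumed. reset() returns the first (normalized)
# observation and step(action) returns (observation, reward, game_over). Note that
# Environment.step() reads the module-level flag `score_increased`, so it must be
# initialized before stepping. Set RUN_ENV_SANITY_CHECK = True to roll a few random
# episodes without any agent (and without rendering).
RUN_ENV_SANITY_CHECK = False
if RUN_ENV_SANITY_CHECK:
    _check_env = Environment()
    for _ in range(3):
        _obs = _check_env.reset()
        score_increased = False        # module-level flag expected by Environment.step()
        _done = False
        while not _done:
            _obs, _rew, _done = _check_env.step(choice(_check_env.ACTION_SPACE))
    print("Environment sanity check passed, observation shape:", _obs.shape)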
class ModifiedTensorBoard(TensorBoard): 289 | # Overriding init to set initial step and writer (we want one log file for all .fit() calls) 290 | def __init__(self, name, **kwargs): 291 | super().__init__(**kwargs) 292 | self.step = 1 293 | self.writer = tf.summary.create_file_writer(self.log_dir) 294 | self._log_write_dir = os.path.join(self.log_dir, name) 295 | 296 | # Overriding this method to stop creating default log writer 297 | def set_model(self, model): 298 | pass 299 | 300 | # Overrided, saves logs with our step number 301 | # (otherwise every .fit() will start writing from 0th step) 302 | def on_epoch_end(self, epoch, logs=None): 303 | self.update_stats(**logs) 304 | 305 | # Overrided 306 | # We train for one batch only, no need to save anything at epoch end 307 | def on_batch_end(self, batch, logs=None): 308 | pass 309 | 310 | # Overrided, so won't close writer 311 | def on_train_end(self, _): 312 | pass 313 | 314 | def on_train_batch_end(self, batch, logs=None): 315 | pass 316 | 317 | # Custom method for saving own metrics 318 | # Creates writer, writes custom metrics and closes writer 319 | def update_stats(self, **stats): 320 | self._write_logs(stats, self.step) 321 | 322 | def _write_logs(self, logs, index): 323 | with self.writer.as_default(): 324 | for name, value in logs.items(): 325 | tf.summary.scalar(name, value, step=index) 326 | self.step += 1 327 | self.writer.flush() 328 | ###################################################################################### 329 | # Agent class 330 | class DQNAgent: 331 | def __init__(self, name, env, conv_list, dense_list, util_list): 332 | self.env = env 333 | self.conv_list = conv_list 334 | self.dense_list = dense_list 335 | self.name = [str(name) +" | " + "".join(str(c)+"C | " for c in conv_list) + "".join(str(d) + "D | " for d in dense_list) + "".join(u + " | " for u in util_list) ][0] 336 | 337 | # Main model 338 | self.model = self.create_model(self.conv_list, self.dense_list) 339 | 340 | # Target network 341 | self.target_model = self.create_model(self.conv_list, self.dense_list) 342 | self.target_model.set_weights(self.model.get_weights()) 343 | 344 | # An array with last n steps for training 345 | self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE) 346 | 347 | # Custom tensorboard object 348 | self.tensorboard = ModifiedTensorBoard(name, log_dir="{}logs/{}-{}".format(PATH, name, int(time.time()))) 349 | 350 | # Used to count when to update target network with main network's weights 351 | self.target_update_counter = 0 352 | 353 | 354 | # Creates a convolutional block given (filters) number of filters, (dropout) dropout rate, 355 | # (bn) a boolean variable indecating the use of BatchNormalization, 356 | # (pool) a boolean variable indecating the use of MaxPooling2D 357 | def conv_block(self, inp, filters=64, bn=True, pool=True, dropout = 0.2): 358 | _ = Conv2D(filters=filters, kernel_size=3, activation='relu')(inp) 359 | if bn: 360 | _ = BatchNormalization()(_) 361 | if pool: 362 | _ = MaxPooling2D(pool_size=(2, 2))(_) 363 | if dropout > 0: 364 | _ = Dropout(0.2)(_) 365 | return _ 366 | # Creates the model with the given specifications: 367 | def create_model(self, conv_list, dense_list): 368 | # Defines the input layer with shape = ENVIRONMENT_SHAPE 369 | input_layer = Input(shape=self.env.ENVIRONMENT_SHAPE) 370 | # Defines the first convolutional block: 371 | _ = self.conv_block(input_layer, filters=conv_list[0], bn=False, pool=False) 372 | # If number of convolutional layers is 2 or more, use a loop to create them. 
373 | if len(conv_list)>1: 374 | for c in conv_list[1:]: 375 | _ = self.conv_block(_, filters=c) 376 | # Flatten the output of the last convolutional layer. 377 | _ = Flatten()(_) 378 | 379 | # Creating the dense layers: 380 | for d in dense_list: 381 | _ = Dense(units=d, activation='relu')(_) 382 | # The output layer has 5 nodes (one node per action) 383 | output = Dense(units=self.env.ACTION_SPACE_SIZE, 384 | activation='linear', name='output')(_) 385 | 386 | # Put it all together: 387 | model = Model(inputs=input_layer, outputs=[output]) 388 | model.compile(optimizer=Adam(lr=0.001), 389 | loss={'output': 'mse'}, 390 | metrics={'output': 'accuracy'}) 391 | 392 | return model 393 | 394 | # Adds step's data to a memory replay array 395 | # (observation space, action, reward, new observation space, done) 396 | def update_replay_memory(self, transition): 397 | self.replay_memory.append(transition) 398 | 399 | # Trains main network every step during episode 400 | def train(self, terminal_state, step): 401 | # Start training only if certain number of samples is already saved 402 | if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE: 403 | return 404 | 405 | # Get a minibatch of random samples from memory replay table 406 | minibatch = random.sample(self.replay_memory, MINIBATCH_SIZE) 407 | 408 | # Get current states from minibatch, then query NN model for Q values 409 | current_states = np.array([transition[0] for transition in minibatch]) 410 | current_qs_list = self.model.predict(current_states.reshape(-1, *env.ENVIRONMENT_SHAPE)) 411 | 412 | 413 | # Get future states from minibatch, then query NN model for Q values 414 | # When using target network, query it, otherwise main network should be queried 415 | new_current_states = np.array([transition[3] for transition in minibatch]) 416 | future_qs_list = self.target_model.predict(new_current_states.reshape(-1, *env.ENVIRONMENT_SHAPE)) 417 | 418 | X = [] 419 | y = [] 420 | 421 | # Now we need to enumerate our batches 422 | for index, (current_state, action, reward, new_current_state, done) in enumerate(minibatch): 423 | 424 | # If not a terminal state, get new q from future states, otherwise set it to 0 425 | # almost like with Q Learning, but we use just part of equation here 426 | if not done: 427 | max_future_q = np.max(future_qs_list[index]) 428 | new_q = reward + DISCOUNT * max_future_q 429 | else: 430 | new_q = reward 431 | 432 | # Update Q value for given state 433 | current_qs = current_qs_list[index] 434 | current_qs[action] = new_q 435 | 436 | # And append to our training data 437 | X.append(current_state) 438 | y.append(current_qs) 439 | 440 | 441 | # Fit on all samples as one batch, log only on terminal state 442 | self.model.fit(x = np.array(X).reshape(-1, *env.ENVIRONMENT_SHAPE), 443 | y = np.array(y), 444 | batch_size = MINIBATCH_SIZE, verbose = 0, 445 | shuffle=False, callbacks=[self.tensorboard] if terminal_state else None) 446 | 447 | # Update target network counter every episode 448 | if terminal_state: 449 | self.target_update_counter += 1 450 | 451 | # If counter reaches set value, update target network with weights of main network 452 | if self.target_update_counter > UPDATE_TARGET_EVERY: 453 | self.target_model.set_weights(self.model.get_weights()) 454 | self.target_update_counter = 0 455 | 456 | # Queries main network for Q values given current observation space (environment state) 457 | def get_qs(self, state): 458 | return self.model.predict(state.reshape(-1, *env.ENVIRONMENT_SHAPE)) 459 | 
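# ----------------------------------------------------------------------------------
# Illustration (not part of the original script): the Q-target assembled inside
# DQNAgent.train() follows the standard DQN update,
#     target = reward                                         if the episode ended
#     target = reward + DISCOUNT * max_a' Q_target(s', a')    otherwise,
# where Q_target is the periodically synced target network. The tiny helper below
# reproduces that arithmetic on made-up numbers; the name `_dqn_target` is ours.
def _dqn_target(reward, done, future_qs, discount):
    """Scalar Q-target used when fitting the main network on one transition."""
    return reward if done else reward + discount * np.max(future_qs)

# e.g. passing a wall (reward +10) vs. losing (reward -100, terminal state):
assert np.isclose(_dqn_target(10, False, np.array([1.0, 2.0, 0.5]), 0.99), 10 + 0.99 * 2.0)
assert np.isclose(_dqn_target(-100, True, np.array([1.0, 2.0, 0.5]), 0.99), -100)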
###################################################################################### 460 | def save_model_and_weights(agent, model_name, episode, max_reward, average_reward, min_reward): 461 | checkpoint_name = f"{model_name}| Eps({episode}) | max({max_reward:_>7.2f}) | avg({average_reward:_>7.2f}) | min({min_reward:_>7.2f}).model" 462 | agent.model.save(f'{PATH}models/{checkpoint_name}') 463 | best_weights = agent.model.get_weights() 464 | return best_weights 465 | ###################################################################################### 466 | # ## Constants: 467 | # RL Constants: 468 | DISCOUNT = 0.99 469 | REPLAY_MEMORY_SIZE = 3_000 # How many last steps to keep for model training 470 | MIN_REPLAY_MEMORY_SIZE = 1_000 # Minimum number of steps in a memory to start training 471 | UPDATE_TARGET_EVERY = 20 # Terminal states (end of episodes) 472 | MIN_REWARD = 1000 # For model save 473 | SAVE_MODEL_EVERY = 1000 # Episodes 474 | SHOW_EVERY = 20 # Episodes 475 | EPISODES = 100 # Number of episodes 476 | # Stats settings 477 | AGGREGATE_STATS_EVERY = 20 # episodes 478 | SHOW_PREVIEW = False 479 | ###################################################################################### 480 | # Models Arch : 481 | # [{[conv_list], [dense_list], [util_list], MINIBATCH_SIZE, {EF_Settings}, {ECC_Settings}} ] 482 | 483 | models_arch = [ {"conv_list":[32], "dense_list":[32,32], "util_list":["ECC2", "1A-5Ac"], 484 | "MINIBATCH_SIZE":128, "best_only":False, 485 | "EF_Settings":{"EF_Enabled":False}, "ECC_Settings":{"ECC_Enabled":False}}, 486 | 487 | {"conv_list":[32], "dense_list":[32,32,32], "util_list":["ECC2", "1A-5Ac"], 488 | "MINIBATCH_SIZE":128, "best_only":False, 489 | "EF_Settings":{"EF_Enabled":False}, "ECC_Settings":{"ECC_Enabled":False}}, 490 | 491 | {"conv_list":[32], "dense_list":[32,32], "util_list":["ECC2", "1A-5Ac"], 492 | "MINIBATCH_SIZE":128, "best_only":False, 493 | "EF_Settings":{"EF_Enabled":True, "FLUCTUATIONS":2}, 494 | "ECC_Settings":{"ECC_Enabled":True, "MAX_EPS_NO_INC":int(EPISODES*0.2)}}] 495 | 496 | # A dataframe used to store grid search results 497 | res = pd.DataFrame(columns = ["Model Name","Convolution Layers", "Dense Layers", "Batch Size", "ECC", "EF", 498 | "Best Only" , "Average Reward", "Best Average", "Epsilon 4 Best Average", 499 | "Best Average On", "Max Reward", "Epsilon 4 Max Reward", "Max Reward On", 500 | "Total Training Time (min)", "Time Per Episode (sec)"]) 501 | ###################################################################################### 502 | # Grid Search: 503 | for i, m in enumerate(models_arch): 504 | startTime = time.time() # Used to count episode training time 505 | MINIBATCH_SIZE = m["MINIBATCH_SIZE"] 506 | 507 | # Exploration settings : 508 | # Epsilon Fluctuation (EF): 509 | EF_Enabled = m["EF_Settings"]["EF_Enabled"] # Enable Epsilon Fluctuation 510 | MAX_EPSILON = 1 # Maximum epsilon value 511 | MIN_EPSILON = 0.001 # Minimum epsilon value 512 | if EF_Enabled: 513 | FLUCTUATIONS = m["EF_Settings"]["FLUCTUATIONS"] # How many times epsilon will fluctuate 514 | FLUCTUATE_EVERY = int(EPISODES/FLUCTUATIONS) # Episodes 515 | EPSILON_DECAY = MAX_EPSILON - (MAX_EPSILON/FLUCTUATE_EVERY) 516 | epsilon = 1 # not a constant, going to be decayed 517 | else: 518 | EPSILON_DECAY = MAX_EPSILON - (MAX_EPSILON/(0.8*EPISODES)) 519 | epsilon = 1 # not a constant, going to be decayed 520 | 521 | # Initialize some variables: 522 | best_average = -100 523 | best_score = -100 524 | 525 | # Epsilon Conditional Constantation (ECC): 526 | ECC_Enabled 
= m["ECC_Settings"]["ECC_Enabled"] 527 | avg_reward_info = [[1, best_average, epsilon]] # [[episode1, reward1 , epsilon1] ... [episode_n, reward_n , epsilon_n]] 528 | max_reward_info = [[1, best_score , epsilon]] 529 | if ECC_Enabled : MAX_EPS_NO_INC = m["ECC_Settings"]["MAX_EPS_NO_INC"] # Maximum number of episodes without any increment in reward average 530 | eps_no_inc_counter = 0 # Counts episodes with no increment in reward 531 | 532 | 533 | # For stats 534 | ep_rewards = [best_average] 535 | 536 | 537 | 538 | env = Environment() 539 | env.MOVE_WALL_EVERY = 1 # Every how many frames the wall moves. 540 | 541 | 542 | agent = DQNAgent(f"M{i}", env, m["conv_list"], m["dense_list"], m["util_list"]) 543 | MODEL_NAME = agent.name 544 | 545 | 546 | best_weights = [agent.model.get_weights()] 547 | 548 | # Uncomment these two lines if you want to show preview on your screen 549 | # WINDOW = pygame.display.set_mode((env.WINDOW_WIDTH, env.WINDOW_HEIGHT)) 550 | # clock = pygame.time.Clock() 551 | 552 | # Iterate over episodes 553 | for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit='episodes'): 554 | if m["best_only"]: agent.model.set_weights(best_weights[0]) 555 | # agent.target_model.set_weights(best_weights[0]) 556 | 557 | score_increased = False 558 | # Update tensorboard step every episode 559 | agent.tensorboard.step = episode 560 | 561 | # Restarting episode - reset episode reward and step number 562 | episode_reward = 0 563 | step = 1 564 | action = 0 565 | # Reset environment and get initial state 566 | current_state = env.reset() 567 | game_over = env.game_over 568 | while not game_over: 569 | # This part stays mostly the same, the change is to query a model for Q values 570 | if np.random.random() > epsilon: 571 | # Get action from Q table 572 | action = np.argmax(agent.get_qs(current_state)) 573 | 574 | else: 575 | # Get random action 576 | action = choice(env.ACTION_SPACE) 577 | 578 | new_state, reward, game_over = env.step(action) 579 | 580 | # Transform new continuous state to new discrete state and count reward 581 | episode_reward += reward 582 | 583 | # Uncomment the next block if you want to show preview on your screen 584 | # if SHOW_PREVIEW and not episode % SHOW_EVERY: 585 | # clock.tick(27) 586 | # env.render(WINDOW) 587 | 588 | # Every step we update replay memory and train main network 589 | agent.update_replay_memory((current_state, action, reward, new_state, game_over)) 590 | agent.train(game_over, step) 591 | 592 | current_state = new_state 593 | step += 1 594 | 595 | if ECC_Enabled : eps_no_inc_counter += 1 596 | # Append episode reward to a list and log stats (every given number of episodes) 597 | ep_rewards.append(episode_reward) 598 | 599 | if not episode % AGGREGATE_STATS_EVERY: 600 | average_reward = sum(ep_rewards[-AGGREGATE_STATS_EVERY:])/len(ep_rewards[-AGGREGATE_STATS_EVERY:]) 601 | min_reward = min(ep_rewards[-AGGREGATE_STATS_EVERY:]) 602 | max_reward = max(ep_rewards[-AGGREGATE_STATS_EVERY:]) 603 | agent.tensorboard.update_stats(reward_avg=average_reward, reward_min=min_reward, reward_max=max_reward, epsilon=epsilon) 604 | 605 | # Save models, but only when avg reward is greater or equal a set value 606 | if not episode % SAVE_MODEL_EVERY: 607 | # Save Agent : 608 | _ = save_model_and_weights(agent, MODEL_NAME, episode, max_reward, average_reward, min_reward) 609 | 610 | 611 | if average_reward > best_average: 612 | best_average = average_reward 613 | # update ECC variables: 614 | avg_reward_info.append([episode, best_average, epsilon]) 615 | 
eps_no_inc_counter = 0 616 | # Save Agent : 617 | best_weights[0] = save_model_and_weights(agent, MODEL_NAME, episode, max_reward, average_reward, min_reward) 618 | 619 | if ECC_Enabled and eps_no_inc_counter >= MAX_EPS_NO_INC: 620 | epsilon = avg_reward_info[-1][2] # Get epsilon value of the last best reward 621 | eps_no_inc_counter = 0 622 | 623 | if episode_reward > best_score: 624 | try: 625 | best_score = episode_reward 626 | max_reward_info.append([episode, best_score, epsilon]) 627 | 628 | # Save Agent : 629 | best_weights[0] = save_model_and_weights(agent, MODEL_NAME, episode, max_reward, average_reward, min_reward) 630 | 631 | except: 632 | pass 633 | 634 | # Decay epsilon 635 | if epsilon > MIN_EPSILON: 636 | epsilon *= EPSILON_DECAY 637 | epsilon = max(MIN_EPSILON, epsilon) 638 | 639 | # Epsilon Fluctuation: 640 | if EF_Enabled: 641 | if not episode % FLUCTUATE_EVERY: 642 | epsilon = MAX_EPSILON 643 | 644 | endTime = time.time() 645 | total_train_time_sec = round((endTime - startTime)) 646 | total_train_time_min = round((endTime - startTime)/60,2) 647 | time_per_episode_sec = round((total_train_time_sec)/EPISODES,3) 648 | 649 | # Get Average reward: 650 | average_reward = round(sum(ep_rewards)/len(ep_rewards), 2) 651 | 652 | # Update Results DataFrames: 653 | res = res.append({"Model Name":MODEL_NAME, "Convolution Layers":m["conv_list"], "Dense Layers":m["dense_list"], 654 | "Batch Size":m["MINIBATCH_SIZE"], "ECC":m["ECC_Settings"], "EF":m["EF_Settings"], 655 | "Best Only":m["best_only"], "Average Reward":average_reward, 656 | "Best Average":avg_reward_info[-1][1], "Epsilon 4 Best Average":avg_reward_info[-1][2], 657 | "Best Average On":avg_reward_info[-1][0], "Max Reward":max_reward_info[-1][1], 658 | "Epsilon 4 Max Reward":max_reward_info[-1][2], "Max Reward On":max_reward_info[-1][0], 659 | "Total Training Time (min)":total_train_time_min, "Time Per Episode (sec)":time_per_episode_sec} 660 | , ignore_index=True) 661 | res = res.sort_values(by = 'Best Average') 662 | avg_df = pd.DataFrame(data = avg_reward_info, columns=["Episode", "Average Reward", "Epsilon"]) 663 | max_df = pd.DataFrame(data = max_reward_info, columns=["Episode", "Max Reward", "Epsilon"]) 664 | 665 | # Save dataFrames 666 | res.to_csv(f"{PATH}results/Results.csv") 667 | avg_df.to_csv(f"{PATH}results/{MODEL_NAME}-Results-Avg.csv") 668 | max_df.to_csv(f"{PATH}results/{MODEL_NAME}-Results-Max.csv") 669 | 670 | TendTime = time.time() 671 | ###################################################################################### 672 | print( f"Training took {round((TendTime - TstartTime)/60) } Minutes ") 673 | print( f"Training took {round((TendTime - TstartTime)/3600) } Hours ") 674 | ###################################################################################### 675 | -------------------------------------------------------------------------------- /images/EnvExp.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModMaamari/reinforcement-learning-using-python/fd535079d7ca95be856af9a505b327d4350cc0f0/images/EnvExp.jpg -------------------------------------------------------------------------------- /images/gifs/EnvPlayed.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModMaamari/reinforcement-learning-using-python/fd535079d7ca95be856af9a505b327d4350cc0f0/images/gifs/EnvPlayed.gif -------------------------------------------------------------------------------- 
/images/gifs/envExp.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModMaamari/reinforcement-learning-using-python/fd535079d7ca95be856af9a505b327d4350cc0f0/images/gifs/envExp.gif -------------------------------------------------------------------------------- /images/wall.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModMaamari/reinforcement-learning-using-python/fd535079d7ca95be856af9a505b327d4350cc0f0/images/wall.jpg --------------------------------------------------------------------------------