├── Game-TestHuman.py
├── LICENSE
├── README.md
├── Reinforcement Learning (RL) Using Python.ipynb
├── Train_GridSearch.py
└── images
    ├── EnvExp.jpg
    ├── gifs
    │   ├── EnvPlayed.gif
    │   └── envExp.gif
    └── wall.jpg
/Game-TestHuman.py:
--------------------------------------------------------------------------------
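# A minimal, human-playable sketch of the game in this repo: a wall with a hole in it
# moves down the field while the player moves left/right with the arrow keys.
# (Simplified from the notebook version: no stamina, width changes, or lose conditions.)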
1 | from random import randint
2 | from collections import deque
3 | from time import sleep
4 | import pygame
5 |
6 | pygame.init()
7 |
8 | class Field:
9 | def __init__(self, height=10, width=5):
10 | self.width = width
11 | self.height = height
12 | self.body = [[0] * width for _ in range(height)]
13 |
14 | def update_field(self, walls, player):
15 | self.body = [[0] * self.width for _ in range(self.height)]
16 |
17 | for wall in walls:
18 | if not wall.out_of_range:
19 | for i in range(wall.y, min(wall.y + wall.height, self.height)):
20 | self.body[i][:] = wall.body[i - wall.y][:]
21 |
22 | for i in range(player.y, min(player.y + player.height, self.height)):
23 | for j in range(player.x, min(player.x + player.width, self.width)):
24 | self.body[i][j] = player.body[i - player.y][j - player.x]
25 |
26 |
27 | class Wall:
28 | def __init__(self, height=5, width=100, hole_width=20, y=0, speed=1):
29 | self.height = height
30 | self.width = width
31 | self.hole_width = hole_width
32 | self.y = y
33 | self.speed = speed
34 | self.body_unit = 1
35 | self.body = [[self.body_unit] * width for _ in range(height)]
36 | self.out_of_range = False
37 | self.create_hole()
38 |
39 | def create_hole(self):
40 | hole_pos = randint(0, self.width - self.hole_width)
41 |         for row in range(self.height):  # carve the hole through the full wall height, as in the notebook version
42 |             self.body[row][hole_pos:hole_pos + self.hole_width] = [0] * self.hole_width
43 |
44 | def move(self):
45 | self.y += self.speed
46 | self.out_of_range = self.y + self.height > field.height
47 |
48 |
49 | class Player:
50 | def __init__(self, height=5, max_width=10, width=2, x=0, y=0, speed=2):
51 | self.height = height
52 | self.max_width = max_width
53 | self.width = width
54 | self.x = x
55 | self.y = y
56 | self.speed = speed
57 | self.body_unit = 2
58 | self.body = [[self.body_unit] * width for _ in range(height)]
59 |
60 | def move(self, direction=0):
61 | if direction == 1 and self.x > 0:
62 | self.x -= self.speed
63 | elif direction == 2 and self.x + self.width < field.width:
64 | self.x += self.speed
65 |
66 |
67 | class Environment:
68 | def __init__(self):
69 | self.BLACK = (25, 25, 25)
70 | self.WHITE = (255, 255, 255)
71 | self.RED = (255, 80, 80)
72 | self.field = self.walls = self.player = None
73 | self.current_state = self.reset()
74 |
75 | def reset(self):
76 | self.field = Field()
77 | self.walls = deque([Wall()])
78 |         self.player = Player(x=self.field.width // 2 - 1, y=self.field.height - 5)
79 | return self.field.body
80 |
81 | def step(self, action):
82 | reward = 0
83 |
84 | if action == 1 or action == 2:
85 | self.player.move(action)
86 |
87 | for wall in self.walls:
88 | wall.move()
89 |
90 | self.field.update_field(self.walls, self.player)
91 |
92 | if self.walls[-1].y == self.player.y + self.player.height:
93 | reward += 1
94 |
95 | return self.field.body, reward
96 |
97 | def render(self, window):
98 | window.fill(self.WHITE)
99 |
100 |         for r in range(self.field.height):
101 |             for c in range(self.field.width):
102 | color = self.WHITE if self.field.body[r][c] == 0 else self.BLACK
103 | pygame.draw.rect(window, color, (c * 40, r * 30, 40, 30))
104 |
105 | pygame.display.update()
106 |
107 |
108 | env = Environment()
109 | field = env.field
110 |
111 | WINDOW_WIDTH = field.width * 40
112 | WINDOW_HEIGHT = field.height * 30
113 | WINDOW = pygame.display.set_mode((WINDOW_WIDTH, WINDOW_HEIGHT))
114 |
115 | clock = pygame.time.Clock()
116 | game_over = False
117 |
118 | while not game_over:
119 | clock.tick(27)
120 | env.render(WINDOW)
121 |
122 | for event in pygame.event.get():
123 | if event.type == pygame.QUIT:
124 | game_over = True
125 | elif event.type == pygame.KEYDOWN:
126 | if event.key == pygame.K_LEFT:
127 | env.step(1)
128 | elif event.key == pygame.K_RIGHT:
129 | env.step(2)
130 |
131 | pygame.quit()
132 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Mohammed A. AL-Maamari
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Deep Reinforcement Learning (RL) Using Python
2 |
3 | 
4 |
5 | Explanation of the game rules | The game played by a human
6 | :-------------------------:|:-------------------------:
7 |  | 
8 |
9 | In this tutorial series, we are going through every step of building an expert Reinforcement Learning (RL) agent that is capable of playing games.
10 |
11 | This series is divided into three parts:
12 |
13 | * **Part 1**: Designing and Building the Game Environment. In this part, we will build a game environment and customize it so the RL agent can train on it.
14 |
15 | * **Part 2**: Building and Training the Deep Q-Network (DQN). In this part, we define and build the different layers of the DQN and train it.
16 |
17 | * **Part 3**: Test and Play the Game.
18 |
19 | We might also try making another simple game environment and use Q-Learning to create an agent that can play this simple game.
20 |
21 | ## The Motivation:
22 |
23 | Once, while deep in a YouTube rabbit hole, [THIS VIDEO](https://www.youtube.com/watch?v=k-rWB1jOt9s) about the **sense of self** in human babies was recommended to me. After watching it, a question popped into my mind: *"Can I develop an agent that is smart enough to have a sense of its body and the ability to change its features to accomplish a certain task?"*
24 |
25 | This series is my way of answering this question.
26 |
27 |
--------------------------------------------------------------------------------
/Reinforcement Learning (RL) Using Python.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Reinforcement Learning With Python | Part 1 | Creating The Environment"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Explanation of the game rules | The game played by a human\n",
15 | ":-------------------------:|:-------------------------:\n",
16 | " | "
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "In this tutorial series, we are going through every step of building an expert Reinforcement Learning (RL) agent that is capable to play games.\n",
24 | "\n",
25 | "**This series is divided into three parts:**\n",
26 | "- **Part 1:** Designing and Building the Game Environment. In this part we will build a game environment and customize it to make the RL agent able to train on it.\n",
27 | "- **Part 2:** Build and Train the Deep Q Neural Network (DQN). In this part, we define and build the different layers of DQN and train it.\n",
28 | "- **Part 3:** Test and Play the Game.\n",
29 | "\n",
30 | "We might also try making another simple game environment and use Q-Learning to create an agent that can play this simple game."
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
37 | "## Designing the Environment:\n",
38 | "\n",
39 | "For this environment, we want the agent to develop a sense of its body and how to change its body features to avoid losing the game."
40 | ]
41 | },
42 | {
43 | "cell_type": "markdown",
44 | "metadata": {},
45 | "source": [
46 | "### First: The Elements of The Environment:\n",
47 | "The Elements of The Environment | .\n",
48 | ":-------------------------:|:-------------------------:\n",
49 | "
"
50 | ]
51 | },
52 | {
53 | "cell_type": "markdown",
54 | "metadata": {},
55 | "source": [
56 | "#### 1- The Field:\n",
57 | "Contains all the other elements,we represent it in code by class named \"Field\" as follows:"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": null,
63 | "metadata": {},
64 | "outputs": [],
65 | "source": [
66 | "class Field:\n",
67 | " def __init__(self, height=10, width=5):\n",
68 | " self.width = width\n",
69 | " self.height = height\n",
70 | " self.body = np.zeros(shape=(self.height, self.width))\n",
71 | " \n",
72 | " def update_field(self,walls, player):\n",
73 | " try:\n",
74 | " # Clear the field:\n",
75 | " self.body = np.zeros(shape=(self.height, self.width))\n",
76 | " # Put the walls on the field:\n",
77 | " for wall in walls:\n",
78 | " if not wall.out_of_range :\n",
79 | " self.body[wall.y:min(wall.y+wall.height,self.height),:] = wall.body\n",
80 | "\n",
81 | " # Put the player on the field:\n",
82 | " self.body[player.y:player.y+player.height,\n",
83 | " player.x:player.x+player.width] += player.body \n",
84 | "        except Exception:\n",
85 | "            pass  # ignore indexing errors when an element lies partially outside the field"
86 | ]
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "metadata": {},
91 | "source": [
92 | "**Field attributes:**\n",
93 | "\n",
94 | "* ***width (int)*** : the width of the field (not in pixels)\n",
95 | "\n",
96 | "* ***height (int)*** : the height of the field (not in pixels)\n",
97 | "\n",
98 | "* ***body (np.array)*** : holds the array representation of the game elements (player and walls) \n",
99 | "\n",
100 | "This array is passed to the DQN, and also used to draw the interface using pygame.\n",
101 | "
\n",
102 | "**Field methods:**\n",
103 | "\n",
104 | "* ***update_field***(self, walls, player) : updates the field (clears it, then draws the walls and the player)."
105 | ]
106 | },
107 | {
108 | "cell_type": "markdown",
109 | "metadata": {},
110 | "source": [
111 | "#### 2- The Walls:"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {},
118 | "outputs": [],
119 | "source": [
120 | "class Wall: \n",
121 | " def __init__(self, height = 5, width=100, hole_width = 20,\n",
122 | " y = 0, speed = 1, field = None):\n",
123 | " self.height = height\n",
124 | " self.width = width\n",
125 | " self.hole_width = hole_width\n",
126 | " self.y = y\n",
127 | " self.speed = speed\n",
128 | " self.field = field\n",
129 | " self.body_unit = 1\n",
130 | " self.body = np.ones(shape = (self.height, self.width))*self.body_unit\n",
131 | " self.out_of_range = False\n",
132 | " self.create_hole()\n",
133 | " def create_hole(self):\n",
134 | "        # Pick a random horizontal position for the hole and carve it:\n",
135 | " hole_pos = randint(0,self.width-self.hole_width)\n",
136 | " self.body[ : , hole_pos:hole_pos+self.hole_width] = 0\n",
137 | " def move(self):\n",
138 | " self.y += self.speed\n",
139 | "        self.out_of_range = (self.y + self.height) > self.field.height"
140 | ]
141 | },
142 | {
143 | "cell_type": "markdown",
144 | "metadata": {},
145 | "source": [
146 | "The Wall | .\n",
147 | ":-------------------------:|:-------------------------:\n",
148 | "
"
149 | ]
150 | },
151 | {
152 | "cell_type": "markdown",
153 | "metadata": {},
154 | "source": [
155 | "**Wall attributes:**\n",
156 | "\n",
157 | "|Attribute |Type |Description |\n",
158 | "|------------|-----------|------------------------------------------------------------------------------------|\n",
159 | "|height |int |the wall's height |\n",
160 | "|width |int |the wall's width ( the same value as the field's width) |\n",
161 | "|hole_width |int |the hole's width (max value of hole_width should be field.width or wall.width) |\n",
162 | "|y |int |the vertical coordinate of the wall (y axis) (max value of y should be field.height)|\n",
163 | "|speed       |int        |speed of the wall (rows/step)                                                        |\n",
164 | "|field |Field |the field that contains the wall |\n",
165 | "|body_unit |int ; float|the number used to represent the wall in the array representation (in field.body) |\n",
166 | "|body |np.array |the wall's body |\n",
167 | "|out_of_range|bool |A flag used to delete the wall when it moves out of the field range. |"
168 | ]
169 | },
170 | {
171 | "cell_type": "markdown",
172 | "metadata": {},
173 | "source": [
174 | "**Wall methods:**\n",
175 | "\n",
176 | "* ***create_hole***(self): Creates a hole of width self.hole_width in the wall.\n",
177 | "* ***move***(self): Moves the wall vertically (each time it is called, the wall moves n rows downward, where n = self.speed)."
178 | ]
179 | },
180 | {
181 | "cell_type": "markdown",
182 | "metadata": {},
183 | "source": [
184 | "#### 3- The Player :"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "metadata": {},
191 | "outputs": [],
192 | "source": [
193 | "class Player:\n",
194 | " def __init__(self, height = 5, max_width = 10 , width=2,\n",
195 | " x = 0, y = 0, speed = 2):\n",
196 | " self.height = height\n",
197 | " self.max_width = max_width\n",
198 | " self.width = width\n",
199 | " self.x = x\n",
200 | " self.y = y\n",
201 | " self.speed = speed\n",
202 | " self.body_unit = 2\n",
203 | " self.body = np.ones(shape = (self.height, self.width))*self.body_unit\n",
204 | " self.stamina = 20\n",
205 | " self.max_stamina = 20\n",
206 | " def move(self, field, direction = 0 ):\n",
207 | " '''\n",
208 | " Moves the player :\n",
209 | " - No change = 0\n",
210 | " - left, if direction = 1\n",
211 | " - right, if direction = 2\n",
212 | " '''\n",
213 | " val2dir = {0:0 , 1:-1 , 2:1}\n",
214 | " direction = val2dir[direction]\n",
215 | " next_x = (self.x + self.speed*direction)\n",
216 | " if not (next_x + self.width > field.width or next_x < 0):\n",
217 | " self.x += self.speed*direction\n",
218 | " self.stamina -= 1 \n",
219 | " def change_width(self, action = 0):\n",
220 | " '''\n",
221 | " Change the player's width:\n",
222 | " - No change = 0\n",
223 | " - narrow by one unit = 3\n",
224 | " - widen by one unit = 4\n",
225 | " '''\n",
226 | " val2act = {0:0 , 3:-1 , 4:1}\n",
227 | " action = val2act[action]\n",
228 | " new_width = self.width+action\n",
229 | " player_end = self.x + new_width\n",
230 | " if new_width <= self.max_width and new_width > 0 and player_end <= self.max_width:\n",
231 | " self.width = new_width\n",
232 | " self.body = np.ones(shape = (self.height, self.width))*self.body_unit"
233 | ]
234 | },
235 | {
236 | "cell_type": "markdown",
237 | "metadata": {},
238 | "source": [
239 | "**Player attributes:**\n",
240 | "\n",
241 | "|Attribute |Type |Description |\n",
242 | "|------------|-----------|------------------------------------------------------------------------------------|\n",
243 | "|height |int |player's height |\n",
244 | "|max_width |int |player's maximum width (must be less than field.width) |\n",
245 | "|width       |int        |player's width (must be less than or equal to max_width and bigger than 0)          |\n",
246 | "|x |int |player's x coordinate in the field |\n",
247 | "|y |int |player's y coordinate in the field |\n",
248 | "|speed |int |player's speed (how many horizontal units it moves per step) |\n",
249 | "|body_unit |int ; float|the number used to represent the player in the array representation (in field.body) |\n",
250 | "|body |np.array |the player's body |\n",
251 | "|stamina |int ; float|player's energy (stamina) (when a player's energy hits zero the player dies) |\n",
252 | "|max_stamina |int ; float|maximum value for player's stamina |"
253 | ]
254 | },
255 | {
256 | "cell_type": "markdown",
257 | "metadata": {},
258 | "source": [
259 | "**Player methods:**\n",
260 | "* ***move***(self, field, direction = 0 ): Moves the player :\n",
261 | " - direction = 0 -> No change \n",
262 | " - direction = 1 -> left \n",
263 | " - direction = 2 -> right\n",
264 | "* ***change_width***(self, action = 0):\n",
265 | " - action = 0 -> No change\n",
266 | " - action = 3 -> narrow by one unit\n",
267 | " - action = 4 -> widen by one unit"
268 | ]
269 | },
270 | {
271 | "cell_type": "markdown",
272 | "metadata": {},
273 | "source": [
274 | "---"
275 | ]
276 | },
277 | {
278 | "cell_type": "markdown",
279 | "metadata": {},
280 | "source": [
281 | "## The \"Environment\" Class :\n",
282 | "This class facilitates the communication between the environment and the agent; it is designed to work either with an RL agent or with a human player."
283 | ]
284 | },
285 | {
286 | "cell_type": "markdown",
287 | "metadata": {},
288 | "source": [
289 | "### Main Components Needed by the RL Agent:\n",
290 | "- ***ENVIRONMENT_SHAPE*** attribute : used by the DQN to set the shape of the input layer.\n",
291 | "- ***ACTION_SPACE*** attribute : used by the DQN to set the shape of the output layer.\n",
292 | "- ***PUNISHMENT*** and ***REWARD*** : set the values of both punishment and reward, used to train the agent (we use these values to tell the agent if its previous actions were good or bad).\n",
293 | "- ***reset*** method : to reset the environment.\n",
294 | "- ***step*** method: takes an action as an argument and returns the next state, the reward, and a boolean named game_over that tells us whether the game has ended (the player lost).\n",
295 | "\n",
296 | "Our environment is no exception: it includes all the required components and more."
297 | ]
298 | },
299 | {
300 | "cell_type": "code",
301 | "execution_count": null,
302 | "metadata": {},
303 | "outputs": [],
304 | "source": [
305 | "class Environment:\n",
306 | " P_HEIGHT = 2 # Height of the player\n",
307 | " F_HEIGHT = 20 # Height of the field\n",
308 | " W_HEIGHT = 2 # Height of the walls\n",
309 | " WIDTH = 10 # Width of the field and the walls\n",
310 | " MIN_H_WIDTH = 2 # Minimum width of the holes\n",
311 | " MAX_H_WIDTH = 6 # Maximum width of the holes\n",
312 | " MIN_P_WIDTH = 2 # Minimum Width of the player\n",
313 | " MAX_P_WIDTH = 6 # Maximum Width of the player\n",
314 | " HEIGHT_MUL = 30 # Height Multiplier (used to draw np.array as blocks in pygame )\n",
315 | " WIDTH_MUL = 40 # Width Multiplier (used to draw np.array as blocks in pygame )\n",
316 | " WINDOW_HEIGHT = (F_HEIGHT+1) * HEIGHT_MUL # Height of the pygame window\n",
317 | "    WINDOW_WIDTH  = (WIDTH) * WIDTH_MUL       # Width of the pygame window\n",
318 | " \n",
319 | " ENVIRONMENT_SHAPE = (F_HEIGHT,WIDTH,1)\n",
320 | " ACTION_SPACE = [0,1,2,3,4]\n",
321 | " ACTION_SPACE_SIZE = len(ACTION_SPACE)\n",
322 | " PUNISHMENT = -100 # Punishment increment\n",
323 | " REWARD = 10 # Reward increment\n",
324 | " score = 0 # Initial Score\n",
325 | " \n",
326 | " MOVE_WALL_EVERY = 4 # Every how many frames the wall moves.\n",
327 | " MOVE_PLAYER_EVERY = 1 # Every how many frames the player moves.\n",
328 | " frames_counter = 0\n",
329 | "\n",
330 | " def __init__(self):\n",
331 | " # Colors:\n",
332 | " self.BLACK = (25,25,25)\n",
333 | " self.WHITE = (255,255,255)\n",
334 | " self.RED = (255, 80, 80)\n",
335 | " self.BLUE = (80, 80, 255)\n",
336 | " self.field = self.walls = self.player = None\n",
337 | " self.current_state = self.reset()\n",
338 | " self.val2color = {0:self.WHITE, self.walls[0].body_unit:self.BLACK,\n",
339 | " self.player.body_unit:self.BLACK, self.MAX_VAL:self.RED}\n",
340 | " def reset(self):\n",
341 | " self.score = 0\n",
342 | " self.frames_counter = 0\n",
343 | " self.game_over = False\n",
344 | " \n",
345 | " self.field = Field(height=self.F_HEIGHT, width=self.WIDTH )\n",
346 | " w1 = Wall( height = self.W_HEIGHT, width=self.WIDTH,\n",
347 | " hole_width = randint(self.MIN_H_WIDTH,self.MAX_H_WIDTH),\n",
348 | " field = self.field)\n",
349 | " self.walls = deque([w1])\n",
350 | " p_width = randint(self.MIN_P_WIDTH,self.MAX_P_WIDTH)\n",
351 | " self.player = Player( height = self.P_HEIGHT, max_width = self.WIDTH,\n",
352 | " width = p_width,\n",
353 | " x = randint(0,self.field.width-p_width),\n",
354 | " y = int(self.field.height*0.7), speed = 1)\n",
355 | " self.MAX_VAL = self.player.body_unit + w1.body_unit\n",
356 | " # Update the field :\n",
357 | " self.field.update_field(self.walls, self.player)\n",
358 | " \n",
359 | " observation = self.field.body/self.MAX_VAL\n",
360 | " return observation\n",
361 | " def print_text(self, WINDOW = None, text_cords = (0,0), center = False,\n",
362 | " text = \"\", color = (0,0,0), size = 32):\n",
363 | " pygame.init()\n",
364 | " font = pygame.font.Font('freesansbold.ttf', size) \n",
365 | " text_to_print = font.render(text, True, color) \n",
366 | " textRect = text_to_print.get_rect()\n",
367 | " if center:\n",
368 | " textRect.center = text_cords\n",
369 | " else:\n",
370 | " textRect.x = text_cords[0]\n",
371 | " textRect.y = text_cords[1]\n",
372 | " WINDOW.blit(text_to_print, textRect)\n",
373 | " \n",
374 | " def step(self, action):\n",
375 | " global score_increased\n",
376 | "\n",
377 | " self.frames_counter += 1\n",
378 | " reward = 0\n",
379 | "\n",
380 | " # If the performed action is (move) then player.move method is called:\n",
381 | " if action in [1,2]:\n",
382 | " self.player.move(direction = action, field = self.field)\n",
383 | " # If the performed action is (change_width) then player.change_width method is called:\n",
384 | " if action in [3,4]:\n",
385 | " self.player.change_width(action = action) \n",
386 | " \n",
387 | "        # Move the wall one step (one step every MOVE_WALL_EVERY frames):\n",
388 | "        if self.frames_counter % self.MOVE_WALL_EVERY == 0:\n",
389 | " # move the wall one step\n",
390 | " self.walls[-1].move()\n",
391 | " # reset the frames counter\n",
392 | " self.frames_counter = 0\n",
393 | " \n",
394 | " # Update the field :\n",
395 | " self.field.update_field(self.walls, self.player)\n",
396 | "\n",
397 | " # If the player passed a wall successfully increase the reward +1\n",
398 | " if ((self.walls[-1].y) == (self.player.y + self.player.height)) and not score_increased :\n",
399 | " reward += self.REWARD\n",
400 | " self.score += self.REWARD\n",
401 | " \n",
402 | " # Increase player's stamina every time it passed a wall successfully \n",
403 | " self.player.stamina = min(self.player.max_stamina, self.player.stamina+10)\n",
404 | " # score_increased : a flag to make sure that reward increases once per wall \n",
405 | " score_increased = True\n",
406 | " \n",
407 | " \n",
408 | " # Lose Conditions : \n",
409 | " # C1 : The player hits a wall\n",
410 | " # C2 : Player's width was far thinner than hole's width\n",
411 | " # C3 : Player fully consumed its stamina (energy)\n",
412 | " lose_conds = [self.MAX_VAL in self.field.body,\n",
413 | " ((self.player.y == self.walls[-1].y) and (self.player.width < (self.walls[-1].hole_width-1))),\n",
414 | " self.player.stamina <=0]\n",
415 | " \n",
416 | "\n",
417 | "        # If one or more lose conditions happened, the game ends:\n",
418 | " if True in lose_conds:\n",
419 | " self.game_over = True\n",
420 | " reward = self.PUNISHMENT\n",
421 | " return self.field.body/self.MAX_VAL, reward, self.game_over\n",
422 | "\n",
423 | " # Check if a wall moved out of the scene:\n",
424 | " if self.walls[-1].out_of_range:\n",
425 | " # Create a new wall\n",
426 | " self.walls[-1] = Wall( height = self.W_HEIGHT, width = self.WIDTH,\n",
427 | " hole_width = randint(self.MIN_H_WIDTH,self.MAX_H_WIDTH),\n",
428 | " field = self.field)\n",
429 | "\n",
430 | " score_increased = False\n",
431 | "\n",
432 | " \n",
433 | " # Return New Observation , reward, game_over(bool)\n",
434 | " return self.field.body/self.MAX_VAL, reward, self.game_over\n",
435 | " \n",
436 | " def render(self, WINDOW = None, human=False):\n",
437 | " if human:\n",
438 | " ################ Check Actions #####################\n",
439 | " action = 0\n",
440 | " events = pygame.event.get()\n",
441 | " for event in events:\n",
442 | " if event.type == pygame.QUIT:\n",
443 | " self.game_over = True\n",
444 | " if event.type == pygame.KEYDOWN:\n",
445 | " if event.key == pygame.K_LEFT:\n",
446 | " action = 1\n",
447 | " if event.key == pygame.K_RIGHT:\n",
448 | " action = 2\n",
449 | "\n",
450 | " if event.key == pygame.K_UP:\n",
451 | " action = 4\n",
452 | " if event.key == pygame.K_DOWN:\n",
453 | " action = 3\n",
454 | " ################## Step ############################ \n",
455 | " _,reward, self.game_over = self.step(action)\n",
456 | " ################ Draw Environment ###################\n",
457 | " WINDOW.fill(self.WHITE)\n",
458 | " self.field.update_field(self.walls, self.player)\n",
459 | " for r in range(self.field.body.shape[0]):\n",
460 | " for c in range(self.field.body.shape[1]):\n",
461 | " pygame.draw.rect(WINDOW,\n",
462 | " self.val2color[self.field.body[r][c]],\n",
463 | " (c*self.WIDTH_MUL, r*self.HEIGHT_MUL, self.WIDTH_MUL, self.HEIGHT_MUL))\n",
464 | "\n",
465 | " self.print_text(WINDOW = WINDOW, text_cords = (self.WINDOW_WIDTH // 2, int(self.WINDOW_HEIGHT*0.1)),\n",
466 | " text = str(self.score), color = self.RED, center = True)\n",
467 | " self.print_text(WINDOW = WINDOW, text_cords = (0, int(self.WINDOW_HEIGHT*0.9)),\n",
468 | " text = str(self.player.stamina), color = self.RED)\n",
469 | " \n",
470 | " pygame.display.update()"
471 | ]
472 | },
473 | {
474 | "cell_type": "markdown",
475 | "metadata": {},
476 | "source": [
477 | "**Environment attributes:**\n",
478 | "\n",
479 | "|Attribute |Type |Description |\n",
480 | "|------------|-----------|------------------------------------------------------------------------------------|\n",
481 | "|P_HEIGHT |int |Height of the player |\n",
482 | "|F_HEIGHT |int |Height of the field |\n",
483 | "|W_HEIGHT |int |Height of the walls |\n",
484 | "|WIDTH |int |Width of the field and the walls |\n",
485 | "|MIN_H_WIDTH |int |Minimum width of the holes |\n",
486 | "|MAX_H_WIDTH |int |Maximum width of the holes |\n",
487 | "|MIN_P_WIDTH |int |Minimum Width of the player |\n",
488 | "|MAX_P_WIDTH |int |Maximum Width of the player |\n",
489 | "|HEIGHT_MUL |int |Height Multiplier (used to draw np.array as blocks in pygame ) |\n",
490 | "|WIDTH_MUL |int |Width Multiplier (used to draw np.array as blocks in pygame ) |\n",
491 | "|WINDOW_HEIGHT|int |Height of the pygame window |\n",
492 | "|WINDOW_WIDTH|int |Width of the pygame window |\n",
493 | "|ENVIRONMENT_SHAPE|tuple |(field height ; field width ; 1) |\n",
494 | "|ACTION_SPACE|list |list of actions an agent can perform |\n",
495 | "|ACTION_SPACE_SIZE|int |number of actions an agent can perform |\n",
496 | "|PUNISHMENT |int ; float|Punishment increment |\n",
497 | "|REWARD |int ; float|Reward increment |\n",
498 | "|score |int ; float|Initial Score |\n",
499 | "|MOVE_WALL_EVERY|int |Every how many frames the wall moves. |\n",
500 | "|MOVE_PLAYER_EVERY|int |Every how many frames the player moves. |\n",
501 | "|frames_counter|int |used to handle the wall speed |\n",
502 | "|field |Field | the field object that holds walls and players |\n",
503 | "|walls       |double ended queue of Wall objects|a queue of walls                                              |\n",
504 | "|player |Player |the player object |\n",
505 | "|current_state|np.array |holds the current state of the field (the array representation of the game field) |\n",
506 | "|val2color |dictionary |used to color the blocks depending on their values (ex: if you want to color the player RED you will put 'self.player.body_unit:RED' in val2color dictionary)|\n",
507 | "|MAX_VAL |int ; float| used to detect collisions between walls and players (MAX_VAL = self.player.body_unit + self.wall.body_unit) |\n"
508 | ]
509 | },
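{
"cell_type": "markdown",
"metadata": {},
"source": [
"A tiny sketch of the MAX_VAL collision trick: update_field *adds* the player's body on top of the walls, so any overlapping cell sums to player.body_unit + wall.body_unit = MAX_VAL:\n",
"\n",
"```python\n",
"import numpy as np\n",
"\n",
"field = np.zeros((3, 3))\n",
"field[0, :] = 1     # a wall row (body_unit = 1)\n",
"field[0, 1] += 2    # a player cell overlapping the wall (body_unit = 2)\n",
"print(3 in field)   # True -> MAX_VAL reached, i.e. a collision\n",
"```"
]
},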
510 | {
511 | "cell_type": "markdown",
512 | "metadata": {},
513 | "source": [
514 | "***Environment methods:***\n",
515 | "* \\__***init***__(self) : initializes the environment by initializing some attributes and calling the reset method.\n",
516 | "* ***reset***(self) : resets the environment and returns the state of the game field after resetting it.\n",
517 | "* ***print_text***(self, WINDOW = None, text_cords = (0,0), center = False, text = \"\", color = (0,0,0), size = 32): prints a text in a given pygame.display (WINDOW) with the given features.\n",
518 | "---\n",
519 | "**+ step(self, action):**\n",
520 | "\n",
521 | "1. Call the player's move method to move the player.\n",
522 | "2. Call the player's change_width method to change the player's width.\n",
523 | "3. Move the wall one step.\n",
524 | "4. Update the field.\n",
525 | "5. Check if the player passed a wall successfully. If so, gives the player a reward and increase its stamina.\n",
526 | "6. Check the three losing conditions: the player loses the game if at least one of them is met.\n",
527 | "\n",
528 | "**Losing Conditions**:\n",
529 | "\n",
530 | "|Condition |Explanation|Code |\n",
531 | "|------------|-----------|------------------------------------------------------------------------------------|\n",
532 | "|C1 |The player hits a wall|self.MAX_VAL in self.field.body |\n",
533 | "|C2 |Player's width was far thinner than hole's width|((self.player.y == self.walls[-1].y) and (self.player.width < (self.walls[-1].hole_width-1)))|\n",
534 | "|C3 |Player fully consumed its stamina (energy)|self.player.stamina <=0 |\n",
535 | "\n",
536 | "\n",
537 | "When a player loses, the returned reward equals PUNISHMENT, and the game-state indicator (game_over) changes from False to True.\n",
538 | "\n",
539 | "7. Check whether the current wall hit the bottom of the field; when that happens, the out-of-range wall is replaced by a new one.\n",
540 | "8. Return the normalized next state, the reward, and game_over.\n",
541 | "---\n",
542 | "**+ render**(self, WINDOW = None, human=False):\n",
543 | "\n",
544 | "**Arguments:**\n",
545 | "* ***WINDOW*** (pygame.display): the pygame.display that the game will be rendered on.\n",
546 | "* ***human*** (bool): set to True when a human plays the game; in that case pygame catches the pressed keyboard keys to get the action to perform.\n",
547 | "\n",
548 | "**Explanation of the render method line by line:**\n",
549 | "1. Check if the player is a human. If so, get the pressed key and translate it to the corresponding action (e.g. if the right arrow is pressed, set action = 2, which moves the player one step to the right), then call the step method to perform the chosen action.\n",
550 | "2. Update the field then start drawing the walls and the player as blocks.\n",
551 | "3. Print the score and the player's stamina.\n",
552 | "4. Finally, update the display to show the rendered screen."
553 | ]
554 | },
555 | {
556 | "cell_type": "markdown",
557 | "metadata": {},
558 | "source": [
559 | "## Finally : Put it all together\n",
560 | "Now we are going to use everything we explained and play the game:\n",
561 | "\n",
562 | "The following code repeats the game until the player wins by getting a score higher than or equal to winning_score, or quits the game."
563 | ]
564 | },
565 | {
566 | "cell_type": "code",
567 | "execution_count": null,
568 | "metadata": {},
569 | "outputs": [],
570 | "source": [
571 | "# Make an environment object\n",
572 | "env = Environment()\n",
573 | "# Make the wall move one step every 3 frames\n",
574 | "env.MOVE_WALL_EVERY = 3\n",
575 | "\n",
576 | "# Initialize some variables \n",
577 | "WINDOW = pygame.display.set_mode((env.WINDOW_WIDTH, env.WINDOW_HEIGHT))\n",
578 | "clock = pygame.time.Clock()\n",
579 | "win = False\n",
580 | "winning_score = 100\n",
581 | "\n",
582 | "# Repeat the game until the player wins (reaches winning_score) or quits the game.\n",
583 | "while not win:\n",
584 | " score_increased = False\n",
585 | " game_over = False\n",
586 | " _ = env.reset()\n",
587 | " pygame.display.set_caption(\"Game\")\n",
588 | " while not game_over:\n",
589 | " clock.tick(27)\n",
590 | " env.render(WINDOW = WINDOW, human=True)\n",
591 | " game_over = env.game_over\n",
592 | " #####################################################\n",
593 | " sleep(0.5)\n",
594 | " WINDOW.fill(env.WHITE)\n",
595 | " if env.score >= winning_score:\n",
596 | " win = True\n",
597 | " env.print_text(WINDOW = WINDOW, text_cords = (env.WINDOW_WIDTH // 2, env.WINDOW_HEIGHT// 2),\n",
598 | " text = f\"You Win - Score : {env.score}\", color = env.RED, center = True)\n",
599 | " else:\n",
600 | " env.print_text(WINDOW = WINDOW, text_cords = (env.WINDOW_WIDTH // 2, env.WINDOW_HEIGHT// 2),\n",
601 | " text = f\"Game Over - Score : {env.score}\", color = env.RED, center = True)\n",
602 | " pygame.display.update()"
603 | ]
604 | },
605 | {
606 | "cell_type": "markdown",
607 | "metadata": {},
608 | "source": [
609 | "You can get the full code [HERE](https://github.com/ModMaamari/reinforcement-learning-using-python)"
610 | ]
611 | }
612 | ],
613 | "metadata": {
614 | "kernelspec": {
615 | "display_name": "Python 3",
616 | "language": "python",
617 | "name": "python3"
618 | },
619 | "language_info": {
620 | "codemirror_mode": {
621 | "name": "ipython",
622 | "version": 3
623 | },
624 | "file_extension": ".py",
625 | "mimetype": "text/x-python",
626 | "name": "python",
627 | "nbconvert_exporter": "python",
628 | "pygments_lexer": "ipython3",
629 | "version": "3.7.5"
630 | }
631 | },
632 | "nbformat": 4,
633 | "nbformat_minor": 2
634 | }
635 |
--------------------------------------------------------------------------------
/Train_GridSearch.py:
--------------------------------------------------------------------------------
1 | from random import randint, choice
2 | from collections import deque
3 | from time import sleep
4 | import pygame, time
5 | import numpy as np
6 | import pandas as pd
7 |
8 | from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Activation, Flatten
9 | from keras.layers import Input, BatchNormalization, GlobalMaxPooling2D
10 | from keras.callbacks import TensorBoard, ModelCheckpoint
11 | # import keras.backend.tensorflow_backend as backend   # TF1-only and unused (breaks under TF2)
12 | from keras.models import Sequential, Model
13 | from keras.models import load_model
14 | from keras.optimizers import Adam
15 | import tensorflow as tf
16 | from tqdm import tqdm
17 | import random
18 | import os
19 |
20 | # For more reproducible results
21 | random.seed(1)
22 | np.random.seed(1)
23 | tf.random.set_seed(1)
24 |
25 | PATH = ""
26 | # Create models folder
27 | if not os.path.isdir(f'{PATH}models'):
28 | os.makedirs(f'{PATH}models')
29 | # Create results folder
30 | if not os.path.isdir(f'{PATH}results'):
31 | os.makedirs(f'{PATH}results')
32 |
33 | pygame.init()
34 |
35 | TstartTime = time.time()
36 |
37 |
38 | ######################################################################################
39 | class Field:
40 | def __init__(self, height=10, width=5):
41 | self.width = width
42 | self.height = height
43 | self.body = np.zeros(shape=(self.height, self.width))
44 | def update_field(self,walls, player):
45 | try:
46 | # Clear the field:
47 | self.body = np.zeros(shape=(self.height, self.width))
48 | # Put the walls on the field:
49 | for wall in walls:
50 | if not wall.out_of_range :
51 | self.body[wall.y:min(wall.y+wall.height,self.height),:] = wall.body
52 |
53 | # Put the player on the field:
54 | self.body[player.y:player.y+player.height,
55 | player.x:player.x+player.width] += player.body
56 |         except Exception:
57 |             pass  # ignore indexing errors when an element lies partially outside the field
58 | ######################################################################################
59 | class Wall:
60 | def __init__(self, height = 5, width=100, hole_width = 20,
61 | y = 0, speed = 1, field = None):
62 | self.height = height
63 | self.width = width
64 | self.hole_width = hole_width
65 | self.y = y
66 | self.speed = speed
67 | self.field = field
68 | self.body_unit = 1
69 | self.body = np.ones(shape = (self.height, self.width))*self.body_unit
70 | self.out_of_range = False
71 | self.create_hole()
72 | def create_hole(self):
73 |         # Pick a random horizontal position for the hole and carve it:
74 | hole_pos = randint(0,self.width-self.hole_width)
75 | self.body[ : , hole_pos:hole_pos+self.hole_width] = 0
76 | def move(self):
77 | self.y += self.speed
78 |         self.out_of_range = (self.y + self.height) > self.field.height
79 | ######################################################################################
80 | class Player:
81 | def __init__(self, height = 5, max_width = 10 , width=2,
82 | x = 0, y = 0, speed = 2):
83 | self.height = height
84 | self.max_width = max_width
85 | self.width = width
86 | self.x = x
87 | self.y = y
88 | self.speed = speed
89 | self.body_unit = 2
90 | self.body = np.ones(shape = (self.height, self.width))*self.body_unit
91 | self.stamina = 20
92 | self.max_stamina = 20
93 | def move(self, field, direction = 0 ):
94 | '''
95 | Moves the player :
96 | - No change = 0
97 | - left, if direction = 1
98 | - right, if direction = 2
99 | '''
100 | val2dir = {0:0 , 1:-1 , 2:1}
101 | direction = val2dir[direction]
102 | next_x = (self.x + self.speed*direction)
103 | if not (next_x + self.width > field.width or next_x < 0):
104 | self.x += self.speed*direction
105 | self.stamina -= 1
106 | def change_width(self, action = 0):
107 | '''
108 | Change the player's width:
109 | - No change = 0
110 | - narrow by one unit = 3
111 | - widen by one unit = 4
112 | '''
113 | val2act = {0:0 , 3:-1 , 4:1}
114 | action = val2act[action]
115 | new_width = self.width+action
116 | player_end = self.x + new_width
117 | if new_width <= self.max_width and new_width > 0 and player_end <= self.max_width:
118 | self.width = new_width
119 | self.body = np.ones(shape = (self.height, self.width))*self.body_unit
120 | ######################################################################################
121 | class Environment:
122 | P_HEIGHT = 2 # Height of the player
123 | F_HEIGHT = 20 # Height of the field
124 | W_HEIGHT = 2 # Height of the walls
125 | WIDTH = 10 # Width of the field and the walls
126 | MIN_H_WIDTH = 2 # Minimum width of the holes
127 | MAX_H_WIDTH = 6 # Maximum width of the holes
128 | MIN_P_WIDTH = 2 # Minimum Width of the player
129 | MAX_P_WIDTH = 6 # Maximum Width of the player
130 | HEIGHT_MUL = 30 # Height Multiplier (used to draw np.array as blocks in pygame )
131 | WIDTH_MUL = 40 # Width Multiplier (used to draw np.array as blocks in pygame )
132 | WINDOW_HEIGHT = (F_HEIGHT+1) * HEIGHT_MUL # Height of the pygame window
133 |     WINDOW_WIDTH  = (WIDTH) * WIDTH_MUL       # Width of the pygame window
134 |
135 | ENVIRONMENT_SHAPE = (F_HEIGHT,WIDTH,1)
136 | ACTION_SPACE = [0,1,2,3,4]
137 | ACTION_SPACE_SIZE = len(ACTION_SPACE)
138 | PUNISHMENT = -100 # Punishment increment
139 | REWARD = 10 # Reward increment
140 | score = 0 # Initial Score
141 |
142 | MOVE_WALL_EVERY = 4 # Every how many frames the wall moves.
143 | MOVE_PLAYER_EVERY = 1 # Every how many frames the player moves.
144 | frames_counter = 0
145 |
146 | def __init__(self):
147 | # Colors:
148 | self.BLACK = (25,25,25)
149 | self.WHITE = (255,255,255)
150 | self.RED = (255, 80, 80)
151 | self.BLUE = (80, 80, 255)
152 | self.field = self.walls = self.player = None
153 | self.current_state = self.reset()
154 | self.val2color = {0:self.WHITE, self.walls[0].body_unit:self.BLACK,
155 | self.player.body_unit:self.BLACK, self.MAX_VAL:self.RED}
156 | def reset(self):
157 | self.score = 0
158 | self.frames_counter = 0
159 | self.game_over = False
160 |
161 | self.field = Field(height=self.F_HEIGHT, width=self.WIDTH )
162 | w1 = Wall( height = self.W_HEIGHT, width=self.WIDTH,
163 | hole_width = randint(self.MIN_H_WIDTH,self.MAX_H_WIDTH),
164 | field = self.field)
165 | self.walls = deque([w1])
166 | p_width = randint(self.MIN_P_WIDTH,self.MAX_P_WIDTH)
167 | self.player = Player( height = self.P_HEIGHT, max_width = self.WIDTH,
168 | width = p_width,
169 | x = randint(0,self.field.width-p_width),
170 | y = int(self.field.height*0.7), speed = 1)
171 | self.MAX_VAL = self.player.body_unit + w1.body_unit
172 | # Update the field :
173 | self.field.update_field(self.walls, self.player)
174 |
175 | observation = self.field.body/self.MAX_VAL
176 | return observation
177 | def print_text(self, WINDOW = None, text_cords = (0,0), center = False,
178 | text = "", color = (0,0,0), size = 32):
179 | pygame.init()
180 | font = pygame.font.Font('freesansbold.ttf', size)
181 | text_to_print = font.render(text, True, color)
182 | textRect = text_to_print.get_rect()
183 | if center:
184 | textRect.center = text_cords
185 | else:
186 | textRect.x = text_cords[0]
187 | textRect.y = text_cords[1]
188 | WINDOW.blit(text_to_print, textRect)
189 |
190 | def step(self, action):
191 | global score_increased
192 |
193 | self.frames_counter += 1
194 | reward = 0
195 |
196 | # If the performed action is (move) then player.move method is called:
197 | if action in [1,2]:
198 | self.player.move(direction = action, field = self.field)
199 | # If the performed action is (change_width) then player.change_width method is called:
200 | if action in [3,4]:
201 | self.player.change_width(action = action)
202 |
203 | # Move the wall one step (one step every MOVE_WALL_EVERY frames):
204 | if self.frames_counter % self.MOVE_WALL_EVERY == 0:
205 | # move the wall one step
206 | self.walls[-1].move()
207 | # reset the frames counter
208 | self.frames_counter = 0
209 |
210 | # Update the field :
211 | self.field.update_field(self.walls, self.player)
212 |
213 | # If the player passed a wall successfully increase the reward +1
214 | if ((self.walls[-1].y) == (self.player.y + self.player.height)) and not score_increased :
215 | reward += self.REWARD
216 | self.score += self.REWARD
217 |
218 | # Increase player's stamina every time it passed a wall successfully
219 | self.player.stamina = min(self.player.max_stamina, self.player.stamina+10)
220 | # score_increased : a flag to make sure that reward increases once per wall
221 | score_increased = True
222 |
223 |
224 | # Lose Conditions :
225 | # C1 : The player hits a wall
226 | # C2 : Player's width was far thinner than hole's width
227 | # C3 : Player fully consumed its stamina (energy)
228 | lose_conds = [self.MAX_VAL in self.field.body,
229 | ((self.player.y == self.walls[-1].y) and (self.player.width < (self.walls[-1].hole_width-1))),
230 | self.player.stamina <=0]
231 |
232 |
233 | # If one lose condition or more happend, the game ends:
234 | if True in lose_conds:
235 | self.game_over = True
236 | reward = self.PUNISHMENT
237 | return self.field.body/self.MAX_VAL, reward, self.game_over
238 |
239 | # Check if a wall moved out of the scene:
240 | if self.walls[-1].out_of_range:
241 | # Create a new wall
242 | self.walls[-1] = Wall( height = self.W_HEIGHT, width = self.WIDTH,
243 | hole_width = randint(self.MIN_H_WIDTH,self.MAX_H_WIDTH),
244 | field = self.field)
245 |
246 | score_increased = False
247 |
248 |
249 | # Return New Observation , reward, game_over(bool)
250 | return self.field.body/self.MAX_VAL, reward, self.game_over
251 |
252 | def render(self, WINDOW = None, human=False):
253 | if human:
254 | ################ Check Actions #####################
255 | action = 0
256 | events = pygame.event.get()
257 | for event in events:
258 | if event.type == pygame.QUIT:
259 | self.game_over = True
260 | if event.type == pygame.KEYDOWN:
261 | if event.key == pygame.K_LEFT:
262 | action = 1
263 | if event.key == pygame.K_RIGHT:
264 | action = 2
265 |
266 | if event.key == pygame.K_UP:
267 | action = 4
268 | if event.key == pygame.K_DOWN:
269 | action = 3
270 | ################## Step ############################
271 | _,reward, self.game_over = self.step(action)
272 | ################ Draw Environment ###################
273 | WINDOW.fill(self.WHITE)
274 | self.field.update_field(self.walls, self.player)
275 | for r in range(self.field.body.shape[0]):
276 | for c in range(self.field.body.shape[1]):
277 | pygame.draw.rect(WINDOW,
278 | self.val2color[self.field.body[r][c]],
279 | (c*self.WIDTH_MUL, r*self.HEIGHT_MUL, self.WIDTH_MUL, self.HEIGHT_MUL))
280 |
281 | self.print_text(WINDOW = WINDOW, text_cords = (self.WINDOW_WIDTH // 2, int(self.WINDOW_HEIGHT*0.1)),
282 | text = str(self.score), color = self.RED, center = True)
283 | self.print_text(WINDOW = WINDOW, text_cords = (0, int(self.WINDOW_HEIGHT*0.9)),
284 | text = str(self.player.stamina), color = self.RED)
285 |
286 | pygame.display.update()
287 | ######################################################################################
288 | class ModifiedTensorBoard(TensorBoard):
289 | # Overriding init to set initial step and writer (we want one log file for all .fit() calls)
290 | def __init__(self, name, **kwargs):
291 | super().__init__(**kwargs)
292 | self.step = 1
293 | self.writer = tf.summary.create_file_writer(self.log_dir)
294 | self._log_write_dir = os.path.join(self.log_dir, name)
295 |
296 | # Overriding this method to stop creating default log writer
297 | def set_model(self, model):
298 | pass
299 |
300 | # Overrided, saves logs with our step number
301 | # (otherwise every .fit() will start writing from 0th step)
302 | def on_epoch_end(self, epoch, logs=None):
303 | self.update_stats(**logs)
304 |
305 | # Overrided
306 | # We train for one batch only, no need to save anything at epoch end
307 | def on_batch_end(self, batch, logs=None):
308 | pass
309 |
310 | # Overrided, so won't close writer
311 | def on_train_end(self, _):
312 | pass
313 |
314 | def on_train_batch_end(self, batch, logs=None):
315 | pass
316 |
317 | # Custom method for saving own metrics
318 | # Creates writer, writes custom metrics and closes writer
319 | def update_stats(self, **stats):
320 | self._write_logs(stats, self.step)
321 |
322 | def _write_logs(self, logs, index):
323 | with self.writer.as_default():
324 | for name, value in logs.items():
325 | tf.summary.scalar(name, value, step=index)
326 | self.step += 1
327 | self.writer.flush()
328 | ######################################################################################
329 | # Agent class
330 | class DQNAgent:
331 | def __init__(self, name, env, conv_list, dense_list, util_list):
332 | self.env = env
333 | self.conv_list = conv_list
334 | self.dense_list = dense_list
335 |         self.name = str(name) + " | " + "".join(str(c) + "C | " for c in conv_list) + "".join(str(d) + "D | " for d in dense_list) + "".join(u + " | " for u in util_list)
336 |
337 | # Main model
338 | self.model = self.create_model(self.conv_list, self.dense_list)
339 |
340 | # Target network
341 | self.target_model = self.create_model(self.conv_list, self.dense_list)
342 | self.target_model.set_weights(self.model.get_weights())
343 |
344 | # An array with last n steps for training
345 | self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)
346 |
347 | # Custom tensorboard object
348 | self.tensorboard = ModifiedTensorBoard(name, log_dir="{}logs/{}-{}".format(PATH, name, int(time.time())))
349 |
350 | # Used to count when to update target network with main network's weights
351 | self.target_update_counter = 0
352 |
353 |
354 | # Creates a convolutional block given (filters) number of filters, (dropout) dropout rate,
355 | # (bn) a boolean variable indecating the use of BatchNormalization,
356 | # (pool) a boolean variable indecating the use of MaxPooling2D
357 | def conv_block(self, inp, filters=64, bn=True, pool=True, dropout = 0.2):
358 | _ = Conv2D(filters=filters, kernel_size=3, activation='relu')(inp)
359 | if bn:
360 | _ = BatchNormalization()(_)
361 | if pool:
362 | _ = MaxPooling2D(pool_size=(2, 2))(_)
363 | if dropout > 0:
364 |             _ = Dropout(dropout)(_)
365 | return _
366 | # Creates the model with the given specifications:
367 | def create_model(self, conv_list, dense_list):
368 | # Defines the input layer with shape = ENVIRONMENT_SHAPE
369 | input_layer = Input(shape=self.env.ENVIRONMENT_SHAPE)
370 | # Defines the first convolutional block:
371 | _ = self.conv_block(input_layer, filters=conv_list[0], bn=False, pool=False)
372 | # If number of convolutional layers is 2 or more, use a loop to create them.
373 | if len(conv_list)>1:
374 | for c in conv_list[1:]:
375 | _ = self.conv_block(_, filters=c)
376 | # Flatten the output of the last convolutional layer.
377 | _ = Flatten()(_)
378 |
379 | # Creating the dense layers:
380 | for d in dense_list:
381 | _ = Dense(units=d, activation='relu')(_)
382 | # The output layer has 5 nodes (one node per action)
383 | output = Dense(units=self.env.ACTION_SPACE_SIZE,
384 | activation='linear', name='output')(_)
385 |
386 | # Put it all together:
387 | model = Model(inputs=input_layer, outputs=[output])
388 | model.compile(optimizer=Adam(lr=0.001),
389 | loss={'output': 'mse'},
390 | metrics={'output': 'accuracy'})
391 |
392 | return model
393 |
394 | # Adds step's data to a memory replay array
395 | # (observation space, action, reward, new observation space, done)
396 | def update_replay_memory(self, transition):
397 | self.replay_memory.append(transition)
398 |
399 | # Trains main network every step during episode
400 | def train(self, terminal_state, step):
401 | # Start training only if certain number of samples is already saved
402 | if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
403 | return
404 |
405 | # Get a minibatch of random samples from memory replay table
406 | minibatch = random.sample(self.replay_memory, MINIBATCH_SIZE)
407 |
408 | # Get current states from minibatch, then query NN model for Q values
409 | current_states = np.array([transition[0] for transition in minibatch])
410 | current_qs_list = self.model.predict(current_states.reshape(-1, *env.ENVIRONMENT_SHAPE))
411 |
412 |
413 | # Get future states from minibatch, then query NN model for Q values
414 | # When using target network, query it, otherwise main network should be queried
415 | new_current_states = np.array([transition[3] for transition in minibatch])
416 | future_qs_list = self.target_model.predict(new_current_states.reshape(-1, *env.ENVIRONMENT_SHAPE))
417 |
418 | X = []
419 | y = []
420 |
421 | # Now we need to enumerate our batches
422 | for index, (current_state, action, reward, new_current_state, done) in enumerate(minibatch):
423 |
424 | # If not a terminal state, get new q from future states, otherwise set it to 0
425 | # almost like with Q Learning, but we use just part of equation here
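            # i.e. the standard Q-learning target: new_q = reward + DISCOUNT * max(Q_target(new_state))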
426 | if not done:
427 | max_future_q = np.max(future_qs_list[index])
428 | new_q = reward + DISCOUNT * max_future_q
429 | else:
430 | new_q = reward
431 |
432 | # Update Q value for given state
433 | current_qs = current_qs_list[index]
434 | current_qs[action] = new_q
435 |
436 | # And append to our training data
437 | X.append(current_state)
438 | y.append(current_qs)
439 |
440 |
441 | # Fit on all samples as one batch, log only on terminal state
442 | self.model.fit(x = np.array(X).reshape(-1, *env.ENVIRONMENT_SHAPE),
443 | y = np.array(y),
444 | batch_size = MINIBATCH_SIZE, verbose = 0,
445 | shuffle=False, callbacks=[self.tensorboard] if terminal_state else None)
446 |
447 | # Update target network counter every episode
448 | if terminal_state:
449 | self.target_update_counter += 1
450 |
451 | # If counter reaches set value, update target network with weights of main network
452 | if self.target_update_counter > UPDATE_TARGET_EVERY:
453 | self.target_model.set_weights(self.model.get_weights())
454 | self.target_update_counter = 0
455 |
456 | # Queries main network for Q values given current observation space (environment state)
457 | def get_qs(self, state):
458 | return self.model.predict(state.reshape(-1, *env.ENVIRONMENT_SHAPE))
459 | ######################################################################################
460 | def save_model_and_weights(agent, model_name, episode, max_reward, average_reward, min_reward):
461 | checkpoint_name = f"{model_name}| Eps({episode}) | max({max_reward:_>7.2f}) | avg({average_reward:_>7.2f}) | min({min_reward:_>7.2f}).model"
462 | agent.model.save(f'{PATH}models/{checkpoint_name}')
463 | best_weights = agent.model.get_weights()
464 | return best_weights
465 | ######################################################################################
466 | # ## Constants:
467 | # RL Constants:
468 | DISCOUNT = 0.99
469 | REPLAY_MEMORY_SIZE = 3_000 # How many last steps to keep for model training
470 | MIN_REPLAY_MEMORY_SIZE = 1_000 # Minimum number of steps in a memory to start training
471 | UPDATE_TARGET_EVERY = 20 # Terminal states (end of episodes)
472 | MIN_REWARD = 1000 # For model save
473 | SAVE_MODEL_EVERY = 1000 # Episodes
474 | SHOW_EVERY = 20 # Episodes
475 | EPISODES = 100 # Number of episodes
476 | # Stats settings
477 | AGGREGATE_STATS_EVERY = 20 # episodes
478 | SHOW_PREVIEW = False
479 | ######################################################################################
480 | # Models Arch :
481 | # [{[conv_list], [dense_list], [util_list], MINIBATCH_SIZE, {EF_Settings}, {ECC_Settings}} ]
482 |
483 | models_arch = [ {"conv_list":[32], "dense_list":[32,32], "util_list":["ECC2", "1A-5Ac"],
484 | "MINIBATCH_SIZE":128, "best_only":False,
485 | "EF_Settings":{"EF_Enabled":False}, "ECC_Settings":{"ECC_Enabled":False}},
486 |
487 | {"conv_list":[32], "dense_list":[32,32,32], "util_list":["ECC2", "1A-5Ac"],
488 | "MINIBATCH_SIZE":128, "best_only":False,
489 | "EF_Settings":{"EF_Enabled":False}, "ECC_Settings":{"ECC_Enabled":False}},
490 |
491 | {"conv_list":[32], "dense_list":[32,32], "util_list":["ECC2", "1A-5Ac"],
492 | "MINIBATCH_SIZE":128, "best_only":False,
493 | "EF_Settings":{"EF_Enabled":True, "FLUCTUATIONS":2},
494 | "ECC_Settings":{"ECC_Enabled":True, "MAX_EPS_NO_INC":int(EPISODES*0.2)}}]
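# Each dict above is one grid-search candidate: "conv_list"/"dense_list" give the layer sizes,
# "EF" (Epsilon Fluctuation) periodically resets epsilon back to MAX_EPSILON, and "ECC"
# (Epsilon Conditional Constantation) restores the epsilon of the last best average after
# MAX_EPS_NO_INC episodes without improvement (both are handled inside the training loop below).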
495 |
496 | # A dataframe used to store grid search results
497 | res = pd.DataFrame(columns = ["Model Name","Convolution Layers", "Dense Layers", "Batch Size", "ECC", "EF",
498 | "Best Only" , "Average Reward", "Best Average", "Epsilon 4 Best Average",
499 | "Best Average On", "Max Reward", "Epsilon 4 Max Reward", "Max Reward On",
500 | "Total Training Time (min)", "Time Per Episode (sec)"])
501 | ######################################################################################
502 | # Grid Search:
503 | for i, m in enumerate(models_arch):
504 | startTime = time.time() # Used to count episode training time
505 | MINIBATCH_SIZE = m["MINIBATCH_SIZE"]
506 |
507 | # Exploration settings :
508 | # Epsilon Fluctuation (EF):
509 | EF_Enabled = m["EF_Settings"]["EF_Enabled"] # Enable Epsilon Fluctuation
510 | MAX_EPSILON = 1 # Maximum epsilon value
511 | MIN_EPSILON = 0.001 # Minimum epsilon value
512 | if EF_Enabled:
513 | FLUCTUATIONS = m["EF_Settings"]["FLUCTUATIONS"] # How many times epsilon will fluctuate
514 | FLUCTUATE_EVERY = int(EPISODES/FLUCTUATIONS) # Episodes
515 | EPSILON_DECAY = MAX_EPSILON - (MAX_EPSILON/FLUCTUATE_EVERY)
516 | epsilon = 1 # not a constant, going to be decayed
517 | else:
518 | EPSILON_DECAY = MAX_EPSILON - (MAX_EPSILON/(0.8*EPISODES))
519 | epsilon = 1 # not a constant, going to be decayed
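    # In both branches EPSILON_DECAY is slightly below 1 (e.g. 1 - 1/(0.8*EPISODES) = 0.9875
    # for EPISODES = 100); epsilon is multiplied by it every episode, i.e. geometric decay.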
520 |
521 | # Initialize some variables:
522 |     best_average = average_reward = min_reward = max_reward = -100  # pre-seed episode stats so later comparisons never hit a NameError
523 | best_score = -100
524 |
525 | # Epsilon Conditional Constantation (ECC):
526 | ECC_Enabled = m["ECC_Settings"]["ECC_Enabled"]
527 | avg_reward_info = [[1, best_average, epsilon]] # [[episode1, reward1 , epsilon1] ... [episode_n, reward_n , epsilon_n]]
528 | max_reward_info = [[1, best_score , epsilon]]
529 | if ECC_Enabled : MAX_EPS_NO_INC = m["ECC_Settings"]["MAX_EPS_NO_INC"] # Maximum number of episodes without any increment in reward average
530 | eps_no_inc_counter = 0 # Counts episodes with no increment in reward
531 |
532 |
533 | # For stats
534 | ep_rewards = [best_average]
535 |
536 |
537 |
538 | env = Environment()
539 | env.MOVE_WALL_EVERY = 1 # Every how many frames the wall moves.
540 |
541 |
542 | agent = DQNAgent(f"M{i}", env, m["conv_list"], m["dense_list"], m["util_list"])
543 | MODEL_NAME = agent.name
544 |
545 |
546 | best_weights = [agent.model.get_weights()]
547 |
548 | # Uncomment these two lines if you want to show preview on your screen
549 | # WINDOW = pygame.display.set_mode((env.WINDOW_WIDTH, env.WINDOW_HEIGHT))
550 | # clock = pygame.time.Clock()
551 |
552 | # Iterate over episodes
553 | for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit='episodes'):
554 | if m["best_only"]: agent.model.set_weights(best_weights[0])
555 | # agent.target_model.set_weights(best_weights[0])
556 |
557 | score_increased = False
558 | # Update tensorboard step every episode
559 | agent.tensorboard.step = episode
560 |
561 | # Restarting episode - reset episode reward and step number
562 | episode_reward = 0
563 | step = 1
564 | action = 0
565 | # Reset environment and get initial state
566 | current_state = env.reset()
567 | game_over = env.game_over
568 | while not game_over:
569 | # This part stays mostly the same, the change is to query a model for Q values
570 | if np.random.random() > epsilon:
571 | # Get action from Q table
572 | action = np.argmax(agent.get_qs(current_state))
573 |
574 | else:
575 | # Get random action
576 | action = choice(env.ACTION_SPACE)
577 |
578 | new_state, reward, game_over = env.step(action)
579 |
580 | # Transform new continuous state to new discrete state and count reward
581 | episode_reward += reward
582 |
583 | # Uncomment the next block if you want to show preview on your screen
584 | # if SHOW_PREVIEW and not episode % SHOW_EVERY:
585 | # clock.tick(27)
586 | # env.render(WINDOW)
587 |
588 | # Every step we update replay memory and train main network
589 | agent.update_replay_memory((current_state, action, reward, new_state, game_over))
590 | agent.train(game_over, step)
591 |
592 | current_state = new_state
593 | step += 1
594 |
595 | if ECC_Enabled : eps_no_inc_counter += 1
596 | # Append episode reward to a list and log stats (every given number of episodes)
597 | ep_rewards.append(episode_reward)
598 |
599 | if not episode % AGGREGATE_STATS_EVERY:
600 | average_reward = sum(ep_rewards[-AGGREGATE_STATS_EVERY:])/len(ep_rewards[-AGGREGATE_STATS_EVERY:])
601 | min_reward = min(ep_rewards[-AGGREGATE_STATS_EVERY:])
602 | max_reward = max(ep_rewards[-AGGREGATE_STATS_EVERY:])
603 | agent.tensorboard.update_stats(reward_avg=average_reward, reward_min=min_reward, reward_max=max_reward, epsilon=epsilon)
604 |
605 |         # Save a model checkpoint every SAVE_MODEL_EVERY episodes:
606 | if not episode % SAVE_MODEL_EVERY:
607 | # Save Agent :
608 | _ = save_model_and_weights(agent, MODEL_NAME, episode, max_reward, average_reward, min_reward)
609 |
610 |
611 | if average_reward > best_average:
612 | best_average = average_reward
613 | # update ECC variables:
614 | avg_reward_info.append([episode, best_average, epsilon])
615 | eps_no_inc_counter = 0
616 | # Save Agent :
617 | best_weights[0] = save_model_and_weights(agent, MODEL_NAME, episode, max_reward, average_reward, min_reward)
618 |
619 | if ECC_Enabled and eps_no_inc_counter >= MAX_EPS_NO_INC:
620 | epsilon = avg_reward_info[-1][2] # Get epsilon value of the last best reward
621 | eps_no_inc_counter = 0
622 |
623 | if episode_reward > best_score:
624 | try:
625 | best_score = episode_reward
626 | max_reward_info.append([episode, best_score, epsilon])
627 |
628 | # Save Agent :
629 | best_weights[0] = save_model_and_weights(agent, MODEL_NAME, episode, max_reward, average_reward, min_reward)
630 |
631 | except:
632 | pass
633 |
634 | # Decay epsilon
635 | if epsilon > MIN_EPSILON:
636 | epsilon *= EPSILON_DECAY
637 | epsilon = max(MIN_EPSILON, epsilon)
638 |
639 | # Epsilon Fluctuation:
640 | if EF_Enabled:
641 | if not episode % FLUCTUATE_EVERY:
642 | epsilon = MAX_EPSILON
643 |
644 | endTime = time.time()
645 | total_train_time_sec = round((endTime - startTime))
646 | total_train_time_min = round((endTime - startTime)/60,2)
647 | time_per_episode_sec = round((total_train_time_sec)/EPISODES,3)
648 |
649 | # Get Average reward:
650 | average_reward = round(sum(ep_rewards)/len(ep_rewards), 2)
651 |
652 | # Update Results DataFrames:
653 | res = res.append({"Model Name":MODEL_NAME, "Convolution Layers":m["conv_list"], "Dense Layers":m["dense_list"],
654 | "Batch Size":m["MINIBATCH_SIZE"], "ECC":m["ECC_Settings"], "EF":m["EF_Settings"],
655 | "Best Only":m["best_only"], "Average Reward":average_reward,
656 | "Best Average":avg_reward_info[-1][1], "Epsilon 4 Best Average":avg_reward_info[-1][2],
657 | "Best Average On":avg_reward_info[-1][0], "Max Reward":max_reward_info[-1][1],
658 | "Epsilon 4 Max Reward":max_reward_info[-1][2], "Max Reward On":max_reward_info[-1][0],
659 | "Total Training Time (min)":total_train_time_min, "Time Per Episode (sec)":time_per_episode_sec}
660 | , ignore_index=True)
661 | res = res.sort_values(by = 'Best Average')
662 | avg_df = pd.DataFrame(data = avg_reward_info, columns=["Episode", "Average Reward", "Epsilon"])
663 | max_df = pd.DataFrame(data = max_reward_info, columns=["Episode", "Max Reward", "Epsilon"])
664 |
665 | # Save dataFrames
666 | res.to_csv(f"{PATH}results/Results.csv")
667 | avg_df.to_csv(f"{PATH}results/{MODEL_NAME}-Results-Avg.csv")
668 | max_df.to_csv(f"{PATH}results/{MODEL_NAME}-Results-Max.csv")
669 |
670 | TendTime = time.time()
671 | ######################################################################################
672 | print( f"Training took {round((TendTime - TstartTime)/60, 2) } Minutes ")
673 | print( f"Training took {round((TendTime - TstartTime)/3600, 2) } Hours ")
674 | ######################################################################################
675 |
--------------------------------------------------------------------------------
/images/EnvExp.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModMaamari/reinforcement-learning-using-python/fd535079d7ca95be856af9a505b327d4350cc0f0/images/EnvExp.jpg
--------------------------------------------------------------------------------
/images/gifs/EnvPlayed.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModMaamari/reinforcement-learning-using-python/fd535079d7ca95be856af9a505b327d4350cc0f0/images/gifs/EnvPlayed.gif
--------------------------------------------------------------------------------
/images/gifs/envExp.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModMaamari/reinforcement-learning-using-python/fd535079d7ca95be856af9a505b327d4350cc0f0/images/gifs/envExp.gif
--------------------------------------------------------------------------------
/images/wall.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModMaamari/reinforcement-learning-using-python/fd535079d7ca95be856af9a505b327d4350cc0f0/images/wall.jpg
--------------------------------------------------------------------------------