├── LICENSE ├── PyGame-Learning-Environment └── ple │ └── games │ └── flappybird │ ├── __init__.py │ └── assets │ ├── blackpaddle-down.png │ ├── blackpaddle-up.png │ ├── bluepaddle-down.png │ ├── bluepaddle-up.png │ ├── redpaddle-down.png │ └── redpaddle-up.png ├── README.md ├── dqn2.py ├── dqn3.py ├── dqnconcat.py ├── img ├── 1.png ├── 2.png ├── 3.png ├── 4.png ├── 5.png ├── 6.png └── 7.gif ├── model3_8400_1256.2.ckpt ├── modelconcat_30900_157.6.ckpt ├── modelsm_16400_95.8.ckpt ├── requirement.txt └── run3Agents.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 ninetailskim 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /PyGame-Learning-Environment/ple/games/flappybird/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | 5 | import pygame 6 | from pygame.constants import K_w 7 | from .. 
import base 8 | 9 | 10 | class BirdPlayer(pygame.sprite.Sprite): 11 | 12 | def __init__(self, 13 | SCREEN_WIDTH, SCREEN_HEIGHT, init_pos, 14 | image_assets, rng, color="red", scale=1.0): 15 | 16 | self.SCREEN_WIDTH = SCREEN_WIDTH 17 | self.SCREEN_HEIGHT = SCREEN_HEIGHT 18 | 19 | self.image_order = [0, 1, 2, 1] 20 | # done image stuff 21 | 22 | pygame.sprite.Sprite.__init__(self) 23 | 24 | self.image_assets = image_assets 25 | 26 | self.init(init_pos, color) 27 | 28 | self.height = self.image.get_height() 29 | self.scale = scale 30 | 31 | # all in terms of y 32 | self.vel = 0 33 | self.FLAP_POWER = 9 * self.scale 34 | self.MAX_DROP_SPEED = 10.0 35 | self.GRAVITY = 1.0 * self.scale 36 | 37 | self.rng = rng 38 | 39 | self._oscillateStartPos() # makes the direction and position random 40 | self.rect.center = (self.pos_x, self.pos_y) # could be done better 41 | 42 | self.lives = 1 43 | self.score = 0 44 | 45 | def init(self, init_pos, color): 46 | # set up the surface we draw the bird too 47 | self.flapped = True # start off w/ a flap 48 | self.current_image = 0 49 | self.color = color 50 | self.image = self.image_assets[self.color][self.current_image] 51 | self.rect = self.image.get_rect() 52 | self.thrust_time = 0.0 53 | self.game_tick = 0 54 | self.pos_x = init_pos[0] 55 | self.pos_y = init_pos[1] 56 | self.lives = 1 57 | self.score = 0 58 | 59 | def _oscillateStartPos(self): 60 | offset = 8 * np.sin(self.rng.rand() * np.pi) 61 | self.pos_y += offset 62 | 63 | def flap(self): 64 | if self.pos_y > -2.0 * self.image.get_height(): 65 | self.vel = 0.0 66 | self.flapped = True 67 | 68 | def update(self, dt): 69 | self.game_tick += 1 70 | 71 | # image cycle 72 | if (self.game_tick + 1) % 15 == 0: 73 | self.current_image += 1 74 | 75 | if self.current_image >= 2: 76 | self.current_image = 0 77 | 78 | # set the image to draw with. 79 | self.image = self.image_assets[self.color][self.current_image] 80 | self.rect = self.image.get_rect() 81 | 82 | if self.vel < self.MAX_DROP_SPEED and self.thrust_time == 0.0: 83 | self.vel += self.GRAVITY 84 | 85 | # the whole point is to spread this out over the same time it takes in 86 | # 30fps. 
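# (FLAP_POWER is scaled by 30 / fps and is applied on every game tick while thrust_time accumulates up to one 30 fps frame, so the total upward impulse of a flap stays the same at any frame rate.)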
87 | if self.thrust_time + dt <= (1.0 / 30.0) and self.flapped: 88 | self.thrust_time += dt 89 | self.vel += -1.0 * self.FLAP_POWER 90 | else: 91 | self.thrust_time = 0.0 92 | self.flapped = False 93 | 94 | self.pos_y += self.vel 95 | self.rect.center = (self.pos_x, self.pos_y) 96 | 97 | def draw(self, screen): 98 | screen.blit(self.image, self.rect.center) 99 | 100 | 101 | class Pipe(pygame.sprite.Sprite): 102 | 103 | def __init__(self, 104 | SCREEN_WIDTH, SCREEN_HEIGHT, gap_start, gap_size, image_assets, scale, 105 | offset=0, color="green"): 106 | 107 | self.speed = 4.0 * scale 108 | self.SCREEN_WIDTH = SCREEN_WIDTH 109 | self.SCREEN_HEIGHT = SCREEN_HEIGHT 110 | 111 | self.image_assets = image_assets 112 | # done image stuff 113 | 114 | self.width = self.image_assets["green"]["lower"].get_width() 115 | pygame.sprite.Sprite.__init__(self) 116 | 117 | self.image = pygame.Surface((self.width, self.SCREEN_HEIGHT)) 118 | self.image.set_colorkey((0, 0, 0)) 119 | 120 | self.init(gap_start, gap_size, offset, color) 121 | 122 | def init(self, gap_start, gap_size, offset, color): 123 | self.image.fill((0, 0, 0)) 124 | self.gap_start = gap_start 125 | self.x = self.SCREEN_WIDTH + self.width + offset 126 | 127 | self.lower_pipe = self.image_assets[color]["lower"] 128 | self.upper_pipe = self.image_assets[color]["upper"] 129 | 130 | top_bottom = gap_start - self.upper_pipe.get_height() 131 | bottom_top = gap_start + gap_size 132 | 133 | self.image.blit(self.upper_pipe, (0, top_bottom)) 134 | self.image.blit(self.lower_pipe, (0, bottom_top)) 135 | 136 | self.rect = self.image.get_rect() 137 | self.rect.center = (self.x, self.SCREEN_HEIGHT / 2) 138 | 139 | def update(self, dt): 140 | self.x -= self.speed 141 | self.rect.center = (self.x, self.SCREEN_HEIGHT / 2) 142 | 143 | 144 | class Backdrop(): 145 | 146 | def __init__(self, SCREEN_WIDTH, SCREEN_HEIGHT, 147 | image_background, image_base, scale): 148 | self.SCREEN_WIDTH = SCREEN_WIDTH 149 | self.SCREEN_HEIGHT = SCREEN_HEIGHT 150 | 151 | self.background_image = image_background 152 | self.base_image = image_base 153 | 154 | self.x = 0 155 | self.speed = 4.0 * scale 156 | self.max_move = self.base_image.get_width() - self.background_image.get_width() 157 | 158 | def update_draw_base(self, screen, dt): 159 | # the extra is on the right 160 | if self.x > -1 * self.max_move: 161 | self.x -= self.speed 162 | else: 163 | self.x = 0 164 | 165 | screen.blit(self.base_image, (self.x, self.SCREEN_HEIGHT * 0.79)) 166 | 167 | def draw_background(self, screen): 168 | screen.blit(self.background_image, (0, 0)) 169 | 170 | 171 | class FlappyBird(base.PyGameWrapper): 172 | """ 173 | Used physics values from sourabhv's `clone`_. 174 | 175 | .. _clone: https://github.com/sourabhv/FlapPyBird 176 | 177 | 178 | Parameters 179 | ---------- 180 | width : int (default: 288) 181 | Screen width. Consistent gameplay is not promised for different widths or heights, therefore the width and height should not be altered. 182 | 183 | height : inti (default: 512) 184 | Screen height. 185 | 186 | pipe_gap : int (default: 100) 187 | The gap in pixels left between the top and bottom pipes. 
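Note: this modified version spawns three birds (red, blue and black) that share the same pipes. The 8 actions form a 3-bit mask over the birds: bit 0 flaps player[0] (red), bit 1 flaps player[1] (blue) and bit 2 flaps player[2] (black), so e.g. action 5 ("_13") flaps the red and black birds together.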
188 | 189 | """ 190 | 191 | def __init__(self, width=288, height=512, pipe_gap=100): 192 | 193 | actions = { 194 | "_0": 0, 195 | "_1": 1, 196 | "_2": 2, 197 | "_12": 3, 198 | "_3": 4, 199 | "_13": 5, 200 | "_23": 6, 201 | "_123": 7, 202 | } 203 | 204 | fps = 30 205 | 206 | base.PyGameWrapper.__init__(self, width, height, actions=actions) 207 | 208 | self.scale = 30.0 / fps 209 | 210 | self.allowed_fps = 30 # restrict the fps 211 | 212 | self.pipe_gap = pipe_gap 213 | self.pipe_color = "red" 214 | self.images = {} 215 | 216 | # so we can preload images 217 | pygame.display.set_mode((1, 1), pygame.NOFRAME) 218 | 219 | self._dir_ = os.path.dirname(os.path.abspath(__file__)) 220 | self._asset_dir = os.path.join(self._dir_, "assets/") 221 | self._load_images() 222 | 223 | self.pipe_offsets = [0, self.width * 0.5, self.width] 224 | self.init_pos = ( 225 | int(self.width * 0.2), 226 | int(self.height / 2) 227 | ) 228 | 229 | self.pipe_min = int(self.pipe_gap / 4) 230 | self.pipe_max = int(self.height * 0.79 * 0.6 - self.pipe_gap / 2) 231 | 232 | self.backdrop = None 233 | self.player = None 234 | self.pipe_group = None 235 | 236 | def _load_images(self): 237 | # preload and convert all the images so its faster when we reset 238 | self.images["player"] = {} 239 | for c in ["red", "blue", "black"]: 240 | image_assets = [ 241 | os.path.join(self._asset_dir, "%spaddle-up.png" % c), 242 | os.path.join(self._asset_dir, "%spaddle-down.png" % c), 243 | ] 244 | 245 | self.images["player"][c] = [pygame.image.load( 246 | im).convert_alpha() for im in image_assets] 247 | 248 | self.images["background"] = {} 249 | for b in ["day", "night"]: 250 | path = os.path.join(self._asset_dir, "background-%s.png" % b) 251 | 252 | self.images["background"][b] = pygame.image.load(path).convert() 253 | 254 | self.images["pipes"] = {} 255 | for c in ["red", "green"]: 256 | path = os.path.join(self._asset_dir, "pipe-%s.png" % c) 257 | 258 | self.images["pipes"][c] = {} 259 | self.images["pipes"][c]["lower"] = pygame.image.load( 260 | path).convert_alpha() 261 | self.images["pipes"][c]["upper"] = pygame.transform.rotate( 262 | self.images["pipes"][c]["lower"], 180) 263 | 264 | path = os.path.join(self._asset_dir, "base.png") 265 | self.images["base"] = pygame.image.load(path).convert() 266 | 267 | def init(self): 268 | if self.backdrop is None: 269 | self.backdrop = Backdrop( 270 | self.width, 271 | self.height, 272 | self.images["background"]["day"], 273 | self.images["base"], 274 | self.scale 275 | ) 276 | 277 | if self.player is None: 278 | self.player = [BirdPlayer( 279 | self.width, 280 | self.height, 281 | self.init_pos, 282 | self.images["player"], 283 | self.rng, 284 | color=cc, 285 | scale=self.scale 286 | ) for cc in ["red", "blue", "black"]] 287 | 288 | if self.pipe_group is None: 289 | self.pipe_group = pygame.sprite.Group([ 290 | self._generatePipes(offset=-75), 291 | self._generatePipes(offset=-75 + self.width / 2), 292 | self._generatePipes(offset=-75 + self.width * 1.5) 293 | ]) 294 | 295 | color = self.rng.choice(["day", "night"]) 296 | self.backdrop.background_image = self.images["background"][color] 297 | 298 | # instead of recreating 299 | self.player[0].init(self.init_pos, "red") 300 | self.player[1].init(self.init_pos, "blue") 301 | self.player[2].init(self.init_pos, "black") 302 | 303 | self.pipe_color = self.rng.choice(["red", "green"]) 304 | for i, p in enumerate(self.pipe_group): 305 | self._generatePipes(offset=self.pipe_offsets[i], pipe=p) 306 | 307 | self.socre = 0.0 308 | self.lives = 1 309 | 
self.game_tick = 0 310 | 311 | # for pl in self.player: 312 | # print(pl.color) 313 | def getGameState(self): 314 | """ 315 | Gets a non-visual state representation of the game. 316 | 317 | Returns 318 | ------- 319 | 320 | dict 321 | * player y position. 322 | * players velocity. 323 | * next pipe distance to player 324 | * next pipe top y position 325 | * next pipe bottom y position 326 | * next next pipe distance to player 327 | * next next pipe top y position 328 | * next next pipe bottom y position 329 | 330 | 331 | See code for structure. 332 | 333 | """ 334 | pipes0 = [] 335 | for p in self.pipe_group: 336 | if p.x + p.width/2 > self.player[0].pos_x : 337 | pipes0.append((p, p.x + p.width/2 - self.player[0].pos_x )) 338 | 339 | pipes0.sort(key=lambda p: p[1]) 340 | 341 | next_pipe0 = pipes0[1][0] 342 | next_next_pipe0 = pipes0[0][0] 343 | 344 | if next_next_pipe0.x < next_pipe0.x: 345 | next_pipe0, next_next_pipe0 = next_next_pipe0, next_pipe0 346 | 347 | ##### 348 | pipes1 = [] 349 | for p in self.pipe_group: 350 | if p.x + p.width/2 > self.player[1].pos_x : 351 | pipes1.append((p, p.x + p.width/2 - self.player[1].pos_x )) 352 | 353 | pipes1.sort(key=lambda p: p[1]) 354 | 355 | next_pipe1 = pipes1[1][0] 356 | next_next_pipe1 = pipes1[0][0] 357 | 358 | if next_next_pipe1.x < next_pipe1.x: 359 | next_pipe1, next_next_pipe1 = next_next_pipe1, next_pipe1 360 | 361 | ##### 362 | pipes2 = [] 363 | for p in self.pipe_group: 364 | if p.x + p.width/2 > self.player[2].pos_x : 365 | pipes2.append((p, p.x + p.width/2 - self.player[2].pos_x )) 366 | 367 | pipes2.sort(key=lambda p: p[1]) 368 | 369 | next_pipe2 = pipes2[1][0] 370 | next_next_pipe2 = pipes2[0][0] 371 | 372 | if next_next_pipe2.x < next_pipe2.x: 373 | next_pipe2, next_next_pipe2 = next_next_pipe2, next_pipe2 374 | 375 | state = { 376 | "player0_y": self.player[0].pos_y, 377 | "player0_vel": self.player[0].vel, 378 | "next_pipe_dist_to_player0": next_pipe0.x + next_pipe0.width/2 - self.player[0].pos_x , 379 | "player0_next_pipe_top_y": next_pipe0.gap_start, 380 | "player0_next_pipe_bottom_y": next_pipe0.gap_start + self.pipe_gap, 381 | "next_next_pipe_dist_to_player0": next_next_pipe0.x + next_next_pipe0.width/2 - self.player[0].pos_x , 382 | "player0_next_next_pipe_top_y": next_next_pipe0.gap_start, 383 | "player0_next_next_pipe_bottom_y": next_next_pipe0.gap_start + self.pipe_gap, 384 | 385 | "player1_y": self.player[1].pos_y, 386 | "player1_vel": self.player[1].vel, 387 | "next_pipe_dist_to_player1": next_pipe1.x + next_pipe1.width/2 - self.player[1].pos_x , 388 | "player1_next_pipe_top_y": next_pipe1.gap_start, 389 | "player1_next_pipe_bottom_y": next_pipe1.gap_start + self.pipe_gap, 390 | "next_next_pipe_dist_to_player1": next_next_pipe1.x + next_next_pipe1.width/2 - self.player[1].pos_x , 391 | "player1_next_next_pipe_top_y": next_next_pipe1.gap_start, 392 | "player1_next_next_pipe_bottom_y": next_next_pipe1.gap_start + self.pipe_gap, 393 | 394 | "player2_y": self.player[2].pos_y, 395 | "player2_vel": self.player[2].vel, 396 | "next_pipe_dist_to_player2": next_pipe2.x + next_pipe2.width/2 - self.player[2].pos_x , 397 | "player2_next_pipe_top_y": next_pipe2.gap_start, 398 | "player2_next_pipe_bottom_y": next_pipe2.gap_start + self.pipe_gap, 399 | "next_next_pipe_dist_to_player2": next_next_pipe2.x + next_next_pipe2.width/2 - self.player[2].pos_x , 400 | "player2_next_next_pipe_top_y": next_next_pipe2.gap_start, 401 | "player2_next_next_pipe_bottom_y": next_next_pipe2.gap_start + self.pipe_gap 402 | } 403 | 404 | return state 
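# Note: the training scripts (dqn2.py / dqn3.py / dqnconcat.py) flatten this dict with list(env.getGameState().values()), so the observation vector is the 24 values above in insertion order: 8 features for each of the three players.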
405 | 406 | def getScore(self): 407 | return max(max(self.player[0].score,self.player[1].score), self.player[2].score) 408 | 409 | def _generatePipes(self, offset=0, pipe=None): 410 | start_gap = self.rng.random_integers( 411 | self.pipe_min, 412 | self.pipe_max 413 | ) 414 | 415 | if pipe is None: 416 | pipe = Pipe( 417 | self.width, 418 | self.height, 419 | start_gap, 420 | self.pipe_gap, 421 | self.images["pipes"], 422 | self.scale, 423 | color=self.pipe_color, 424 | offset=offset 425 | ) 426 | 427 | return pipe 428 | else: 429 | pipe.init(start_gap, self.pipe_gap, offset, self.pipe_color) 430 | 431 | def _handle_player_events(self): 432 | for event in pygame.event.get(): 433 | if event.type == pygame.QUIT: 434 | pygame.quit() 435 | sys.exit() 436 | 437 | if event.type == pygame.KEYDOWN: 438 | key = event.key 439 | #print("key: ",key) 440 | if key in [1, 3, 5, 7]: 441 | self.player[0].flap() 442 | if key in [2, 3, 6, 7]: 443 | self.player[1].flap() 444 | if key in [4, 5, 6, 7]: 445 | self.player[2].flap() 446 | 447 | 448 | def game_over(self): 449 | return self.player[0].lives + self.player[1].lives + self.player[2].lives <= 0 450 | 451 | def step(self, dt): 452 | self.game_tick += 1 453 | dt = dt / 1000.0 454 | 455 | self.player[0].score += self.rewards["tick"] 456 | self.player[1].score += self.rewards["tick"] 457 | self.player[2].score += self.rewards["tick"] 458 | 459 | # handle player movement 460 | self._handle_player_events() 461 | 462 | for p in self.pipe_group: 463 | if self.player[0].lives > 0: 464 | hit = pygame.sprite.spritecollide( 465 | self.player[0], self.pipe_group, False) 466 | 467 | is_in_pipe = (p.x - p.width/2 - 20) <= self.player[0].pos_x < (p.x + p.width/2) 468 | for h in hit: # do check to see if its within the gap. 469 | top_pipe_check = ( 470 | (self.player[0].pos_y - self.player[0].height/2 + 12) <= h.gap_start) and is_in_pipe 471 | bot_pipe_check = ( 472 | (self.player[0].pos_y + 473 | self.player[0].height) > h.gap_start + 474 | self.pipe_gap) and is_in_pipe 475 | 476 | if top_pipe_check: 477 | self.player[0].lives -= 1 478 | 479 | if bot_pipe_check: 480 | self.player[0].lives -= 1 481 | 482 | # is it past the player? 483 | if (p.x - p.width / 2) <= self.player[0].pos_x < (p.x - p.width / 2 + 4): 484 | self.player[0].score += self.rewards["positive"] 485 | 486 | # is out out of the screen? 487 | if p.x < -p.width: 488 | self._generatePipes(offset=self.width * 0.2, pipe=p) 489 | 490 | ############## 491 | if self.player[1].lives > 0: 492 | hit = pygame.sprite.spritecollide( 493 | self.player[1], self.pipe_group, False) 494 | 495 | is_in_pipe = (p.x - p.width/2 - 20) <= self.player[1].pos_x < (p.x + p.width/2) 496 | for h in hit: # do check to see if its within the gap. 497 | top_pipe_check = ( 498 | (self.player[1].pos_y - self.player[1].height/2 + 12) <= h.gap_start) and is_in_pipe 499 | bot_pipe_check = ( 500 | (self.player[1].pos_y + 501 | self.player[1].height) > h.gap_start + 502 | self.pipe_gap) and is_in_pipe 503 | 504 | if top_pipe_check: 505 | self.player[1].lives -= 1 506 | 507 | if bot_pipe_check: 508 | self.player[1].lives -= 1 509 | 510 | # is it past the player? 511 | if (p.x - p.width / 2) <= self.player[1].pos_x < (p.x - p.width / 2 + 4): 512 | self.player[1].score += self.rewards["positive"] 513 | 514 | # is out out of the screen? 
515 | if p.x < -p.width: 516 | self._generatePipes(offset=self.width * 0.2, pipe=p) 517 | 518 | ####################### 519 | if self.player[2].lives > 0: 520 | hit = pygame.sprite.spritecollide( 521 | self.player[2], self.pipe_group, False) 522 | 523 | is_in_pipe = (p.x - p.width/2 - 20) <= self.player[2].pos_x < (p.x + p.width/2) 524 | for h in hit: # do check to see if its within the gap. 525 | top_pipe_check = ( 526 | (self.player[2].pos_y - self.player[2].height/2 + 12) <= h.gap_start) and is_in_pipe 527 | bot_pipe_check = ( 528 | (self.player[2].pos_y + 529 | self.player[2].height) > h.gap_start + 530 | self.pipe_gap) and is_in_pipe 531 | 532 | if top_pipe_check: 533 | self.player[2].lives -= 1 534 | 535 | if bot_pipe_check: 536 | self.player[2].lives -= 1 537 | 538 | # is it past the player? 539 | if (p.x - p.width / 2) <= self.player[2].pos_x < (p.x - p.width / 2 + 4): 540 | self.player[2].score += self.rewards["positive"] 541 | 542 | # is out out of the screen? 543 | if p.x < -p.width: 544 | self._generatePipes(offset=self.width * 0.2, pipe=p) 545 | 546 | # fell on the ground 547 | drawback = True 548 | pgu = True 549 | if self.player[0].lives > 0: 550 | if self.player[0].pos_y >= 0.79 * self.height - self.player[0].height: 551 | self.player[0].lives -= 1 552 | 553 | # went above the screen 554 | if self.player[0].pos_y <= 0: 555 | self.player[0].lives -= 1 556 | 557 | self.player[0].update(dt) 558 | if pgu: 559 | self.pipe_group.update(dt) 560 | pgu = False 561 | 562 | if self.player[0].lives <= 0: 563 | self.player[0].score += self.rewards["loss"] 564 | if drawback: 565 | self.backdrop.draw_background(self.screen) 566 | self.pipe_group.draw(self.screen) 567 | self.backdrop.update_draw_base(self.screen, dt) 568 | drawback = False 569 | self.player[0].draw(self.screen) 570 | 571 | ############# 572 | # fell on the ground 573 | if self.player[1].lives > 0: 574 | if self.player[1].pos_y >= 0.79 * self.height - self.player[1].height: 575 | self.player[1].lives -= 1 576 | 577 | # went above the screen 578 | if self.player[1].pos_y <= 0: 579 | self.player[1].lives -= 1 580 | 581 | self.player[1].update(dt) 582 | if pgu: 583 | self.pipe_group.update(dt) 584 | pgu = False 585 | 586 | if self.player[1].lives <= 0: 587 | self.player[1].score += self.rewards["loss"] 588 | 589 | if drawback: 590 | self.backdrop.draw_background(self.screen) 591 | self.pipe_group.draw(self.screen) 592 | self.backdrop.update_draw_base(self.screen, dt) 593 | drawback = False 594 | self.player[1].draw(self.screen) 595 | 596 | ############## 597 | # fell on the ground 598 | if self.player[2].lives > 0: 599 | if self.player[2].pos_y >= 0.79 * self.height - self.player[2].height: 600 | self.player[2].lives -= 1 601 | 602 | # went above the screen 603 | if self.player[2].pos_y <= 0: 604 | self.player[2].lives -= 1 605 | 606 | self.player[2].update(dt) 607 | if pgu: 608 | self.pipe_group.update(dt) 609 | 610 | if self.player[2].lives <= 0: 611 | self.player[2].score += self.rewards["loss"] 612 | 613 | if drawback: 614 | self.backdrop.draw_background(self.screen) 615 | self.pipe_group.draw(self.screen) 616 | self.backdrop.update_draw_base(self.screen, dt) 617 | self.player[2].draw(self.screen) 618 | -------------------------------------------------------------------------------- /PyGame-Learning-Environment/ple/games/flappybird/assets/blackpaddle-down.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ninetailskim/FlappyPaddle/ebdd19ce1d364f91517523e1dfe1a40dd8ff117a/PyGame-Learning-Environment/ple/games/flappybird/assets/blackpaddle-down.png -------------------------------------------------------------------------------- /PyGame-Learning-Environment/ple/games/flappybird/assets/blackpaddle-up.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ninetailskim/FlappyPaddle/ebdd19ce1d364f91517523e1dfe1a40dd8ff117a/PyGame-Learning-Environment/ple/games/flappybird/assets/blackpaddle-up.png -------------------------------------------------------------------------------- /PyGame-Learning-Environment/ple/games/flappybird/assets/bluepaddle-down.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ninetailskim/FlappyPaddle/ebdd19ce1d364f91517523e1dfe1a40dd8ff117a/PyGame-Learning-Environment/ple/games/flappybird/assets/bluepaddle-down.png -------------------------------------------------------------------------------- /PyGame-Learning-Environment/ple/games/flappybird/assets/bluepaddle-up.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ninetailskim/FlappyPaddle/ebdd19ce1d364f91517523e1dfe1a40dd8ff117a/PyGame-Learning-Environment/ple/games/flappybird/assets/bluepaddle-up.png -------------------------------------------------------------------------------- /PyGame-Learning-Environment/ple/games/flappybird/assets/redpaddle-down.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ninetailskim/FlappyPaddle/ebdd19ce1d364f91517523e1dfe1a40dd8ff117a/PyGame-Learning-Environment/ple/games/flappybird/assets/redpaddle-down.png -------------------------------------------------------------------------------- /PyGame-Learning-Environment/ple/games/flappybird/assets/redpaddle-up.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ninetailskim/FlappyPaddle/ebdd19ce1d364f91517523e1dfe1a40dd8ff117a/PyGame-Learning-Environment/ple/games/flappybird/assets/redpaddle-up.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FlappyPaddle 2 | 3 | ![7](img/7.gif) 4 | 5 | Preview video here: 6 | 7 | [The 1st Flappy Paddle Tournament](https://www.bilibili.com/video/BV1KV411674k) (the match starts at 1:50) 8 | 9 | ## Three trained models 10 | 11 | 1. Two hidden layers (modelsm_16400_95.8.ckpt) 12 | 2. Three hidden layers (model3_8400_1256.2.ckpt) 13 | 3. Three hidden layers, with the obs of the last two frames concatenated directly as the input (modelconcat_30900_157.6.ckpt)
14 | 15 | ## Environment 16 | 17 | Note that parl must be 1.3.1 and pygame must be 1.9.6. 18 | 19 | The init of the flappy game has been modified so that three agents can run at the same time, so comparing several algorithms against each other in this environment is now possible too (see the PeopleVSRL branch if you want a leaner environment). 20 | 21 | Sprites for teams in three colors are provided. 22 | 23 | Everything above is provided in the environment's original folder layout. 24 | 25 | 26 | 27 | ## So you can also compete with me: edit run3Agents.py and add your own model, algorithm and agent; the code is easy to follow. 28 | 29 | 30 | 31 | ## [BUG]: 32 | 33 | For reasons unknown, the first agent was controlling the second team and the second agent the first team; still being investigated. 34 | 35 | [20200702] Not a bug: it is an image-channel mismatch between pygame and opencv, whose RGB orderings are presumably different. 36 | 37 | 38 | 39 | ## The 1st PaddlePaddle Cup Flappy Paddle Tournament 40 | 41 | Final results: red team 143 points, black team 125 points, blue team 1003 points. 42 | 43 | ![1](img/1.png) 44 | 45 | ![2](img/2.png) 46 | 47 | ![3](img/3.png) 48 | 49 | ![4](img/4.png) 50 | 51 | ![6](img/6.png) 52 | 53 | ![5](img/5.png) 54 | 55 | ## The PeopleVSRL branch supports a mode where a human player competes against the machine 56 | 57 | 58 | ## License 59 | This project is released under the Apache 2.0 license. -------------------------------------------------------------------------------- /dqn2.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.putenv('SDL_VIDEODRIVER', 'fbcon') 3 | os.environ["SDL_VIDEODRIVER"] = "dummy" 4 | 5 | from ple.games.flappybird import FlappyBird 6 | from ple import PLE 7 | import parl 8 | from parl import layers 9 | import paddle.fluid as fluid 10 | import copy 11 | import numpy as np 12 | import os 13 | import gym 14 | from parl.utils import logger 15 | from datetime import datetime 16 | import cv2 17 | 18 | LEARN_FREQ = 5 # 训练频率,不需要每一个step都learn,攒一些新增经验后再learn,提高效率 19 | MEMORY_SIZE = 200000 # replay memory的大小,越大越占用内存 20 | MEMORY_WARMUP_SIZE = 200 # replay_memory 里需要预存一些经验数据,再开启训练 21 | BATCH_SIZE = 32 # 每次给agent learn的数据数量,从replay memory随机里sample一批数据出来 22 | LEARNING_RATE = 0.001 # 学习率 23 | GAMMA = 0.99 24 | 25 | class Model(parl.Model): 26 | def __init__(self, act_dim): 27 | #hid0_size = 64 28 | hid1_size = 32 29 | hid2_size = 16 30 | # 3层全连接网络 31 | #self.fc0 = layers.fc(size=hid0_size, act='relu', name="sfc0") 32 | self.fc1 = layers.fc(size=hid1_size, act='relu', name="sfc1") 33 | self.fc2 = layers.fc(size=hid2_size, act='relu', name="sfc2") 34 | self.fc3 = layers.fc(size=act_dim, act=None, name="sfc3") 35 | 36 | def value(self, obs): 37 | # 定义网络 38 | # 输入state,输出所有action对应的Q,[Q(s,a1), Q(s,a2), Q(s,a3)...] 39 | #h0 = self.fc0(obs) 40 | h1 = self.fc1(obs) 41 | h2 = self.fc2(h1) 42 | Q = self.fc3(h2) 43 | return Q 44 | 45 | class DQN(parl.Algorithm): 46 | def __init__(self, model, act_dim=None, gamma=None, lr=None): 47 | """ DQN algorithm 48 | 49 | Args: 50 | model (parl.Model): 定义Q函数的前向网络结构 51 | act_dim (int): action空间的维度,即有几个action 52 | gamma (float): reward的衰减因子 53 | lr (float): learning rate 学习率. 54 | """ 55 | self.model = model 56 | self.target_model = copy.deepcopy(model) 57 | 58 | assert isinstance(act_dim, int) 59 | assert isinstance(gamma, float) 60 | assert isinstance(lr, float) 61 | self.act_dim = act_dim 62 | self.gamma = gamma 63 | self.lr = lr 64 | 65 | def predict(self, obs): 66 | """ 使用self.model的value网络来获取 [Q(s,a1),Q(s,a2),...]
67 | """ 68 | return self.model.value(obs) 69 | 70 | def learn(self, obs, action, reward, next_obs, terminal): 71 | """ 使用DQN算法更新self.model的value网络 72 | """ 73 | # 从target_model中获取 max Q' 的值,用于计算target_Q 74 | next_pred_value = self.target_model.value(next_obs) 75 | best_v = layers.reduce_max(next_pred_value, dim=1) 76 | best_v.stop_gradient = True # 阻止梯度传递 77 | terminal = layers.cast(terminal, dtype='float32') 78 | target = reward + (1.0 - terminal) * self.gamma * best_v 79 | 80 | pred_value = self.model.value(obs) # 获取Q预测值 81 | # 将action转onehot向量,比如:3 => [0,0,0,1,0] 82 | action_onehot = layers.one_hot(action, self.act_dim) 83 | action_onehot = layers.cast(action_onehot, dtype='float32') 84 | # 下面一行是逐元素相乘,拿到action对应的 Q(s,a) 85 | # 比如:pred_value = [[2.3, 5.7, 1.2, 3.9, 1.4]], action_onehot = [[0,0,0,1,0]] 86 | # ==> pred_action_value = [[3.9]] 87 | pred_action_value = layers.reduce_sum( 88 | layers.elementwise_mul(action_onehot, pred_value), dim=1) 89 | 90 | # 计算 Q(s,a) 与 target_Q的均方差,得到loss 91 | cost = layers.square_error_cost(pred_action_value, target) 92 | cost = layers.reduce_mean(cost) 93 | optimizer = fluid.optimizer.Adam(learning_rate=self.lr) # 使用Adam优化器 94 | optimizer.minimize(cost) 95 | return cost 96 | 97 | def sync_target(self): 98 | """ 把 self.model 的模型参数值同步到 self.target_model 99 | """ 100 | self.model.sync_weights_to(self.target_model) 101 | 102 | class Agent(parl.Agent): 103 | def __init__(self, 104 | algorithm, 105 | obs_dim, 106 | act_dim, 107 | e_greed=0.1, 108 | e_greed_decrement=0): 109 | assert isinstance(obs_dim, int) 110 | assert isinstance(act_dim, int) 111 | self.obs_dim = obs_dim 112 | self.act_dim = act_dim 113 | super(Agent, self).__init__(algorithm) 114 | 115 | self.global_step = 0 116 | self.update_target_steps = 200 # 每隔200个training steps再把model的参数复制到target_model中 117 | 118 | self.e_greed = e_greed # 有一定概率随机选取动作,探索 119 | self.e_greed_decrement = e_greed_decrement # 随着训练逐步收敛,探索的程度慢慢降低 120 | 121 | def build_program(self): 122 | self.pred_program = fluid.Program() 123 | self.learn_program = fluid.Program() 124 | 125 | with fluid.program_guard(self.pred_program): # 搭建计算图用于 预测动作,定义输入输出变量 126 | obs = layers.data( 127 | name='obs', shape=[self.obs_dim], dtype='float32') 128 | self.value = self.alg.predict(obs) 129 | 130 | with fluid.program_guard(self.learn_program): # 搭建计算图用于 更新Q网络,定义输入输出变量 131 | obs = layers.data( 132 | name='obs', shape=[self.obs_dim], dtype='float32') 133 | action = layers.data(name='act', shape=[1], dtype='int32') 134 | reward = layers.data(name='reward', shape=[], dtype='float32') 135 | next_obs = layers.data( 136 | name='next_obs', shape=[self.obs_dim], dtype='float32') 137 | terminal = layers.data(name='terminal', shape=[], dtype='bool') 138 | self.cost = self.alg.learn(obs, action, reward, next_obs, terminal) 139 | 140 | def sample(self, obs): 141 | sample = np.random.rand() # 产生0~1之间的小数 142 | if sample < self.e_greed: 143 | act = np.random.randint(self.act_dim) 144 | #act = 0 # 探索:每个动作都有概率被选择 145 | else: 146 | act = self.predict(obs) # 选择最优动作 147 | self.e_greed = max( 148 | 0.01, self.e_greed - self.e_greed_decrement) # 随着训练逐步收敛,探索的程度慢慢降低 149 | return act 150 | 151 | def predict(self, obs): # 选择最优动作 152 | obs = np.expand_dims(obs, axis=0) 153 | pred_Q = self.fluid_executor.run( 154 | self.pred_program, 155 | feed={'obs': obs.astype('float32')}, 156 | fetch_list=[self.value])[0] 157 | pred_Q = np.squeeze(pred_Q, axis=0) 158 | act = np.argmax(pred_Q) # 选择Q最大的下标,即对应的动作 159 | return act 160 | 161 | def learn(self, obs, act, reward, next_obs, 
terminal): 162 | # 每隔200个training steps同步一次model和target_model的参数 163 | if self.global_step % self.update_target_steps == 0: 164 | self.alg.sync_target() 165 | self.global_step += 1 166 | 167 | act = np.expand_dims(act, -1) 168 | feed = { 169 | 'obs': obs.astype('float32'), 170 | 'act': act.astype('int32'), 171 | 'reward': reward, 172 | 'next_obs': next_obs.astype('float32'), 173 | 'terminal': terminal 174 | } 175 | cost = self.fluid_executor.run( 176 | self.learn_program, feed=feed, fetch_list=[self.cost])[0] # 训练一次网络 177 | return cost 178 | 179 | import random 180 | import collections 181 | import numpy as np 182 | 183 | 184 | class ReplayMemory(object): 185 | def __init__(self, max_size): 186 | self.buffer = collections.deque(maxlen=max_size) 187 | 188 | # 增加一条经验到经验池中 189 | def append(self, exp): 190 | self.buffer.append(exp) 191 | 192 | # 从经验池中选取N条经验出来 193 | def sample(self, batch_size): 194 | mini_batch = random.sample(self.buffer, batch_size) 195 | obs_batch, action_batch, reward_batch, next_obs_batch, done_batch = [], [], [], [], [] 196 | 197 | for experience in mini_batch: 198 | s, a, r, s_p, done = experience 199 | obs_batch.append(s) 200 | action_batch.append(a) 201 | reward_batch.append(r) 202 | next_obs_batch.append(s_p) 203 | done_batch.append(done) 204 | 205 | return np.array(obs_batch).astype('float32'), \ 206 | np.array(action_batch).astype('float32'), np.array(reward_batch).astype('float32'),\ 207 | np.array(next_obs_batch).astype('float32'), np.array(done_batch).astype('float32') 208 | 209 | def __len__(self): 210 | return len(self.buffer) 211 | 212 | totale = 0 213 | 214 | def run_episode(env, agent, rpm): 215 | actionset = env.getActionSet() 216 | global totale 217 | # print(totale) 218 | totale += 1 219 | total_reward = 0 220 | env.init() 221 | env.reset_game() 222 | obs = list(env.getGameState().values()) 223 | step = 0 224 | while True: 225 | step += 1 226 | action = agent.sample(obs) # 采样动作,所有动作都有概率被尝试到 227 | #print(action," ", end="") 228 | reward = env.act(actionset[action]) 229 | next_obs = list(env.getGameState().values()) 230 | done = env.game_over() 231 | rpm.append((obs, action, reward, next_obs, done)) 232 | 233 | # train model 234 | if (len(rpm) > MEMORY_WARMUP_SIZE) and (step % LEARN_FREQ == 0): 235 | (batch_obs, batch_action, batch_reward, batch_next_obs, 236 | batch_done) = rpm.sample(BATCH_SIZE) 237 | train_loss = agent.learn(batch_obs, batch_action, batch_reward, 238 | batch_next_obs, 239 | batch_done) # s,a,r,s',done 240 | 241 | total_reward += reward 242 | obs = next_obs 243 | if done: 244 | break 245 | #print() 246 | return total_reward 247 | 248 | 249 | # 评估 agent, 跑 5 个episode,总reward求平均 250 | def evaluate(agent): 251 | env = PLE(game, fps=30, display_screen=True) 252 | actionset = env.getActionSet() 253 | eval_reward = [] 254 | for i in range(5): 255 | env.init() 256 | env.reset_game() 257 | obs = list(env.getGameState().values()) 258 | episode_reward = 0 259 | while True: 260 | action = agent.predict(obs) 261 | observation = env.getScreenRGB() 262 | score = env.score() 263 | #action = agent.pickAction(reward, observation) 264 | observation = cv2.transpose(observation) 265 | font = cv2.FONT_HERSHEY_SIMPLEX 266 | observation = cv2.putText(observation, str(int(score)), (0, 25), font, 1.2, (255, 255, 255), 2) 267 | cv2.imshow("ss", observation) 268 | cv2.waitKey(10) # 预测动作,只选最优动作 269 | reward= env.act(actionset[action]) 270 | obs = list(env.getGameState().values()) 271 | done = env.game_over() 272 | episode_reward += reward 273 | if done: 274 | break 
275 | eval_reward.append(episode_reward) 276 | cv2.destroyAllWindows() 277 | return np.mean(eval_reward) 278 | 279 | 280 | game = FlappyBird() 281 | env = PLE(game, fps=30, display_screen=False) # CartPole-v0: 预期最后一次评估总分 > 180(最大值是200) 282 | action_dim = len(env.getActionSet()) # CartPole-v0: 2 283 | obs_shape = len(env.getGameState()) # CartPole-v0: (4,) 284 | 285 | rpm = ReplayMemory(MEMORY_SIZE) # DQN的经验回放池 286 | 287 | # 根据parl框架构建agent 288 | model = Model(act_dim=action_dim) 289 | algorithm = DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE) 290 | agent = Agent( 291 | algorithm, 292 | obs_dim=obs_shape, 293 | act_dim=action_dim, 294 | e_greed=0.1, # 有一定概率随机选取动作,探索 295 | e_greed_decrement=1e-6) # 随着训练逐步收敛,探索的程度慢慢降低 296 | 297 | # 加载模型 298 | # save_path = './dqn_model.ckpt' 299 | # agent.restore(save_path) 300 | 301 | # 先往经验池里存一些数据,避免最开始训练的时候样本丰富度不够 302 | while len(rpm) < MEMORY_WARMUP_SIZE: 303 | run_episode(env, agent, rpm) 304 | 305 | max_episode = 20000000 306 | 307 | # 开始训练 308 | episode = 0 309 | 310 | ps = datetime.now() 311 | 312 | evmax = 0 313 | 314 | while episode < max_episode: # 训练max_episode个回合,test部分不计算入episode数量 315 | # train part 316 | start = datetime.now() 317 | for i in range(0, 100): 318 | total_reward = run_episode(env, agent, rpm) 319 | episode += 1 320 | end = datetime.now() 321 | # test part 322 | eval_reward = evaluate(agent) # render=True 查看显示效果 323 | logger.info('episode:{} time:{} e_greed:{} test_reward:{}'.format( 324 | episode, (end-start).seconds, agent.e_greed, eval_reward)) 325 | 326 | # 训练结束,保存模型 327 | if eval_reward > evmax: 328 | save_path = './modelsm_' + str(episode) + '_' + str(eval_reward) + '.ckpt' 329 | agent.save(save_path) 330 | evmax = eval_reward -------------------------------------------------------------------------------- /dqn3.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.putenv('SDL_VIDEODRIVER', 'fbcon') 3 | os.environ["SDL_VIDEODRIVER"] = "dummy" 4 | 5 | from ple.games.flappybird import FlappyBird 6 | from ple import PLE 7 | import parl 8 | from parl import layers 9 | import paddle.fluid as fluid 10 | import copy 11 | import numpy as np 12 | import os 13 | import gym 14 | from parl.utils import logger 15 | from datetime import datetime 16 | import cv2 17 | 18 | LEARN_FREQ = 5 # 训练频率,不需要每一个step都learn,攒一些新增经验后再learn,提高效率 19 | MEMORY_SIZE = 200000 # replay memory的大小,越大越占用内存 20 | MEMORY_WARMUP_SIZE = 200 # replay_memory 里需要预存一些经验数据,再开启训练 21 | BATCH_SIZE = 32 # 每次给agent learn的数据数量,从replay memory随机里sample一批数据出来 22 | LEARNING_RATE = 0.001 # 学习率 23 | GAMMA = 0.99 24 | 25 | class Model(parl.Model): 26 | def __init__(self, act_dim): 27 | hid0_size = 64 28 | hid1_size = 32 29 | hid2_size = 16 30 | # 3层全连接网络 31 | self.fc0 = layers.fc(size=hid0_size, act='relu', name="fc0") 32 | self.fc1 = layers.fc(size=hid1_size, act='relu', name="fc1") 33 | self.fc2 = layers.fc(size=hid2_size, act='relu', name="fc2") 34 | self.fc3 = layers.fc(size=act_dim, act=None, name="fc3") 35 | 36 | def value(self, obs): 37 | # 定义网络 38 | # 输入state,输出所有action对应的Q,[Q(s,a1), Q(s,a2), Q(s,a3)...] 39 | h0 = self.fc0(obs) 40 | h1 = self.fc1(h0) 41 | h2 = self.fc2(h1) 42 | Q = self.fc3(h2) 43 | return Q 44 | 45 | class DQN(parl.Algorithm): 46 | def __init__(self, model, act_dim=None, gamma=None, lr=None): 47 | """ DQN algorithm 48 | 49 | Args: 50 | model (parl.Model): 定义Q函数的前向网络结构 51 | act_dim (int): action空间的维度,即有几个action 52 | gamma (float): reward的衰减因子 53 | lr (float): learning rate 学习率. 
54 | """ 55 | self.model = model 56 | self.target_model = copy.deepcopy(model) 57 | 58 | assert isinstance(act_dim, int) 59 | assert isinstance(gamma, float) 60 | assert isinstance(lr, float) 61 | self.act_dim = act_dim 62 | self.gamma = gamma 63 | self.lr = lr 64 | 65 | def predict(self, obs): 66 | """ 使用self.model的value网络来获取 [Q(s,a1),Q(s,a2),...] 67 | """ 68 | return self.model.value(obs) 69 | 70 | def learn(self, obs, action, reward, next_obs, terminal): 71 | """ 使用DQN算法更新self.model的value网络 72 | """ 73 | # 从target_model中获取 max Q' 的值,用于计算target_Q 74 | next_pred_value = self.target_model.value(next_obs) 75 | best_v = layers.reduce_max(next_pred_value, dim=1) 76 | best_v.stop_gradient = True # 阻止梯度传递 77 | terminal = layers.cast(terminal, dtype='float32') 78 | target = reward + (1.0 - terminal) * self.gamma * best_v 79 | 80 | pred_value = self.model.value(obs) # 获取Q预测值 81 | # 将action转onehot向量,比如:3 => [0,0,0,1,0] 82 | action_onehot = layers.one_hot(action, self.act_dim) 83 | action_onehot = layers.cast(action_onehot, dtype='float32') 84 | # 下面一行是逐元素相乘,拿到action对应的 Q(s,a) 85 | # 比如:pred_value = [[2.3, 5.7, 1.2, 3.9, 1.4]], action_onehot = [[0,0,0,1,0]] 86 | # ==> pred_action_value = [[3.9]] 87 | pred_action_value = layers.reduce_sum( 88 | layers.elementwise_mul(action_onehot, pred_value), dim=1) 89 | 90 | # 计算 Q(s,a) 与 target_Q的均方差,得到loss 91 | cost = layers.square_error_cost(pred_action_value, target) 92 | cost = layers.reduce_mean(cost) 93 | optimizer = fluid.optimizer.Adam(learning_rate=self.lr) # 使用Adam优化器 94 | optimizer.minimize(cost) 95 | return cost 96 | 97 | def sync_target(self): 98 | """ 把 self.model 的模型参数值同步到 self.target_model 99 | """ 100 | self.model.sync_weights_to(self.target_model) 101 | 102 | class Agent(parl.Agent): 103 | def __init__(self, 104 | algorithm, 105 | obs_dim, 106 | act_dim, 107 | e_greed=0.1, 108 | e_greed_decrement=0): 109 | assert isinstance(obs_dim, int) 110 | assert isinstance(act_dim, int) 111 | self.obs_dim = obs_dim 112 | self.act_dim = act_dim 113 | super(Agent, self).__init__(algorithm) 114 | 115 | self.global_step = 0 116 | self.update_target_steps = 200 # 每隔200个training steps再把model的参数复制到target_model中 117 | 118 | self.e_greed = e_greed # 有一定概率随机选取动作,探索 119 | self.e_greed_decrement = e_greed_decrement # 随着训练逐步收敛,探索的程度慢慢降低 120 | 121 | def build_program(self): 122 | self.pred_program = fluid.Program() 123 | self.learn_program = fluid.Program() 124 | 125 | with fluid.program_guard(self.pred_program): # 搭建计算图用于 预测动作,定义输入输出变量 126 | obs = layers.data( 127 | name='obs', shape=[self.obs_dim], dtype='float32') 128 | self.value = self.alg.predict(obs) 129 | 130 | with fluid.program_guard(self.learn_program): # 搭建计算图用于 更新Q网络,定义输入输出变量 131 | obs = layers.data( 132 | name='obs', shape=[self.obs_dim], dtype='float32') 133 | action = layers.data(name='act', shape=[1], dtype='int32') 134 | reward = layers.data(name='reward', shape=[], dtype='float32') 135 | next_obs = layers.data( 136 | name='next_obs', shape=[self.obs_dim], dtype='float32') 137 | terminal = layers.data(name='terminal', shape=[], dtype='bool') 138 | self.cost = self.alg.learn(obs, action, reward, next_obs, terminal) 139 | 140 | def sample(self, obs): 141 | sample = np.random.rand() # 产生0~1之间的小数 142 | if sample < self.e_greed: 143 | act = np.random.randint(self.act_dim) 144 | #act = 0 # 探索:每个动作都有概率被选择 145 | else: 146 | act = self.predict(obs) # 选择最优动作 147 | self.e_greed = max( 148 | 0.01, self.e_greed - self.e_greed_decrement) # 随着训练逐步收敛,探索的程度慢慢降低 149 | return act 150 | 151 | def predict(self, obs): # 
选择最优动作 152 | obs = np.expand_dims(obs, axis=0) 153 | pred_Q = self.fluid_executor.run( 154 | self.pred_program, 155 | feed={'obs': obs.astype('float32')}, 156 | fetch_list=[self.value])[0] 157 | pred_Q = np.squeeze(pred_Q, axis=0) 158 | act = np.argmax(pred_Q) # 选择Q最大的下标,即对应的动作 159 | return act 160 | 161 | def learn(self, obs, act, reward, next_obs, terminal): 162 | # 每隔200个training steps同步一次model和target_model的参数 163 | if self.global_step % self.update_target_steps == 0: 164 | self.alg.sync_target() 165 | self.global_step += 1 166 | 167 | act = np.expand_dims(act, -1) 168 | feed = { 169 | 'obs': obs.astype('float32'), 170 | 'act': act.astype('int32'), 171 | 'reward': reward, 172 | 'next_obs': next_obs.astype('float32'), 173 | 'terminal': terminal 174 | } 175 | cost = self.fluid_executor.run( 176 | self.learn_program, feed=feed, fetch_list=[self.cost])[0] # 训练一次网络 177 | return cost 178 | 179 | import random 180 | import collections 181 | import numpy as np 182 | 183 | 184 | class ReplayMemory(object): 185 | def __init__(self, max_size): 186 | self.buffer = collections.deque(maxlen=max_size) 187 | 188 | # 增加一条经验到经验池中 189 | def append(self, exp): 190 | self.buffer.append(exp) 191 | 192 | # 从经验池中选取N条经验出来 193 | def sample(self, batch_size): 194 | mini_batch = random.sample(self.buffer, batch_size) 195 | obs_batch, action_batch, reward_batch, next_obs_batch, done_batch = [], [], [], [], [] 196 | 197 | for experience in mini_batch: 198 | s, a, r, s_p, done = experience 199 | obs_batch.append(s) 200 | action_batch.append(a) 201 | reward_batch.append(r) 202 | next_obs_batch.append(s_p) 203 | done_batch.append(done) 204 | 205 | return np.array(obs_batch).astype('float32'), \ 206 | np.array(action_batch).astype('float32'), np.array(reward_batch).astype('float32'),\ 207 | np.array(next_obs_batch).astype('float32'), np.array(done_batch).astype('float32') 208 | 209 | def __len__(self): 210 | return len(self.buffer) 211 | 212 | totale = 0 213 | 214 | def run_episode(env, agent, rpm): 215 | actionset = env.getActionSet() 216 | global totale 217 | # print(totale) 218 | totale += 1 219 | total_reward = 0 220 | env.init() 221 | env.reset_game() 222 | obs = list(env.getGameState().values()) 223 | step = 0 224 | while True: 225 | step += 1 226 | action = agent.sample(obs) # 采样动作,所有动作都有概率被尝试到 227 | #print(action," ", end="") 228 | reward = env.act(actionset[action]) 229 | next_obs = list(env.getGameState().values()) 230 | done = env.game_over() 231 | rpm.append((obs, action, reward, next_obs, done)) 232 | 233 | # train model 234 | if (len(rpm) > MEMORY_WARMUP_SIZE) and (step % LEARN_FREQ == 0): 235 | (batch_obs, batch_action, batch_reward, batch_next_obs, 236 | batch_done) = rpm.sample(BATCH_SIZE) 237 | train_loss = agent.learn(batch_obs, batch_action, batch_reward, 238 | batch_next_obs, 239 | batch_done) # s,a,r,s',done 240 | 241 | total_reward += reward 242 | obs = next_obs 243 | if done: 244 | break 245 | #print() 246 | return total_reward 247 | 248 | 249 | # 评估 agent, 跑 5 个episode,总reward求平均 250 | def evaluate(agent): 251 | env = PLE(game, fps=30, display_screen=True) 252 | actionset = env.getActionSet() 253 | eval_reward = [] 254 | for i in range(5): 255 | env.init() 256 | env.reset_game() 257 | obs = list(env.getGameState().values()) 258 | episode_reward = 0 259 | while True: 260 | action = agent.predict(obs) 261 | observation = env.getScreenRGB() 262 | score = env.score() 263 | #action = agent.pickAction(reward, observation) 264 | observation = cv2.transpose(observation) 265 | font = 
cv2.FONT_HERSHEY_SIMPLEX 266 | observation = cv2.putText(observation, str(int(score)), (0, 25), font, 1.2, (255, 255, 255), 2) 267 | cv2.imshow("ss", observation) 268 | cv2.waitKey(10) # 预测动作,只选最优动作 269 | reward= env.act(actionset[action]) 270 | obs = list(env.getGameState().values()) 271 | done = env.game_over() 272 | episode_reward += reward 273 | if done: 274 | break 275 | eval_reward.append(episode_reward) 276 | cv2.destroyAllWindows() 277 | return np.mean(eval_reward) 278 | 279 | 280 | game = FlappyBird() 281 | env = PLE(game, fps=30, display_screen=False) # CartPole-v0: 预期最后一次评估总分 > 180(最大值是200) 282 | action_dim = len(env.getActionSet()) # CartPole-v0: 2 283 | obs_shape = len(env.getGameState()) # CartPole-v0: (4,) 284 | 285 | rpm = ReplayMemory(MEMORY_SIZE) # DQN的经验回放池 286 | 287 | # 根据parl框架构建agent 288 | model = Model(act_dim=action_dim) 289 | algorithm = DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE) 290 | agent = Agent( 291 | algorithm, 292 | obs_dim=obs_shape, 293 | act_dim=action_dim, 294 | e_greed=0.1, # 有一定概率随机选取动作,探索 295 | e_greed_decrement=1e-6) # 随着训练逐步收敛,探索的程度慢慢降低 296 | 297 | # 加载模型 298 | # save_path = './dqn_model.ckpt' 299 | # agent.restore(save_path) 300 | 301 | # 先往经验池里存一些数据,避免最开始训练的时候样本丰富度不够 302 | while len(rpm) < MEMORY_WARMUP_SIZE: 303 | run_episode(env, agent, rpm) 304 | 305 | max_episode = 20000000 306 | 307 | # 开始训练 308 | episode = 0 309 | 310 | ps = datetime.now() 311 | 312 | evmax = 0 313 | 314 | while episode < max_episode: # 训练max_episode个回合,test部分不计算入episode数量 315 | # train part 316 | start = datetime.now() 317 | for i in range(0, 100): 318 | total_reward = run_episode(env, agent, rpm) 319 | episode += 1 320 | end = datetime.now() 321 | # test part 322 | eval_reward = evaluate(agent) # render=True 查看显示效果 323 | logger.info('episode:{} time:{} e_greed:{} test_reward:{}'.format( 324 | episode, (end-start).seconds, agent.e_greed, eval_reward)) 325 | 326 | # 训练结束,保存模型 327 | if eval_reward > evmax: 328 | save_path = './model_' + str(episode) + '_' + str(eval_reward) + '.ckpt' 329 | agent.save(save_path) 330 | evmax = eval_reward -------------------------------------------------------------------------------- /dqnconcat.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.putenv('SDL_VIDEODRIVER', 'fbcon') 3 | os.environ["SDL_VIDEODRIVER"] = "dummy" 4 | 5 | from ple.games.flappybird import FlappyBird 6 | from ple import PLE 7 | import parl 8 | from parl import layers 9 | import paddle.fluid as fluid 10 | import copy 11 | import numpy as np 12 | import os 13 | import gym 14 | from parl.utils import logger 15 | from datetime import datetime 16 | import cv2 17 | 18 | LEARN_FREQ = 5 # 训练频率,不需要每一个step都learn,攒一些新增经验后再learn,提高效率 19 | MEMORY_SIZE = 200000 # replay memory的大小,越大越占用内存 20 | MEMORY_WARMUP_SIZE = 200 # replay_memory 里需要预存一些经验数据,再开启训练 21 | BATCH_SIZE = 32 # 每次给agent learn的数据数量,从replay memory随机里sample一批数据出来 22 | LEARNING_RATE = 0.001 # 学习率 23 | GAMMA = 0.99 24 | 25 | class Model(parl.Model): 26 | def __init__(self, act_dim): 27 | hid0_size = 64 28 | hid1_size = 32 29 | hid2_size = 16 30 | # 3层全连接网络 31 | self.fc0 = layers.fc(size=hid0_size, act='relu', name="catfc0") 32 | self.fc1 = layers.fc(size=hid1_size, act='relu', name="catfc1") 33 | self.fc2 = layers.fc(size=hid2_size, act='relu', name="catfc2") 34 | self.fc3 = layers.fc(size=act_dim, act=None, name="catfc3") 35 | 36 | def value(self, last_obs, obs): 37 | # 定义网络 38 | # 输入state,输出所有action对应的Q,[Q(s,a1), Q(s,a2), Q(s,a3)...] 
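# last_obs and obs (the previous and the current game state) are concatenated along the feature axis below, so this network's input is twice the single-frame observation size used in dqn2.py / dqn3.py.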
39 | # oobs = np.concatenate(last_obs, obs) 40 | # print(obs.numpy()) 41 | oobs = fluid.layers.concat(input=[last_obs, obs], axis=-1, name='concat') 42 | 43 | h0 = self.fc0(oobs) 44 | h1 = self.fc1(h0) 45 | h2 = self.fc2(h1) 46 | Q = self.fc3(h2) 47 | return Q 48 | 49 | class DQN(parl.Algorithm): 50 | def __init__(self, model, act_dim=None, gamma=None, lr=None): 51 | """ DQN algorithm 52 | 53 | Args: 54 | model (parl.Model): 定义Q函数的前向网络结构 55 | act_dim (int): action空间的维度,即有几个action 56 | gamma (float): reward的衰减因子 57 | lr (float): learning rate 学习率. 58 | """ 59 | self.model = model 60 | self.target_model = copy.deepcopy(model) 61 | 62 | assert isinstance(act_dim, int) 63 | assert isinstance(gamma, float) 64 | assert isinstance(lr, float) 65 | self.act_dim = act_dim 66 | self.gamma = gamma 67 | self.lr = lr 68 | 69 | def predict(self, last, obs): 70 | """ 使用self.model的value网络来获取 [Q(s,a1),Q(s,a2),...] 71 | """ 72 | return self.model.value(last, obs) 73 | 74 | def learn(self, last_obs, obs, action, reward, next_obs, terminal): 75 | """ 使用DQN算法更新self.model的value网络 76 | """ 77 | # 从target_model中获取 max Q' 的值,用于计算target_Q 78 | next_pred_value = self.target_model.value(obs, next_obs) 79 | best_v = layers.reduce_max(next_pred_value, dim=1) 80 | best_v.stop_gradient = True # 阻止梯度传递 81 | terminal = layers.cast(terminal, dtype='float32') 82 | target = reward + (1.0 - terminal) * self.gamma * best_v 83 | 84 | pred_value = self.model.value(last_obs, obs) # 获取Q预测值 85 | # 将action转onehot向量,比如:3 => [0,0,0,1,0] 86 | action_onehot = layers.one_hot(action, self.act_dim) 87 | action_onehot = layers.cast(action_onehot, dtype='float32') 88 | # 下面一行是逐元素相乘,拿到action对应的 Q(s,a) 89 | # 比如:pred_value = [[2.3, 5.7, 1.2, 3.9, 1.4]], action_onehot = [[0,0,0,1,0]] 90 | # ==> pred_action_value = [[3.9]] 91 | pred_action_value = layers.reduce_sum( 92 | layers.elementwise_mul(action_onehot, pred_value), dim=1) 93 | 94 | # 计算 Q(s,a) 与 target_Q的均方差,得到loss 95 | cost = layers.square_error_cost(pred_action_value, target) 96 | cost = layers.reduce_mean(cost) 97 | optimizer = fluid.optimizer.Adam(learning_rate=self.lr) # 使用Adam优化器 98 | optimizer.minimize(cost) 99 | return cost 100 | 101 | def sync_target(self): 102 | """ 把 self.model 的模型参数值同步到 self.target_model 103 | """ 104 | self.model.sync_weights_to(self.target_model) 105 | 106 | class Agent(parl.Agent): 107 | def __init__(self, 108 | algorithm, 109 | obs_dim, 110 | act_dim, 111 | e_greed=0.1, 112 | e_greed_decrement=0): 113 | assert isinstance(obs_dim, int) 114 | assert isinstance(act_dim, int) 115 | self.obs_dim = obs_dim 116 | self.act_dim = act_dim 117 | super(Agent, self).__init__(algorithm) 118 | 119 | self.global_step = 0 120 | self.update_target_steps = 200 # 每隔200个training steps再把model的参数复制到target_model中 121 | 122 | self.e_greed = e_greed # 有一定概率随机选取动作,探索 123 | self.e_greed_decrement = e_greed_decrement # 随着训练逐步收敛,探索的程度慢慢降低 124 | 125 | def build_program(self): 126 | self.pred_program = fluid.Program() 127 | self.learn_program = fluid.Program() 128 | 129 | with fluid.program_guard(self.pred_program): # 搭建计算图用于 预测动作,定义输入输出变量 130 | last_obs = layers.data( 131 | name='last_obs', shape=[self.obs_dim], dtype='float32') 132 | obs = layers.data( 133 | name='obs', shape=[self.obs_dim], dtype='float32') 134 | self.value = self.alg.predict(last_obs, obs) 135 | 136 | with fluid.program_guard(self.learn_program): # 搭建计算图用于 更新Q网络,定义输入输出变量 137 | last_obs = layers.data( 138 | name='last_obs', shape=[self.obs_dim], dtype='float32') 139 | obs = layers.data( 140 | name='obs', 
shape=[self.obs_dim], dtype='float32') 141 | action = layers.data(name='act', shape=[1], dtype='int32') 142 | reward = layers.data(name='reward', shape=[], dtype='float32') 143 | next_obs = layers.data( 144 | name='next_obs', shape=[self.obs_dim], dtype='float32') 145 | terminal = layers.data(name='terminal', shape=[], dtype='bool') 146 | self.cost = self.alg.learn(last_obs, obs, action, reward, next_obs, terminal) 147 | 148 | def sample(self, last_obs, obs): 149 | sample = np.random.rand() # 产生0~1之间的小数 150 | if sample < self.e_greed: 151 | act = np.random.randint(self.act_dim) 152 | #act = 0 # 探索:每个动作都有概率被选择 153 | else: 154 | act = self.predict(last_obs, obs) # 选择最优动作 155 | self.e_greed = max( 156 | 0.01, self.e_greed - self.e_greed_decrement) # 随着训练逐步收敛,探索的程度慢慢降低 157 | return act 158 | 159 | def predict(self, last_obs, obs): # 选择最优动作 160 | obs = np.expand_dims(obs, axis=0) 161 | last_obs = np.expand_dims(last_obs, axis=0) 162 | pred_Q = self.fluid_executor.run( 163 | self.pred_program, 164 | feed={ 165 | 'obs': obs.astype('float32'), 166 | 'last_obs': last_obs.astype('float32') 167 | }, 168 | fetch_list=[self.value])[0] 169 | pred_Q = np.squeeze(pred_Q, axis=0) 170 | act = np.argmax(pred_Q) # 选择Q最大的下标,即对应的动作 171 | return act 172 | 173 | def learn(self, last_obs, obs, act, reward, next_obs, terminal): 174 | # 每隔200个training steps同步一次model和target_model的参数 175 | if self.global_step % self.update_target_steps == 0: 176 | self.alg.sync_target() 177 | self.global_step += 1 178 | 179 | act = np.expand_dims(act, -1) 180 | feed = { 181 | 'last_obs': last_obs.astype('float32'), 182 | 'obs': obs.astype('float32'), 183 | 'act': act.astype('int32'), 184 | 'reward': reward, 185 | 'next_obs': next_obs.astype('float32'), 186 | 'terminal': terminal 187 | } 188 | cost = self.fluid_executor.run( 189 | self.learn_program, feed=feed, fetch_list=[self.cost])[0] # 训练一次网络 190 | return cost 191 | 192 | import random 193 | import collections 194 | import numpy as np 195 | 196 | 197 | class ReplayMemory(object): 198 | def __init__(self, max_size): 199 | self.buffer = collections.deque(maxlen=max_size) 200 | 201 | # 增加一条经验到经验池中 202 | def append(self, exp): 203 | self.buffer.append(exp) 204 | 205 | # 从经验池中选取N条经验出来 206 | def sample(self, batch_size): 207 | mini_batch = random.sample(self.buffer, batch_size) 208 | lo_batch, obs_batch, action_batch, reward_batch, next_obs_batch, done_batch = [], [], [], [], [], [] 209 | 210 | for experience in mini_batch: 211 | l, s, a, r, s_p, done = experience 212 | lo_batch.append(l) 213 | obs_batch.append(s) 214 | action_batch.append(a) 215 | reward_batch.append(r) 216 | next_obs_batch.append(s_p) 217 | done_batch.append(done) 218 | 219 | return np.array(lo_batch).astype('float32'), np.array(obs_batch).astype('float32'), \ 220 | np.array(action_batch).astype('float32'), np.array(reward_batch).astype('float32'),\ 221 | np.array(next_obs_batch).astype('float32'), np.array(done_batch).astype('float32') 222 | 223 | def __len__(self): 224 | return len(self.buffer) 225 | 226 | totale = 0 227 | 228 | def run_episode(env, agent, rpm): 229 | actionset = env.getActionSet() 230 | global totale 231 | # print(totale) 232 | totale += 1 233 | total_reward = 0 234 | env.init() 235 | env.reset_game() 236 | obs = list(env.getGameState().values()) 237 | #print(obs) 238 | step = 0 239 | last_obs = np.zeros_like(obs) 240 | while True: 241 | step += 1 242 | # print(last_obs) 243 | # print(obs) 244 | # print(np.concatenate([last_obs, obs], axis=0)) 245 | # input() 246 | action = agent.sample(last_obs, 
obs) # 采样动作,所有动作都有概率被尝试到 247 | #print(action," ", end="") 248 | 249 | reward = env.act(actionset[action]) 250 | next_obs = list(env.getGameState().values()) 251 | done = env.game_over() 252 | rpm.append((last_obs, obs, action, reward, next_obs, done)) 253 | 254 | # train model 255 | if (len(rpm) > MEMORY_WARMUP_SIZE) and (step % LEARN_FREQ == 0): 256 | (batch_last, batch_obs, batch_action, batch_reward, batch_next_obs, 257 | batch_done) = rpm.sample(BATCH_SIZE) 258 | train_loss = agent.learn(batch_last, batch_obs, batch_action, batch_reward, 259 | batch_next_obs, 260 | batch_done) # s,a,r,s',done 261 | 262 | total_reward += reward 263 | last_obs = obs 264 | obs = next_obs 265 | if done: 266 | break 267 | #print() 268 | return total_reward 269 | 270 | 271 | # 评估 agent, 跑 5 个episode,总reward求平均 272 | def evaluate(agent): 273 | env = PLE(game, fps=30, display_screen=True) 274 | actionset = env.getActionSet() 275 | eval_reward = [] 276 | for i in range(5): 277 | env.init() 278 | env.reset_game() 279 | obs = list(env.getGameState().values()) 280 | episode_reward = 0 281 | last_obs = np.zeros_like(obs) 282 | while True: 283 | action = agent.predict(last_obs, obs) 284 | observation = env.getScreenRGB() 285 | score = env.score() 286 | # print(score) 287 | observation = cv2.transpose(observation) 288 | font = cv2.FONT_HERSHEY_SIMPLEX 289 | observation = cv2.putText(observation, str(int(score)), (0, 25), font, 1.2, (255, 255, 255), 2) 290 | cv2.imshow("ss", observation) 291 | cv2.waitKey(5) # 预测动作,只选最优动作 292 | reward = env.act(actionset[action]) 293 | last_obs = obs 294 | obs = list(env.getGameState().values()) 295 | done = env.game_over() 296 | episode_reward += reward 297 | if done: 298 | break 299 | eval_reward.append(episode_reward) 300 | cv2.destroyAllWindows() 301 | return np.mean(eval_reward) 302 | 303 | game = FlappyBird() 304 | env = PLE(game, fps=30, display_screen=False) # CartPole-v0: 预期最后一次评估总分 > 180(最大值是200) 305 | 306 | action_dim = len(env.getActionSet()) # CartPole-v0: 2 307 | obs_shape = len(env.getGameState()) # CartPole-v0: (4,) 308 | 309 | rpm = ReplayMemory(MEMORY_SIZE) # DQN的经验回放池 310 | # 根据parl框架构建agent 311 | model = Model(act_dim=action_dim) 312 | 313 | algorithm = DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE) 314 | agent = Agent( 315 | algorithm, 316 | obs_dim=obs_shape, 317 | act_dim=action_dim, 318 | e_greed=0.1, # 有一定概率随机选取动作,探索 319 | e_greed_decrement=1e-6) # 随着训练逐步收敛,探索的程度慢慢降低 320 | 321 | # 加载模型 322 | # save_path = './dqn_model.ckpt' 323 | # agent.restore(save_path) 324 | 325 | # 先往经验池里存一些数据,避免最开始训练的时候样本丰富度不够 326 | 327 | while len(rpm) < MEMORY_WARMUP_SIZE: 328 | run_episode(env, agent, rpm) 329 | 330 | max_episode = 20000000 331 | 332 | # 开始训练 333 | episode = 0 334 | 335 | ps = datetime.now() 336 | 337 | evmax = 0 338 | 339 | while episode < max_episode: # 训练max_episode个回合,test部分不计算入episode数量 340 | # train part 341 | #print("episode:", episode) 342 | start = datetime.now() 343 | for i in range(0, 100): 344 | total_reward = run_episode(env, agent, rpm) 345 | episode += 1 346 | end = datetime.now() 347 | # test part 348 | eval_reward = evaluate(agent) # render=True 查看显示效果 349 | logger.info('episode:{} time:{} e_greed:{} test_reward:{}'.format( 350 | episode, (end-start).seconds, agent.e_greed, eval_reward)) 351 | 352 | # 训练结束,保存模型 353 | if eval_reward > evmax: 354 | save_path = './modelconcat_' + str(episode) + '_' + str(eval_reward) + '.ckpt' 355 | agent.save(save_path) 356 | evmax = eval_reward 
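For readers who only want to replay a trained agent, here is a minimal inference-only sketch (not part of the original repository). It assumes the Model, DQN and Agent classes and the GAMMA / LEARNING_RATE constants from dqnconcat.py above are available (for example by copying them into a separate script), restores the shipped modelconcat_30900_157.6.ckpt checkpoint, and plays one greedy episode without any training:

game = FlappyBird()
env = PLE(game, fps=30, display_screen=True)
actionset = env.getActionSet()

model = Model(act_dim=len(actionset))
algorithm = DQN(model, act_dim=len(actionset), gamma=GAMMA, lr=LEARNING_RATE)
agent = Agent(algorithm, obs_dim=len(env.getGameState()), act_dim=len(actionset), e_greed=0.0)
agent.restore('./modelconcat_30900_157.6.ckpt')  # checkpoint shipped in this repo

env.init()
env.reset_game()
obs = list(env.getGameState().values())
last_obs = np.zeros_like(obs)   # the concat model also expects the previous frame
episode_reward = 0
while not env.game_over():
    action = agent.predict(last_obs, obs)          # greedy action, no exploration
    episode_reward += env.act(actionset[action])   # advance one frame, collect reward
    last_obs = obs
    obs = list(env.getGameState().values())
print('episode reward:', episode_reward)

The same pattern works for the dqn2.py / dqn3.py checkpoints, except that their Agent.predict takes a single obs argument.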
-------------------------------------------------------------------------------- /img/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ninetailskim/FlappyPaddle/ebdd19ce1d364f91517523e1dfe1a40dd8ff117a/img/1.png -------------------------------------------------------------------------------- /img/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ninetailskim/FlappyPaddle/ebdd19ce1d364f91517523e1dfe1a40dd8ff117a/img/2.png -------------------------------------------------------------------------------- /img/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ninetailskim/FlappyPaddle/ebdd19ce1d364f91517523e1dfe1a40dd8ff117a/img/3.png -------------------------------------------------------------------------------- /img/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ninetailskim/FlappyPaddle/ebdd19ce1d364f91517523e1dfe1a40dd8ff117a/img/4.png -------------------------------------------------------------------------------- /img/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ninetailskim/FlappyPaddle/ebdd19ce1d364f91517523e1dfe1a40dd8ff117a/img/5.png -------------------------------------------------------------------------------- /img/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ninetailskim/FlappyPaddle/ebdd19ce1d364f91517523e1dfe1a40dd8ff117a/img/6.png -------------------------------------------------------------------------------- /img/7.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ninetailskim/FlappyPaddle/ebdd19ce1d364f91517523e1dfe1a40dd8ff117a/img/7.gif -------------------------------------------------------------------------------- /model3_8400_1256.2.ckpt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ninetailskim/FlappyPaddle/ebdd19ce1d364f91517523e1dfe1a40dd8ff117a/model3_8400_1256.2.ckpt -------------------------------------------------------------------------------- /modelconcat_30900_157.6.ckpt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ninetailskim/FlappyPaddle/ebdd19ce1d364f91517523e1dfe1a40dd8ff117a/modelconcat_30900_157.6.ckpt -------------------------------------------------------------------------------- /modelsm_16400_95.8.ckpt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ninetailskim/FlappyPaddle/ebdd19ce1d364f91517523e1dfe1a40dd8ff117a/modelsm_16400_95.8.ckpt -------------------------------------------------------------------------------- /requirement.txt: -------------------------------------------------------------------------------- 1 | paddlepaddle==1.6.3 2 | parl==1.3.1 3 | gym 4 | opencv-python -------------------------------------------------------------------------------- /run3Agents.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.putenv('SDL_VIDEODRIVER', 'fbcon') 3 | os.environ["SDL_VIDEODRIVER"] = "dummy" 4 | 5 | from ple.games.flappybird import FlappyBird 6 | from ple import PLE 7 | import 
parl 8 | from parl import layers 9 | import paddle.fluid as fluid 10 | import copy 11 | import numpy as np 12 | import os 13 | import gym 14 | from parl.utils import logger 15 | from datetime import datetime 16 | import cv2 17 | 18 | LEARN_FREQ = 5 # 训练频率,不需要每一个step都learn,攒一些新增经验后再learn,提高效率 19 | MEMORY_SIZE = 200000 # replay memory的大小,越大越占用内存 20 | MEMORY_WARMUP_SIZE = 200 # replay_memory 里需要预存一些经验数据,再开启训练 21 | BATCH_SIZE = 32 # 每次给agent learn的数据数量,从replay memory随机里sample一批数据出来 22 | LEARNING_RATE = 0.001 # 学习率 23 | GAMMA = 0.99 24 | 25 | class fc3Model(parl.Model): 26 | def __init__(self, act_dim): 27 | hid0_size = 64 28 | hid1_size = 32 29 | hid2_size = 16 30 | # 3层全连接网络 31 | self.fc0 = layers.fc(size=hid0_size, act='relu', name="fc0") 32 | self.fc1 = layers.fc(size=hid1_size, act='relu', name="fc1") 33 | self.fc2 = layers.fc(size=hid2_size, act='relu', name="fc2") 34 | self.fc3 = layers.fc(size=act_dim, act=None, name="fc3") 35 | 36 | def value(self, obs): 37 | # 定义网络 38 | # 输入state,输出所有action对应的Q,[Q(s,a1), Q(s,a2), Q(s,a3)...] 39 | h0 = self.fc0(obs) 40 | h1 = self.fc1(h0) 41 | h2 = self.fc2(h1) 42 | Q = self.fc3(h2) 43 | return Q 44 | 45 | class fc2Model(parl.Model): 46 | def __init__(self, act_dim): 47 | hid1_size = 32 48 | hid2_size = 16 49 | self.fc1 = layers.fc(size=hid1_size, act='relu', name="sfc1") 50 | self.fc2 = layers.fc(size=hid2_size, act='relu', name="sfc2") 51 | self.fc3 = layers.fc(size=act_dim, act=None, name="sfc3") 52 | 53 | def value(self, obs): 54 | h1 = self.fc1(obs) 55 | h2 = self.fc2(h1) 56 | Q = self.fc3(h2) 57 | return Q 58 | 59 | class catModel(parl.Model): 60 | def __init__(self, act_dim): 61 | hid0_size = 64 62 | hid1_size = 32 63 | hid2_size = 16 64 | 65 | self.fc0 = layers.fc(size=hid0_size, act='relu', name="catfc0") 66 | self.fc1 = layers.fc(size=hid1_size, act='relu', name="catfc1") 67 | self.fc2 = layers.fc(size=hid2_size, act='relu', name="catfc2") 68 | self.fc3 = layers.fc(size=act_dim, act=None, name="catfc3") 69 | 70 | def value(self, last_obs, obs): 71 | oobs = fluid.layers.concat(input=[last_obs, obs], axis=-1, name='concat') 72 | 73 | h0 = self.fc0(oobs) 74 | h1 = self.fc1(h0) 75 | h2 = self.fc2(h1) 76 | Q = self.fc3(h2) 77 | return Q 78 | 79 | class fcDQN(parl.Algorithm): 80 | def __init__(self, model, act_dim=None, gamma=None, lr=None): 81 | """ DQN algorithm 82 | 83 | Args: 84 | model (parl.Model): 定义Q函数的前向网络结构 85 | act_dim (int): action空间的维度,即有几个action 86 | gamma (float): reward的衰减因子 87 | lr (float): learning rate 学习率. 88 | """ 89 | self.model = model 90 | self.target_model = copy.deepcopy(model) 91 | 92 | assert isinstance(act_dim, int) 93 | assert isinstance(gamma, float) 94 | assert isinstance(lr, float) 95 | self.act_dim = act_dim 96 | self.gamma = gamma 97 | self.lr = lr 98 | 99 | def predict(self, obs): 100 | """ 使用self.model的value网络来获取 [Q(s,a1),Q(s,a2),...] 
101 | """ 102 | return self.model.value(obs) 103 | 104 | def learn(self, obs, action, reward, next_obs, terminal): 105 | """ 使用DQN算法更新self.model的value网络 106 | """ 107 | # 从target_model中获取 max Q' 的值,用于计算target_Q 108 | next_pred_value = self.target_model.value(next_obs) 109 | best_v = layers.reduce_max(next_pred_value, dim=1) 110 | best_v.stop_gradient = True # 阻止梯度传递 111 | terminal = layers.cast(terminal, dtype='float32') 112 | target = reward + (1.0 - terminal) * self.gamma * best_v 113 | 114 | pred_value = self.model.value(obs) # 获取Q预测值 115 | # 将action转onehot向量,比如:3 => [0,0,0,1,0] 116 | action_onehot = layers.one_hot(action, self.act_dim) 117 | action_onehot = layers.cast(action_onehot, dtype='float32') 118 | # 下面一行是逐元素相乘,拿到action对应的 Q(s,a) 119 | # 比如:pred_value = [[2.3, 5.7, 1.2, 3.9, 1.4]], action_onehot = [[0,0,0,1,0]] 120 | # ==> pred_action_value = [[3.9]] 121 | pred_action_value = layers.reduce_sum( 122 | layers.elementwise_mul(action_onehot, pred_value), dim=1) 123 | 124 | # 计算 Q(s,a) 与 target_Q的均方差,得到loss 125 | cost = layers.square_error_cost(pred_action_value, target) 126 | cost = layers.reduce_mean(cost) 127 | optimizer = fluid.optimizer.Adam(learning_rate=self.lr) # 使用Adam优化器 128 | optimizer.minimize(cost) 129 | return cost 130 | 131 | def sync_target(self): 132 | """ 把 self.model 的模型参数值同步到 self.target_model 133 | """ 134 | self.model.sync_weights_to(self.target_model) 135 | 136 | class catDQN(parl.Algorithm): 137 | def __init__(self, model, act_dim=None, gamma=None, lr=None): 138 | """ DQN algorithm 139 | 140 | Args: 141 | model (parl.Model): 定义Q函数的前向网络结构 142 | act_dim (int): action空间的维度,即有几个action 143 | gamma (float): reward的衰减因子 144 | lr (float): learning rate 学习率. 145 | """ 146 | self.model = model 147 | self.target_model = copy.deepcopy(model) 148 | 149 | assert isinstance(act_dim, int) 150 | assert isinstance(gamma, float) 151 | assert isinstance(lr, float) 152 | self.act_dim = act_dim 153 | self.gamma = gamma 154 | self.lr = lr 155 | 156 | def predict(self, last, obs): 157 | """ 使用self.model的value网络来获取 [Q(s,a1),Q(s,a2),...] 
158 | """ 159 | return self.model.value(last, obs) 160 | 161 | def learn(self, last_obs, obs, action, reward, next_obs, terminal): 162 | """ 使用DQN算法更新self.model的value网络 163 | """ 164 | # 从target_model中获取 max Q' 的值,用于计算target_Q 165 | next_pred_value = self.target_model.value(obs, next_obs) 166 | best_v = layers.reduce_max(next_pred_value, dim=1) 167 | best_v.stop_gradient = True # 阻止梯度传递 168 | terminal = layers.cast(terminal, dtype='float32') 169 | target = reward + (1.0 - terminal) * self.gamma * best_v 170 | 171 | pred_value = self.model.value(last_obs, obs) # 获取Q预测值 172 | # 将action转onehot向量,比如:3 => [0,0,0,1,0] 173 | action_onehot = layers.one_hot(action, self.act_dim) 174 | action_onehot = layers.cast(action_onehot, dtype='float32') 175 | # 下面一行是逐元素相乘,拿到action对应的 Q(s,a) 176 | # 比如:pred_value = [[2.3, 5.7, 1.2, 3.9, 1.4]], action_onehot = [[0,0,0,1,0]] 177 | # ==> pred_action_value = [[3.9]] 178 | pred_action_value = layers.reduce_sum( 179 | layers.elementwise_mul(action_onehot, pred_value), dim=1) 180 | 181 | # 计算 Q(s,a) 与 target_Q的均方差,得到loss 182 | cost = layers.square_error_cost(pred_action_value, target) 183 | cost = layers.reduce_mean(cost) 184 | optimizer = fluid.optimizer.Adam(learning_rate=self.lr) # 使用Adam优化器 185 | optimizer.minimize(cost) 186 | return cost 187 | 188 | def sync_target(self): 189 | """ 把 self.model 的模型参数值同步到 self.target_model 190 | """ 191 | self.model.sync_weights_to(self.target_model) 192 | 193 | class fc3Agent(parl.Agent): 194 | def __init__(self, 195 | algorithm, 196 | obs_dim, 197 | act_dim, 198 | e_greed=0.1, 199 | e_greed_decrement=0): 200 | assert isinstance(obs_dim, int) 201 | assert isinstance(act_dim, int) 202 | self.obs_dim = obs_dim 203 | self.act_dim = act_dim 204 | super(fc3Agent, self).__init__(algorithm) 205 | 206 | self.global_step = 0 207 | self.update_target_steps = 200 # 每隔200个training steps再把model的参数复制到target_model中 208 | 209 | self.e_greed = e_greed # 有一定概率随机选取动作,探索 210 | self.e_greed_decrement = e_greed_decrement # 随着训练逐步收敛,探索的程度慢慢降低 211 | 212 | def build_program(self): 213 | self.pred_program = fluid.Program() 214 | self.learn_program = fluid.Program() 215 | 216 | with fluid.program_guard(self.pred_program): # 搭建计算图用于 预测动作,定义输入输出变量 217 | obs = layers.data( 218 | name='obs', shape=[self.obs_dim], dtype='float32') 219 | self.value = self.alg.predict(obs) 220 | 221 | with fluid.program_guard(self.learn_program): # 搭建计算图用于 更新Q网络,定义输入输出变量 222 | obs = layers.data( 223 | name='obs', shape=[self.obs_dim], dtype='float32') 224 | action = layers.data(name='act', shape=[1], dtype='int32') 225 | reward = layers.data(name='reward', shape=[], dtype='float32') 226 | next_obs = layers.data( 227 | name='next_obs', shape=[self.obs_dim], dtype='float32') 228 | terminal = layers.data(name='terminal', shape=[], dtype='bool') 229 | self.cost = self.alg.learn(obs, action, reward, next_obs, terminal) 230 | 231 | def sample(self, obs): 232 | sample = np.random.rand() # 产生0~1之间的小数 233 | if sample < self.e_greed: 234 | act = np.random.randint(self.act_dim) 235 | #act = 0 # 探索:每个动作都有概率被选择 236 | else: 237 | act = self.predict(obs) # 选择最优动作 238 | self.e_greed = max( 239 | 0.01, self.e_greed - self.e_greed_decrement) # 随着训练逐步收敛,探索的程度慢慢降低 240 | return act 241 | 242 | def predict(self, obs): # 选择最优动作 243 | obs = np.expand_dims(obs, axis=0) 244 | pred_Q = self.fluid_executor.run( 245 | self.pred_program, 246 | feed={'obs': obs.astype('float32')}, 247 | fetch_list=[self.value])[0] 248 | pred_Q = np.squeeze(pred_Q, axis=0) 249 | act = np.argmax(pred_Q) # 选择Q最大的下标,即对应的动作 250 | 
return act 251 | 252 | def learn(self, obs, act, reward, next_obs, terminal): 253 | # 每隔200个training steps同步一次model和target_model的参数 254 | if self.global_step % self.update_target_steps == 0: 255 | self.alg.sync_target() 256 | self.global_step += 1 257 | 258 | act = np.expand_dims(act, -1) 259 | feed = { 260 | 'obs': obs.astype('float32'), 261 | 'act': act.astype('int32'), 262 | 'reward': reward, 263 | 'next_obs': next_obs.astype('float32'), 264 | 'terminal': terminal 265 | } 266 | cost = self.fluid_executor.run( 267 | self.learn_program, feed=feed, fetch_list=[self.cost])[0] # 训练一次网络 268 | return cost 269 | 270 | class fc2Agent(parl.Agent): 271 | def __init__(self, 272 | algorithm, 273 | obs_dim, 274 | act_dim, 275 | e_greed=0.1, 276 | e_greed_decrement=0): 277 | assert isinstance(obs_dim, int) 278 | assert isinstance(act_dim, int) 279 | self.obs_dim = obs_dim 280 | self.act_dim = act_dim 281 | super(fc2Agent, self).__init__(algorithm) 282 | 283 | self.global_step = 0 284 | self.update_target_steps = 200 # 每隔200个training steps再把model的参数复制到target_model中 285 | 286 | self.e_greed = e_greed # 有一定概率随机选取动作,探索 287 | self.e_greed_decrement = e_greed_decrement # 随着训练逐步收敛,探索的程度慢慢降低 288 | 289 | def build_program(self): 290 | self.pred_program = fluid.Program() 291 | self.learn_program = fluid.Program() 292 | 293 | with fluid.program_guard(self.pred_program): # 搭建计算图用于 预测动作,定义输入输出变量 294 | obs = layers.data( 295 | name='obs', shape=[self.obs_dim], dtype='float32') 296 | self.value = self.alg.predict(obs) 297 | 298 | with fluid.program_guard(self.learn_program): # 搭建计算图用于 更新Q网络,定义输入输出变量 299 | obs = layers.data( 300 | name='obs', shape=[self.obs_dim], dtype='float32') 301 | action = layers.data(name='act', shape=[1], dtype='int32') 302 | reward = layers.data(name='reward', shape=[], dtype='float32') 303 | next_obs = layers.data( 304 | name='next_obs', shape=[self.obs_dim], dtype='float32') 305 | terminal = layers.data(name='terminal', shape=[], dtype='bool') 306 | self.cost = self.alg.learn(obs, action, reward, next_obs, terminal) 307 | 308 | def sample(self, obs): 309 | sample = np.random.rand() # 产生0~1之间的小数 310 | if sample < self.e_greed: 311 | act = np.random.randint(self.act_dim) 312 | #act = 0 # 探索:每个动作都有概率被选择 313 | else: 314 | act = self.predict(obs) # 选择最优动作 315 | self.e_greed = max( 316 | 0.01, self.e_greed - self.e_greed_decrement) # 随着训练逐步收敛,探索的程度慢慢降低 317 | return act 318 | 319 | def predict(self, obs): # 选择最优动作 320 | obs = np.expand_dims(obs, axis=0) 321 | pred_Q = self.fluid_executor.run( 322 | self.pred_program, 323 | feed={'obs': obs.astype('float32')}, 324 | fetch_list=[self.value])[0] 325 | pred_Q = np.squeeze(pred_Q, axis=0) 326 | act = np.argmax(pred_Q) # 选择Q最大的下标,即对应的动作 327 | return act 328 | 329 | def learn(self, obs, act, reward, next_obs, terminal): 330 | # 每隔200个training steps同步一次model和target_model的参数 331 | if self.global_step % self.update_target_steps == 0: 332 | self.alg.sync_target() 333 | self.global_step += 1 334 | 335 | act = np.expand_dims(act, -1) 336 | feed = { 337 | 'obs': obs.astype('float32'), 338 | 'act': act.astype('int32'), 339 | 'reward': reward, 340 | 'next_obs': next_obs.astype('float32'), 341 | 'terminal': terminal 342 | } 343 | cost = self.fluid_executor.run( 344 | self.learn_program, feed=feed, fetch_list=[self.cost])[0] # 训练一次网络 345 | return cost 346 | 347 | class catAgent(parl.Agent): 348 | def __init__(self, 349 | algorithm, 350 | obs_dim, 351 | act_dim, 352 | e_greed=0.1, 353 | e_greed_decrement=0): 354 | assert isinstance(obs_dim, int) 355 | assert 
isinstance(act_dim, int) 356 | self.obs_dim = obs_dim 357 | self.act_dim = act_dim 358 | super(catAgent, self).__init__(algorithm) 359 | 360 | self.global_step = 0 361 | self.update_target_steps = 200 # 每隔200个training steps再把model的参数复制到target_model中 362 | 363 | self.e_greed = e_greed # 有一定概率随机选取动作,探索 364 | self.e_greed_decrement = e_greed_decrement # 随着训练逐步收敛,探索的程度慢慢降低 365 | 366 | def build_program(self): 367 | self.pred_program = fluid.Program() 368 | self.learn_program = fluid.Program() 369 | 370 | with fluid.program_guard(self.pred_program): # 搭建计算图用于 预测动作,定义输入输出变量 371 | last_obs = layers.data( 372 | name='last_obs', shape=[self.obs_dim], dtype='float32') 373 | obs = layers.data( 374 | name='obs', shape=[self.obs_dim], dtype='float32') 375 | self.value = self.alg.predict(last_obs, obs) 376 | 377 | with fluid.program_guard(self.learn_program): # 搭建计算图用于 更新Q网络,定义输入输出变量 378 | last_obs = layers.data( 379 | name='last_obs', shape=[self.obs_dim], dtype='float32') 380 | obs = layers.data( 381 | name='obs', shape=[self.obs_dim], dtype='float32') 382 | action = layers.data(name='act', shape=[1], dtype='int32') 383 | reward = layers.data(name='reward', shape=[], dtype='float32') 384 | next_obs = layers.data( 385 | name='next_obs', shape=[self.obs_dim], dtype='float32') 386 | terminal = layers.data(name='terminal', shape=[], dtype='bool') 387 | self.cost = self.alg.learn(last_obs, obs, action, reward, next_obs, terminal) 388 | 389 | def sample(self, last_obs, obs): 390 | sample = np.random.rand() # 产生0~1之间的小数 391 | if sample < self.e_greed: 392 | act = np.random.randint(self.act_dim) 393 | #act = 0 # 探索:每个动作都有概率被选择 394 | else: 395 | act = self.predict(last_obs, obs) # 选择最优动作 396 | self.e_greed = max( 397 | 0.01, self.e_greed - self.e_greed_decrement) # 随着训练逐步收敛,探索的程度慢慢降低 398 | return act 399 | 400 | def predict(self, last_obs, obs): # 选择最优动作 401 | obs = np.expand_dims(obs, axis=0) 402 | last_obs = np.expand_dims(last_obs, axis=0) 403 | pred_Q = self.fluid_executor.run( 404 | self.pred_program, 405 | feed={ 406 | 'obs': obs.astype('float32'), 407 | 'last_obs': last_obs.astype('float32') 408 | }, 409 | fetch_list=[self.value])[0] 410 | pred_Q = np.squeeze(pred_Q, axis=0) 411 | act = np.argmax(pred_Q) # 选择Q最大的下标,即对应的动作 412 | return act 413 | 414 | def learn(self, last_obs, obs, act, reward, next_obs, terminal): 415 | # 每隔200个training steps同步一次model和target_model的参数 416 | if self.global_step % self.update_target_steps == 0: 417 | self.alg.sync_target() 418 | self.global_step += 1 419 | 420 | act = np.expand_dims(act, -1) 421 | feed = { 422 | 'last_obs': last_obs.astype('float32'), 423 | 'obs': obs.astype('float32'), 424 | 'act': act.astype('int32'), 425 | 'reward': reward, 426 | 'next_obs': next_obs.astype('float32'), 427 | 'terminal': terminal 428 | } 429 | cost = self.fluid_executor.run( 430 | self.learn_program, feed=feed, fetch_list=[self.cost])[0] # 训练一次网络 431 | return cost 432 | 433 | import random 434 | import collections 435 | import numpy as np 436 | 437 | totale = 0 438 | 439 | import sys 440 | videoname = sys.argv[1] 441 | 442 | # 评估 agent, 跑 5 个episode,总reward求平均 443 | def evaluate(agent1, agent2, agent3): 444 | input("开始比赛") 445 | fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v') 446 | 447 | 448 | frame_number = 0 449 | env = PLE(game, fps=30, display_screen=True) 450 | actionset = env.getActionSet() 451 | eval_reward = [] 452 | 453 | for i in range(5): 454 | output_movie = cv2.VideoWriter(videoname+'_'+str(i)+'.mp4', fourcc, 20, (288, 512)) 455 | env.init() 456 | env.reset_game() 457 | 
dstate = env.getGameState()
458 |         # print(dstate)
459 |         obs = list(dstate.values())
460 | 
461 |         last_obs = np.zeros_like(obs[0:8])  # each bird contributes 8 state features
462 |         episode_reward = 0
463 |         while True:
464 |             obs1 = obs[0:8]
465 |             obs2 = obs[8:16]
466 |             obs3 = obs[16:24]
467 |             action1 = agent1.predict(obs1)
468 |             action2 = agent2.predict(obs2)
469 |             action3 = agent3.predict(last_obs, obs3)
470 | 
471 |             # combine the three flap decisions into one bitmask action
472 |             finalaction = 0
473 |             if action1 == 0:
474 |                 finalaction += 1
475 |             if action2 == 0:
476 |                 finalaction += 2
477 |             if action3 == 0:
478 |                 finalaction += 4
479 |             # print("action1: ", action1)
480 |             # print("action2: ", action2)
481 |             # print("action3: ", action3)
482 |             # print("action: ", finalaction)
483 |             # print(obs)
484 |             # print(obs1)
485 |             # print(obs2)
486 |             # print(obs3)
487 |             if finalaction == 0:
488 |                 finalaction = None  # no bird flaps this frame
489 |             score = env.score()
490 | 
491 |             observation = env.getScreenRGB()
492 |             observation = cv2.transpose(observation)
493 |             font = cv2.FONT_HERSHEY_SIMPLEX
494 |             observation = cv2.putText(observation, str(int(score)), (0, 25), font, 1.2, (255, 255, 255), 2)
495 |             # write the frame at the size the VideoWriter was opened with (288x512);
496 |             # the enlarged copy is only for the on-screen preview
497 |             output_movie.write(observation)
498 |             ss = observation.shape
499 |             preview = cv2.resize(observation, (ss[1] * 2, ss[0] * 2))
500 |             cv2.imshow("ss", preview)
501 |             cv2.waitKey(30)  # ~30 ms per frame keeps playback close to real time
502 | 
503 |             reward = env.act(finalaction)
504 |             last_obs = obs3
505 |             dstate = env.getGameState()
506 |             # print(dstate)
507 |             obs = list(dstate.values())
508 |             done = env.game_over()
509 |             episode_reward += reward
510 |             if done:
511 |                 break
512 |             # input()
513 |         eval_reward.append(episode_reward)
514 |         output_movie.release()  # finalize this episode's video before the next one starts
515 |     cv2.destroyAllWindows()
516 |     input()  # wait for Enter before returning
517 |     return np.mean(eval_reward)
518 | 
519 | 
520 | game = FlappyBird()
521 | env = PLE(game, fps=30, display_screen=False)
522 | action_dim = 2  # every single-bird agent chooses between flap and no-op
523 | obs_shape = len(env.getGameState()) // 3  # per-bird state size
524 | 
525 | print(env.getActionSet())
526 | 
527 | print(obs_shape)
528 | 
529 | # build the three agents with the PARL framework
530 | fc3model = fc3Model(act_dim=action_dim)
531 | fc3algorithm = fcDQN(fc3model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
532 | fc3agent = fc3Agent(fc3algorithm, obs_dim=obs_shape, act_dim=action_dim, e_greed=0.1, e_greed_decrement=1e-6)
533 | 
534 | fc2model = fc2Model(act_dim=action_dim)
535 | fc2algorithm = fcDQN(fc2model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
536 | fc2agent = fc2Agent(fc2algorithm, obs_dim=obs_shape, act_dim=action_dim, e_greed=0.1, e_greed_decrement=1e-6)
537 | 
538 | catmodel = catModel(act_dim=action_dim)
539 | catalgorithm = catDQN(catmodel, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
540 | catagent = catAgent(catalgorithm, obs_dim=obs_shape, act_dim=action_dim, e_greed=0.1, e_greed_decrement=1e-6)
541 | 
542 | # restore the trained checkpoints
543 | save_path = './model3_8400_1256.2.ckpt'
544 | fc3agent.restore(save_path)
545 | 
546 | save_path = './modelsm_16400_95.8.ckpt'
547 | fc2agent.restore(save_path)
548 | 
549 | save_path = './modelconcat_30900_157.6.ckpt'
550 | catagent.restore(save_path)
551 | 
552 | eval_reward = evaluate(fc3agent, fc2agent, catagent)  # run the three-agent match with rendering
553 | #logger.info('episode:{} time:{} e_greed:{} test_reward:{}'.format(1, (end-start).seconds, agent.e_greed, eval_reward))
--------------------------------------------------------------------------------
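
run3Agents.py merges the three per-bird decisions into a single integer action: action index 0 means "flap" for each single-bird agent, and the evaluate() loop sets bit 0 for the fc3 agent, bit 1 for the fc2 agent and bit 2 for the concat agent, passing None when nobody flaps. The sketch below restates that encoding; the decode helper is a hypothetical illustration of how a modified three-bird game could read the mask, not code from this repository.

# encode(): mirrors the evaluate() loop above (action 0 == "flap" for each agent)
def encode(action1, action2, action3):
    mask = 0
    if action1 == 0:
        mask += 1   # bit 0: fc3 agent's bird flaps
    if action2 == 0:
        mask += 2   # bit 1: fc2 agent's bird flaps
    if action3 == 0:
        mask += 4   # bit 2: concat agent's bird flaps
    return None if mask == 0 else mask  # the script passes None when no bird flaps

# decode(): hypothetical helper showing how a three-bird game could unpack the mask
def decode(mask):
    mask = 0 if mask is None else mask
    return [bool((mask >> bird) & 1) for bird in range(3)]

assert encode(0, 1, 0) == 5                 # birds 1 and 3 flap -> bits 0 and 2
assert decode(5) == [True, False, True]
assert encode(1, 1, 1) is None              # nobody flaps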