├── .gitignore ├── README.md ├── better_snake ├── models │ └── snake-basterd.zip ├── old_env │ ├── apple.py │ ├── environment.py │ └── snake.py ├── old_env_2 │ ├── cube.py │ ├── environment.py │ ├── self_play.py │ └── snake.py └── ppo.py ├── flappyb ├── dqn_rainbow.py ├── dqn_v2.py ├── dqn_v3.py ├── environment │ ├── assets │ │ ├── Pong-653x400.png │ │ ├── all_fonts_script.py │ │ ├── bg.png │ │ ├── bird.png │ │ ├── pipe.png │ │ ├── pipe_long.png │ │ └── sapcraft.jpg │ ├── bird.py │ ├── environment.py │ └── pipe.py ├── lib │ ├── common.py │ ├── dqn_model.py │ ├── dqn_rainbow.py │ └── ppo_model.py ├── models │ ├── cross_entropy │ │ └── batchsize=100-hiddensize=256-lr=0.01-gamma=.9-PART=240.pt │ ├── dqn │ │ ├── dqn-expdecay=0.99995-gamma=.9-batchsize=20-nn=512-lr=0.001-normalization-LOADED=HARDCORE-6300-lrMax=0.4-nextPipe-HELL-PART=1000.h5 │ │ ├── dqn-expdecay=0.99995-gamma=.9-batchsize=20-nn=512-lr=0.001-normalization-PART=6650.h5 │ │ └── dqn-expdecay=0.999995-gamma=.9-batchsize=20-nn=512-lr=0.001-normalization-HARDCORE-PART=6300.h5 │ ├── flappyb-test-the-rainbow254 │ ├── flappyb-test-the-rainbow350 │ └── flappyb-test-the-rainbow87 ├── play_dqn_rainbow.py ├── play_ppo.py ├── play_self.py ├── ppo.py └── saves │ └── ppo-test-flappyb │ ├── best_+10.400_555000.dat │ ├── best_+11.270_556000.dat │ ├── best_+131.310_576000.dat │ ├── best_+20.470_558000.dat │ ├── best_+4.650_165000.dat │ ├── best_+4.860_370000.dat │ ├── best_+44.070_560000.dat │ ├── best_+44.560_561000.dat │ ├── best_+5.290_475000.dat │ ├── best_+5.530_495000.dat │ ├── best_+5.740_516000.dat │ ├── best_+5.820_538000.dat │ ├── best_+56.790_570000.dat │ ├── best_+6.250_539000.dat │ ├── best_+6.820_542000.dat │ ├── best_+7.200_547000.dat │ └── best_+8.690_550000.dat ├── old_agents ├── cross_entropy.py ├── cross_entropy_advanced.py ├── dqn_snake_v2.py ├── q_iteration.py ├── q_learning.py └── value_iteration.py ├── requirements.txt ├── runTensorBoard └── snake ├── base_ppo.py ├── env_new ├── cube.py ├── environment.py ├── self_play.py └── snake.py ├── environment ├── apple.py ├── environment.py └── snake.py ├── lib ├── common.py ├── dqn_model.py ├── dqn_rainbow.py └── ppo_model.py ├── play_ppo.py ├── ppo.py └── self_play.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.gitignore.io/api/python,pycharm,sublimetext 2 | # Edit at https://www.gitignore.io/?templates=python,pycharm,sublimetext 3 | 4 | ### PyCharm ### 5 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 6 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 7 | 8 | # User-specific stuff 9 | .idea/**/workspace.xml 10 | .idea/**/tasks.xml 11 | .idea/**/usage.statistics.xml 12 | .idea/**/dictionaries 13 | .idea/**/shelf 14 | 15 | # Generated files 16 | .idea/**/contentModel.xml 17 | 18 | # Sensitive or high-churn files 19 | .idea/**/dataSources/ 20 | .idea/**/dataSources.ids 21 | .idea/**/dataSources.local.xml 22 | .idea/**/sqlDataSources.xml 23 | .idea/**/dynamic.xml 24 | .idea/**/uiDesigner.xml 25 | .idea/**/dbnavigator.xml 26 | 27 | # Gradle 28 | .idea/**/gradle.xml 29 | .idea/**/libraries 30 | 31 | # Gradle and Maven with auto-import 32 | # When using Gradle or Maven with auto-import, you should exclude module files, 33 | # since they will be recreated, and may cause churn. Uncomment if using 34 | # auto-import. 
35 | # .idea/modules.xml 36 | # .idea/*.iml 37 | # .idea/modules 38 | 39 | # CMake 40 | cmake-build-*/ 41 | 42 | # Mongo Explorer plugin 43 | .idea/**/mongoSettings.xml 44 | 45 | # File-based project format 46 | *.iws 47 | 48 | # IntelliJ 49 | out/ 50 | 51 | # mpeltonen/sbt-idea plugin 52 | .idea_modules/ 53 | 54 | # JIRA plugin 55 | atlassian-ide-plugin.xml 56 | 57 | # Cursive Clojure plugin 58 | .idea/replstate.xml 59 | 60 | # Crashlytics plugin (for Android Studio and IntelliJ) 61 | com_crashlytics_export_strings.xml 62 | crashlytics.properties 63 | crashlytics-build.properties 64 | fabric.properties 65 | 66 | # Editor-based Rest Client 67 | .idea/httpRequests 68 | 69 | # Android studio 3.1+ serialized cache file 70 | .idea/caches/build_file_checksums.ser 71 | 72 | # JetBrains templates 73 | **___jb_tmp___ 74 | 75 | ### PyCharm Patch ### 76 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 77 | 78 | # *.iml 79 | # modules.xml 80 | # .idea/misc.xml 81 | # *.ipr 82 | 83 | # Sonarlint plugin 84 | .idea/sonarlint 85 | 86 | ### Python ### 87 | # Byte-compiled / optimized / DLL files 88 | __pycache__/ 89 | *.py[cod] 90 | *$py.class 91 | 92 | # C extensions 93 | *.so 94 | 95 | # Distribution / packaging 96 | .Python 97 | build/ 98 | develop-eggs/ 99 | dist/ 100 | downloads/ 101 | eggs/ 102 | .eggs/ 103 | lib64/ 104 | parts/ 105 | sdist/ 106 | var/ 107 | wheels/ 108 | pip-wheel-metadata/ 109 | share/python-wheels/ 110 | *.egg-info/ 111 | .installed.cfg 112 | *.egg 113 | MANIFEST 114 | 115 | # PyInstaller 116 | # Usually these files are written by a python script from a template 117 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 118 | *.manifest 119 | *.spec 120 | 121 | # Installer logs 122 | pip-log.txt 123 | pip-delete-this-directory.txt 124 | 125 | # Unit test / coverage reports 126 | htmlcov/ 127 | .tox/ 128 | .nox/ 129 | .coverage 130 | .coverage.* 131 | .cache 132 | nosetests.xml 133 | coverage.xml 134 | *.cover 135 | .hypothesis/ 136 | .pytest_cache/ 137 | 138 | # Translations 139 | *.mo 140 | *.pot 141 | 142 | # Django stuff: 143 | *.log 144 | local_settings.py 145 | db.sqlite3 146 | 147 | # Flask stuff: 148 | instance/ 149 | .webassets-cache 150 | 151 | # Scrapy stuff: 152 | .scrapy 153 | 154 | # Sphinx documentation 155 | docs/_build/ 156 | 157 | # PyBuilder 158 | target/ 159 | 160 | # Jupyter Notebook 161 | .ipynb_checkpoints 162 | 163 | # IPython 164 | profile_default/ 165 | ipython_config.py 166 | 167 | # pyenv 168 | .python-version 169 | 170 | # pipenv 171 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 172 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 173 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 174 | # install all needed dependencies. 
175 | #Pipfile.lock 176 | 177 | # celery beat schedule file 178 | celerybeat-schedule 179 | 180 | # SageMath parsed files 181 | *.sage.py 182 | 183 | # Environments 184 | .env 185 | .venv 186 | env/ 187 | venv/ 188 | ENV/ 189 | env.bak/ 190 | venv.bak/ 191 | 192 | # Spyder project settings 193 | .spyderproject 194 | .spyproject 195 | 196 | # Rope project settings 197 | .ropeproject 198 | 199 | # mkdocs documentation 200 | /site 201 | 202 | # mypy 203 | .mypy_cache/ 204 | .dmypy.json 205 | dmypy.json 206 | 207 | # Pyre type checker 208 | .pyre/ 209 | 210 | ### SublimeText ### 211 | # Cache files for Sublime Text 212 | *.tmlanguage.cache 213 | *.tmPreferences.cache 214 | *.stTheme.cache 215 | 216 | # Workspace files are user-specific 217 | *.sublime-workspace 218 | 219 | # Project files should be checked into the repository, unless a significant 220 | # proportion of contributors will probably not be using Sublime Text 221 | # *.sublime-project 222 | 223 | # SFTP configuration file 224 | sftp-config.json 225 | 226 | # Package control specific files 227 | Package Control.last-run 228 | Package Control.ca-list 229 | Package Control.ca-bundle 230 | Package Control.system-ca-bundle 231 | Package Control.cache/ 232 | Package Control.ca-certs/ 233 | Package Control.merged-ca-bundle 234 | Package Control.user-ca-bundle 235 | oscrypto-ca-bundle.crt 236 | bh_unicode_properties.cache 237 | 238 | # Sublime-github package stores a github token in this file 239 | # https://packagecontrol.io/packages/sublime-github 240 | GitHub.sublime-settings 241 | 242 | # End of https://www.gitignore.io/api/python,pycharm,sublimetext 243 | 244 | # Custom 245 | .runs/** 246 | .idea/** 247 | runs/ 248 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reinforcement Learning on games 2 | 3 | The algorithms were implemented using the book "Deep Reinforcement Learning Hands-On" by Maxim Lapan. 4 | He provides a GitHub repo with multiple implementations, which can be found here: 5 | https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On 6 | 7 | ## Project structure 8 | 9 | The root folder consists of: 10 | 1. Different games; every game has an RL algorithm, models and graphs 11 | 2. requirements.txt (probably not up to date) 12 | 3. runTensorBoard [dir], runs TensorBoard on a chosen directory 13 | 4. old_agents, implementations of weaker RL algorithms 14 | 15 | When you want to try out a trained model, you have to set the LEARN flag in the agent file to False (see the example below). 16 | Different models are trained on different observations, so not every combination will work. 17 | The model's name indicates the settings used for its parameters. 18 | 19 | ## Current Algorithms 20 | 21 | * DQN, a simple DQN implementation with experience replay. This is currently the best algorithm in this repository. 22 | * PPO, the algorithm everyone currently uses. 23 | * Value iteration, a good starting point. 24 | * cross_entropy, another starting point. 25 | * others ...
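
For example, to watch one of the pretrained FlappyB DQN agents, flip the switches at the top of the agent file (the names below are taken from `flappyb/dqn_v2.py`; other agent files follow the same pattern):

```python
# flappyb/dqn_v2.py -- switches at the top of the file
LEARN = False   # False: load a trained model and play, True: train from scratch
LOAD_NAME = 'dqn-expdecay=0.99995-gamma=.9-batchsize=20-nn=512-lr=0.001-normalization-PART=6650'
```

Then start the agent from inside its game folder, e.g. `cd flappyb && python dqn_v2.py`. Training runs write their TensorBoard logs to `runs/` by default, which you can inspect with `tensorboard --logdir runs` or the `runTensorBoard` helper.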
26 | -------------------------------------------------------------------------------- /better_snake/models/snake-basterd.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/better_snake/models/snake-basterd.zip -------------------------------------------------------------------------------- /better_snake/old_env/apple.py: -------------------------------------------------------------------------------- 1 | 2 | # https://www.youtube.com/watch?v=AaGK-fj-BAM&t=630s 3 | import pygame 4 | import random 5 | 6 | 7 | class Apple: 8 | 9 | def __init__(self, screen, s_width, s_height, color, scale): 10 | 11 | self.screen = screen 12 | self.s_width = s_width 13 | self.s_height = s_height 14 | self.color = color 15 | self.scale = scale 16 | 17 | self.place_apple(None) 18 | 19 | def draw(self): 20 | rect = pygame.rect.Rect(self.x, self.y, self.scale, self.scale) 21 | pygame.draw.rect(self.screen, self.color, rect) 22 | 23 | def eat(self, snake_x, snake_y, tail): 24 | if self.x == snake_x and self.y == snake_y: 25 | self.place_apple(tail) 26 | return True 27 | return False 28 | 29 | def place_apple(self, tail): 30 | 31 | cols = (self.s_width - self.scale) / self.scale 32 | rows = (self.s_height - self.scale) / self.scale 33 | 34 | rand_x = 0 35 | rand_y = 0 36 | 37 | bad_position = True 38 | 39 | if tail is None: 40 | bad_position = False 41 | rand_x = random.randint(0, cols) 42 | rand_y = random.randint(0, rows) 43 | 44 | while bad_position: 45 | bad_position = False 46 | 47 | rand_x = random.randint(0, cols) 48 | rand_y = random.randint(0, rows) 49 | 50 | for i in tail: 51 | if rand_x == int(i.x / self.scale) and rand_y == int(i.y / self.scale): 52 | bad_position = True 53 | break 54 | 55 | self.x = rand_x * self.scale 56 | self.y = rand_y * self.scale 57 | -------------------------------------------------------------------------------- /better_snake/old_env/environment.py: -------------------------------------------------------------------------------- 1 | # Game was made with the help of https://www.youtube.com/watch?v=cXgA1d_E-jY 2 | import gym 3 | import gym.spaces 4 | import time 5 | import pygame 6 | import random 7 | import enum 8 | 9 | import numpy as np 10 | 11 | from snakeenv.snake import Snake 12 | from snakeenv.apple import Apple 13 | 14 | # AI PARAMETERS ############################################################### 15 | BUFFER_SIZE = 2 16 | OBSERVATION_SIZE = 10 * 10 17 | ACTIONS = [0, 1, 2, 3] 18 | ACTION_SIZE = 4 19 | 20 | # GAME PARAMETERS ############################################################# 21 | SCALE = 60 22 | SCREEN_SIZE = WIDTH, HEIGHT = (600, 600) # for 5*5 go 300*300, 60 23 | # for 10*10 go 600*600, 60 24 | BACKGROUND = (72, 72, 72) 25 | SNAKE_COLOR = (57, 255, 20) 26 | APPPLE_COLOR = (255, 8, 0) 27 | FONT = 'dyuthi' 28 | 29 | """ Rewards 30 | 1. first apple +1 31 | 2. every next apple n+1 32 | 3. hit wall -1 33 | 4. ate self -2 34 | 5. does nothing 0.1 35 | """ 36 | """ Observations 37 | 1. apple +1 38 | 3. snake head = 0.5 39 | 4. every snake body -0.01 40 | 5. 
emtpy cell = -1 41 | """ 42 | """ 43 | Interace: 44 | reset(): resets the whole environment 45 | step(action): performs one action onto the environment 46 | step_buffer(action): performs one action onto the environment, 47 | returns 4 states for experience replay 48 | get_action_random(): obtain an imporoved random action 49 | get_observation_size(): obtain size of observation 50 | get_action_size(): obtain size of action 51 | """ 52 | 53 | 54 | class Actions(enum.Enum): 55 | Up = 0 56 | Right = 1 57 | Down = 2 58 | Left = 3 59 | 60 | 61 | class SnakeEnvironment(gym.Env): 62 | 63 | def __init__(self, draw=True, fps=100, debug=False, animation=False): 64 | 65 | super(SnakeEnvironment, self).__init__() 66 | self.observation_space = gym.spaces.Discrete(n=OBSERVATION_SIZE*BUFFER_SIZE) 67 | self.action_space = gym.spaces.Discrete(n=len(Actions)) 68 | 69 | if draw: 70 | pygame.init() 71 | pygame.display.set_caption('NN Snake') 72 | self.font_game_over = pygame.font.SysFont("ani", 72) 73 | 74 | self.draw = draw 75 | self.fps = fps 76 | self.debug = debug 77 | self.animation = animation 78 | self.screen = pygame.display.set_mode(SCREEN_SIZE) 79 | 80 | self.reward = 0 81 | self.score = 0 82 | self.is_done = False 83 | self.steps_without_apple = 0 84 | 85 | self.current_observation = None 86 | self.last_observation = None 87 | 88 | # ML INTERFACE ############################################################ 89 | def reset(self): 90 | """ Resets the whole environment. Must be called in the beginning. """ 91 | 92 | self.snake = Snake(self.screen, WIDTH, HEIGHT, SNAKE_COLOR, 93 | BACKGROUND, SCALE) 94 | self.apple = Apple(self.screen, WIDTH, HEIGHT, APPPLE_COLOR, SCALE) 95 | 96 | self.reward = 0 97 | self.score = 0 98 | self.is_done = False 99 | self.steps_without_apple = 0 100 | 101 | self.current_observation = None 102 | self.last_observation = None 103 | 104 | obs, reward, is_done, _ = self.step(1) 105 | 106 | if self.draw: 107 | self.countdown() 108 | 109 | return obs 110 | 111 | # The actual game step #################################################### 112 | def step(self, action): 113 | 114 | if isinstance(action, np.ndarray): 115 | idx = -1 116 | highest_idx = 0 117 | highest_val = -1 118 | for i in action: 119 | idx += 1 120 | if i > highest_val: 121 | highest_idx = idx 122 | highest_val = i 123 | action = highest_idx 124 | 125 | current_reward = 0 126 | 127 | self.snake.handle_events_ai(action) 128 | 129 | if self.apple.eat(self.snake.x, self.snake.y, self.snake.tail): 130 | self.snake.update(True) 131 | self.steps_without_apple = 0 132 | self.score += 1 133 | current_reward = 1 134 | # if self.score == 10: 135 | # current_reward = 1 136 | # else: 137 | # current_reward = self.score / 10 138 | else: 139 | self.snake.update(False) 140 | current_reward = 0.1 141 | self.steps_without_apple += 1 142 | # if self.steps_without_apple > 20: 143 | # current_reward = 0 144 | if self.steps_without_apple > 500: 145 | current_reward = -1 146 | self.game_over() 147 | 148 | if self.snake.check_if_hit_wall(): 149 | current_reward = -1 150 | self.game_over() 151 | 152 | if self.snake.check_if_ate_self(): 153 | current_reward = -1 154 | self.game_over() 155 | 156 | if self.draw: 157 | self.screen.fill(BACKGROUND) 158 | self.snake.draw() 159 | self.apple.draw() 160 | pygame.display.update() 161 | 162 | obs = self.get_observation_space() 163 | time.sleep(self.fps / 1000.0) 164 | 165 | return obs, current_reward, self.is_done, None 166 | 167 | def get_observation_space(self): 168 | 169 | new_obs = [] 170 | 171 | # 
create 2d matrix 172 | for i in range(int(WIDTH / SCALE)): 173 | new_obs.append([]) 174 | for j in range(int(WIDTH / SCALE)): 175 | new_obs[i].append(-1) 176 | 177 | # add apple 178 | x_apple = int(self.apple.x / SCALE) 179 | y_apple = int(self.apple.y / SCALE) 180 | new_obs[y_apple][x_apple] = 1 181 | 182 | # add snake 183 | x_snake = int(self.snake.x / SCALE) 184 | y_snake = int(self.snake.y / SCALE) 185 | new_obs[y_snake][x_snake] = 0.8 186 | 187 | # tail 188 | for i in self.snake.tail: 189 | x_snake = int(i.x / SCALE) 190 | y_snake = int(i.y / SCALE) 191 | new_obs[y_snake][x_snake] = 0.5 192 | 193 | current_obs = [] 194 | for i in new_obs: 195 | for j in i: 196 | current_obs.append(j) 197 | 198 | if self.draw and self.debug: 199 | for i in new_obs: 200 | print(i, '\n') 201 | print('\n') 202 | 203 | return_obs = np.array(current_obs) 204 | 205 | ####### 206 | # if self.last_observation == None: 207 | # self.last_observation = current_obs 208 | 209 | # return_obs = [] 210 | 211 | # for i in self.last_observation: 212 | # return_obs.append(i) 213 | # for i in current_obs: 214 | # return_obs.append(i) 215 | 216 | # return_obs = np.array(return_obs) 217 | 218 | # cnt = 0 219 | # for i in return_obs: 220 | # cnt += 1 221 | # print(' ', i, ' ', end='') 222 | # if cnt % 10 == 0: 223 | # print('') 224 | # if cnt % 100 == 0: 225 | # print('') 226 | # print('') 227 | # print('') 228 | 229 | # self.last_observation = current_obs 230 | ####### 231 | 232 | return return_obs 233 | 234 | def get_action_random(self): 235 | return random.randint(0, 3) 236 | 237 | # HUMAN STUFF ############################################################ 238 | 239 | def reset_human_game(self): 240 | """ Resets the whole environment. Must be called in the beginning. """ 241 | 242 | self.clock = pygame.time.Clock() 243 | self.time_elapsed_since_last_action = 0 244 | self.global_time = 0 245 | 246 | self.screen = pygame.display.set_mode(SCREEN_SIZE) 247 | self.snake = Snake(self.screen, WIDTH, HEIGHT, SNAKE_COLOR, 248 | BACKGROUND, SCALE) 249 | self.apple = Apple(self.screen, WIDTH, HEIGHT, APPPLE_COLOR, SCALE) 250 | 251 | self.reward = 0 252 | self.score = 0 253 | self.is_done = False 254 | self.steps_without_apple = 0 255 | 256 | self.current_observation = None 257 | self.last_observation = None 258 | 259 | if self.draw: 260 | self.countdown() 261 | 262 | def run_human_game(self): 263 | 264 | while not self.is_done: 265 | 266 | self.handle_events_human() 267 | self.snake.handle_events_human() 268 | 269 | if self.apple.eat(self.snake.x, self.snake.y, self.snake.tail): 270 | self.snake.update(True) 271 | else: 272 | self.snake.update(False) 273 | 274 | if self.snake.check_if_hit_wall(): 275 | self.game_over() 276 | 277 | if self.snake.check_if_ate_self(): 278 | self.game_over() 279 | 280 | if self.draw: 281 | self.screen.fill(BACKGROUND) 282 | self.snake.draw() 283 | self.apple.draw() 284 | pygame.display.update() 285 | 286 | time.sleep (self.fps / 1000.0); 287 | 288 | def handle_events_human(self): 289 | for event in pygame.event.get(): 290 | if event.type == pygame.QUIT: 291 | self.is_done = False 292 | pygame.quit() 293 | 294 | def countdown(self): 295 | if not self.animation: 296 | return 297 | for _ in range(3, 0, -1): 298 | self.screen.fill(BACKGROUND) 299 | self.snake.draw() 300 | self.apple.draw() 301 | text_start = pygame.font.SysFont(FONT, 80). 
\ 302 | render("Start in {}".format(_), True, (0, 0, 0)) 303 | self.screen.blit(text_start, 304 | (text_start.get_width() // 305 | 2, text_start.get_height() // 2)) 306 | pygame.display.flip() 307 | time.sleep(0.5) 308 | 309 | def game_over(self): 310 | self.is_done = True 311 | if not self.animation: 312 | return 313 | if self.draw: 314 | text = pygame.font.SysFont(FONT, 28).render( 315 | "Game Over!".format(self.reward), True, (0, 0, 0)) 316 | self.screen.blit(text, (320 - text.get_width() // 317 | 2, 240 - text.get_height() // 2)) 318 | pygame.display.flip() 319 | time.sleep(0.5) 320 | 321 | 322 | 323 | 324 | 325 | 326 | # if self.last_observation == None: 327 | # self.current_observation = current_obs 328 | 329 | # self.last_observation = self.current_observation 330 | # self.current_observation = current_obs 331 | 332 | # return_obs = [] 333 | 334 | # for i in self.last_observation: 335 | # return_obs.append(i) 336 | 337 | # for i in self.current_observation: 338 | # return_obs.append(i) 339 | 340 | # current_obs = np.array(current_obs) 341 | 342 | # for i in range(25): 343 | # if i%5==0: 344 | # print('') 345 | # print(' ' , self.last_observation[i] , ' ' , end='') 346 | 347 | # print('') 348 | # for i in range(25): 349 | # if i%5==0: 350 | # print('') 351 | # print(' ' ,self.current_observation[i], ' ' , end='') 352 | -------------------------------------------------------------------------------- /better_snake/old_env/snake.py: -------------------------------------------------------------------------------- 1 | # https://www.youtube.com/watch?v=AaGK-fj-BAM&t=630s 2 | import pygame 3 | 4 | 5 | class Snake: 6 | 7 | def __init__(self, screen, s_width, s_height, color, body_color, scale): 8 | 9 | self.screen = screen 10 | self.s_width = s_width 11 | self.s_height = s_height 12 | self.color = color 13 | self.body_color = body_color 14 | self.scale = scale 15 | 16 | self.scale = scale 17 | 18 | self.x = 2 * scale 19 | self.y = 2 * scale 20 | 21 | self.x_speed = 1 22 | self.y_speed = 0 23 | 24 | self.tail = [Vector(self.x, self.y)] 25 | 26 | def handle_events_human(self): 27 | keys = pygame.key.get_pressed() 28 | if keys[pygame.K_UP]: 29 | self.move(0, -1) 30 | if keys[pygame.K_RIGHT]: 31 | self.move(1, 0) 32 | if keys[pygame.K_DOWN]: 33 | self.move(0, 1) 34 | if keys[pygame.K_LEFT]: 35 | self.move(-1, 0) 36 | 37 | def handle_events_ai(self, action): 38 | # print(action) 39 | if action == 0: 40 | self.move(0, -1) 41 | if action == 1: 42 | self.move(1, 0) 43 | if action == 2: 44 | self.move(0, 1) 45 | if action == 3: 46 | self.move(-1, 0) 47 | 48 | def draw(self): 49 | 50 | for i in self.tail: 51 | rect = pygame.rect.Rect( 52 | i.x + 1, i.y + 1, self.scale - 2, self.scale - 2) 53 | pygame.draw.rect(self.screen, self.color, rect) 54 | rect = pygame.rect.Rect( 55 | i.x + 16, i.y + 16, self.scale - 32, self.scale - 32) 56 | pygame.draw.rect(self.screen, self.body_color, rect) 57 | 58 | rect = pygame.rect.Rect( 59 | self.x, self.y, self.scale, self.scale) 60 | pygame.draw.rect(self.screen, self.color, rect) 61 | 62 | def update(self, ate_apple): 63 | 64 | length = len(self.tail) 65 | 66 | if ate_apple: 67 | self.tail.append(Vector(self.x, self.y)) 68 | else: 69 | for i in range(length - 1): 70 | self.tail[i] = self.tail[i + 1] 71 | self.tail[length - 1] = Vector(self.x, self.y) 72 | 73 | self.x = self.x + self.x_speed * self.scale 74 | self.y = self.y + self.y_speed * self.scale 75 | 76 | if self.x < 0: 77 | self.x = 0 78 | if self.x > self.s_width - self.scale: 79 | self.x = self.s_width - 
self.scale 80 | if self.y < 0: 81 | self.y = 0 82 | if self.y > self.s_height - self.scale: 83 | self.y = self.s_height - self.scale 84 | 85 | def move(self, x, y): 86 | self.x_speed = x 87 | self.y_speed = y 88 | 89 | def check_if_hit_wall(self): 90 | if self.x == -1: 91 | return True 92 | if self.x == self.s_width: 93 | return True 94 | if self.y == -1: 95 | return True 96 | if self.y == self.s_height: 97 | return True 98 | 99 | def check_if_ate_self(self): 100 | for i in self.tail: 101 | if (self.x == i.x) and (self.y == i.y): 102 | return True 103 | 104 | 105 | class Vector: 106 | 107 | def __init__(self, x, y): 108 | self.x = x 109 | self.y = y 110 | -------------------------------------------------------------------------------- /better_snake/old_env_2/cube.py: -------------------------------------------------------------------------------- 1 | import pygame 2 | 3 | 4 | class Cube(object): 5 | 6 | def __init__(self, pos, rows, w, dirnx=1, dirny=0, color=(255, 0, 0)): 7 | 8 | self.pos = pos 9 | self.dirnx = dirnx 10 | self.dirny = dirny 11 | 12 | self.rows = rows 13 | self.w = w 14 | 15 | self.color = color 16 | 17 | def move(self, dirnx, dirny): 18 | self.dirnx = dirnx 19 | self.dirny = dirny 20 | self.pos = (self.pos[0] + self.dirnx, self.pos[1] + self.dirny) 21 | 22 | def draw(self, surface, eyes=False): 23 | dis = self.w // self.rows 24 | i = self.pos[0] 25 | j = self.pos[1] 26 | 27 | pygame.draw.rect(surface, self.color, (i*dis+1,j*dis+1, dis-2, dis-2)) 28 | if eyes: 29 | centre = dis//2 30 | radius = 3 31 | circleMiddle = (i*dis+centre-radius,j*dis+8) 32 | circleMiddle2 = (i*dis + dis -radius*2, j*dis+8) 33 | pygame.draw.circle(surface, (0,0,0), circleMiddle, radius) 34 | pygame.draw.circle(surface, (0,0,0), circleMiddle2, radius) -------------------------------------------------------------------------------- /better_snake/old_env_2/environment.py: -------------------------------------------------------------------------------- 1 | from environment.cube import Cube 2 | from environment.snake import Snake 3 | 4 | import gym 5 | import pygame 6 | 7 | import numpy as np 8 | import random 9 | import enum 10 | import time 11 | 12 | #import tkinker as tk 13 | #from tkinter import messagebox 14 | 15 | # snake obs 16 | # body = head 0.9, b[0] = 0.8, b[1] = 0.79 ... 17 | 18 | 19 | W = 500 20 | H = 500 21 | BUFFER_SIZE = 1 22 | 23 | 24 | class Actions(enum.Enum): 25 | Up = 0 26 | Right = 1 27 | Down = 2 28 | Left = 3 29 | 30 | 31 | class SnakeEnvironment(gym.Env): 32 | 33 | def __init__(self, draw=True, speed=10000, rows=20, animation=True): 34 | super(SnakeEnvironment, self).__init__() 35 | 36 | 37 | self.observation_space = gym.spaces.Box(low=-1, high=1, shape=(rows, rows), dtype=np.uint8) 38 | self.action_space = gym.spaces.Discrete(n=len(Actions)) 39 | 40 | self.draw = draw 41 | self.speed = speed 42 | self.rows = rows 43 | self.animation = animation 44 | 45 | self.snake = Snake((255, 0, 0), (2, 2), self.rows, W) 46 | self.snack = Cube(self.random_snack(), self.rows, W, color=(0, 255, 0)) 47 | 48 | self.is_done = False 49 | self.reward = 0 50 | self.step_without_apple = 0 51 | 52 | self.surf = pygame.display.set_mode((W, H)) 53 | self.clock = pygame.time.Clock() 54 | 55 | if draw: 56 | pygame.init() 57 | self.font_game_over = pygame.font.SysFont("ani", 72) 58 | 59 | """ Must alwasy be calles in the beginning. 
""" 60 | def reset(self): 61 | self.countdown() 62 | 63 | self.snake.reset((2, 2)) 64 | self.snack = Cube(self.random_snack(), self.rows, W, color=(0, 255, 0)) 65 | self.is_done = False 66 | self.reward = 0 67 | self.step_without_apple = 0 68 | 69 | self.surf = pygame.display.set_mode((W, H)) 70 | self.clock = pygame.time.Clock() 71 | 72 | obs, reward, is_done, _ = self.step(1) 73 | 74 | return obs 75 | 76 | def step(self, action): 77 | pygame.time.delay(50) # lower is faster 78 | self.clock.tick(self.speed) # lower is slower 79 | 80 | current_reward = 0 81 | 82 | self.snake.move_ai(action) 83 | # self.snake.move_human() 84 | 85 | if self.snake.ate_itself(): 86 | current_reward = -1 87 | self.game_over() 88 | 89 | self.step_without_apple += 1 90 | if self.step_without_apple == 250: 91 | self.game_over() 92 | 93 | if self.snake.body[0].pos == self.snack.pos: 94 | self.snake.add_cube() 95 | self.snack = Cube(self.random_snack(), self.rows, W, color=(0, 255, 0)) 96 | self.reward += 1 97 | current_reward = 1 98 | self.step_without_apple = 0 99 | 100 | self.redraw_window() 101 | 102 | obs = self.get_observation_space() 103 | 104 | return obs, current_reward, self.is_done, {} 105 | 106 | def get_observation_space(self): 107 | 108 | new_obs = [] 109 | 110 | # create 2d matrix 111 | for i in range(self.rows): 112 | new_obs.append([]) 113 | for j in range(self.rows): 114 | new_obs[i].append(-1) 115 | 116 | # add apple 117 | x_apple = self.snack.pos[0] 118 | y_apple = self.snack.pos[1] 119 | new_obs[y_apple][x_apple] = 1 120 | 121 | # tail 122 | for i, c in enumerate(self.snake.body): 123 | x_snake = c.pos[0] 124 | y_snake = c.pos[1] 125 | 126 | if x_snake == -1 or x_snake == self.rows: 127 | print('Wtf, this error occured!') 128 | self.game_over() 129 | return 130 | if y_snake == -1 or y_snake == self.rows: 131 | print('Wtf, this error occured!') 132 | self.game_over() 133 | return 134 | 135 | new_obs[y_snake][x_snake] = 0.5 136 | 137 | # add snake head 138 | x_snake = self.snake.head.pos[0] 139 | y_snake = self.snake.head.pos[1] 140 | if x_snake == -1 or x_snake == self.rows: 141 | print('Wtf, this error occured!') 142 | self.game_over() 143 | return 144 | if y_snake == -1 or y_snake == self.rows: 145 | print('Wtf, this error occured!') 146 | self.game_over() 147 | return 148 | new_obs[y_snake][x_snake] = 0.8 149 | 150 | # current_obs = [] 151 | # for i in new_obs: 152 | # for j in i: 153 | # current_obs.append(j) 154 | 155 | # cnt = 0 156 | # for i in current_obs: 157 | # cnt += 1 158 | # print(' ', i, ' ', end='') 159 | # if cnt % self.rows == 0: 160 | # print('') 161 | # print('') 162 | 163 | # return_obs = np.array(current_obs) 164 | 165 | # print(new_obs) 166 | 167 | # time.sleep(10) 168 | 169 | return new_obs 170 | 171 | def draw_grid(self): 172 | size_btwn = W // self.rows 173 | 174 | x = 0 175 | y = 0 176 | 177 | for i in range(self.rows): 178 | x = x + size_btwn 179 | y = y + size_btwn 180 | 181 | pygame.draw.line(self.surf, (255, 255, 255), (x, 0), (x, W)) 182 | pygame.draw.line(self.surf, (255, 255, 255), (0, y), (W, y)) 183 | 184 | def redraw_window(self): 185 | if not self.draw: 186 | return 187 | 188 | self.surf.fill((0, 0, 0)) 189 | self.draw_grid() 190 | self.snake.draw(self.surf) 191 | self.snack.draw(self.surf) 192 | 193 | pygame.display.update() 194 | 195 | def random_snack(self): 196 | positions = self.snake.body 197 | 198 | while True: 199 | x = random.randrange(self.rows) 200 | y = random.randrange(self.rows) 201 | if len(list(filter(lambda z:z.pos == (x,y), positions))) > 0: 
202 | continue 203 | else: 204 | break 205 | return (x,y) 206 | 207 | def countdown(self): 208 | if not self.draw or not self.animation: 209 | return 210 | for _ in range(3, 0, -1): 211 | self.write_text("Start in {}".format(_)) 212 | time.sleep(0.3) 213 | 214 | def game_over(self): 215 | self.is_done = True 216 | if not self.draw or not self.animation: 217 | return 218 | self.write_text("Score {}".format(self.reward)) 219 | time.sleep(1.5) 220 | 221 | def write_text(self, text): 222 | self.redraw_window() 223 | text_start = pygame.font.SysFont('dyuthi', 80). \ 224 | render(text, True, (255, 255, 255)) 225 | self.surf.blit(text_start, 226 | (text_start.get_width() // 227 | 2, text_start.get_height() // 2)) 228 | pygame.display.flip() 229 | 230 | def play_human(self): 231 | self.countdown() 232 | 233 | while(not self.is_done): 234 | pygame.time.delay(50) # lower is faster 235 | self.clock.tick(self.speed) # lower is slower 236 | 237 | self.snake.move_human() 238 | 239 | if self.snake.ate_itself(): 240 | self.game_over() 241 | 242 | if self.snake.body[0].pos == self.snack.pos: 243 | self.snake.add_cube() 244 | self.snack = Cube(self.random_snack(), self.rows, W, color=(0, 255, 0)) 245 | self.reward += 1 246 | 247 | self.redraw_window() 248 | self.get_observation_space() 249 | 250 | 251 | if __name__ == "__main__": 252 | env = SnakeEnvironment(draw=True, speed=100, rows=5) 253 | env.play_human() 254 | 255 | 256 | 257 | 258 | ####### 259 | # if self.last_observation == None: 260 | # self.last_observation = current_obs 261 | 262 | # return_obs = [] 263 | 264 | # for i in self.last_observation: 265 | # return_obs.append(i) 266 | # for i in current_obs: 267 | # return_obs.append(i) 268 | 269 | # return_obs = np.array(return_obs) 270 | 271 | # cnt = 0 272 | # for i in return_obs: 273 | # cnt += 1 274 | # print(' ', i, ' ', end='') 275 | # if cnt % 10 == 0: 276 | # print('') 277 | # if cnt % 100 == 0: 278 | # print('') 279 | # print('') 280 | # print('') 281 | 282 | # self.last_observation = current_obs 283 | ####### 284 | -------------------------------------------------------------------------------- /better_snake/old_env_2/self_play.py: -------------------------------------------------------------------------------- 1 | # from environment.environment import SnakeEnvironment 2 | 3 | env = SnakeEnvironment(draw=True, speed=100000, rows=5) 4 | 5 | env.reset() 6 | terminal = False 7 | 8 | while not terminal: 9 | action = random.randint(0, 4) 10 | next_state, reward, is_done, _ = env.step(action) 11 | terminal = is_done 12 | -------------------------------------------------------------------------------- /better_snake/old_env_2/snake.py: -------------------------------------------------------------------------------- 1 | from environment.cube import Cube 2 | 3 | import pygame 4 | 5 | 6 | class Snake(object): 7 | 8 | body = [] 9 | turns = {} 10 | 11 | def __init__(self, color, pos, rows, w): 12 | self.head = Cube(pos, rows, w) 13 | self.body.append(self.head) 14 | 15 | self.rows = rows 16 | self.w = w 17 | 18 | self.color = color 19 | 20 | self.dirnx = 0 21 | self.dirny = 0 22 | 23 | self.add_cube() 24 | self.add_cube() 25 | 26 | def move_ai(self, action): 27 | x = self.head.pos[0] 28 | y = self.head.pos[1] 29 | 30 | if y == 0 and action == 0: 31 | action = -1 32 | elif x == self.rows -1 and action == 1: 33 | action = -1 34 | elif y == self.rows -1 and action == 2: 35 | action = -1 36 | elif x == 0 and action == 3: 37 | action = -1 38 | 39 | if action == -1: 40 | pass 41 | elif action == 0: 42 | 
self.dirnx = 0 43 | self.dirny = -1 44 | self.turns[self.head.pos[:]] = [self.dirnx, self.dirny] 45 | elif action == 1: 46 | self.dirnx = 1 47 | self.dirny = 0 48 | self.turns[self.head.pos[:]] = [self.dirnx, self.dirny] 49 | elif action == 2: 50 | self.dirnx = 0 51 | self.dirny = 1 52 | self.turns[self.head.pos[:]] = [self.dirnx, self.dirny] 53 | elif action == 3: 54 | self.dirnx = -1 55 | self.dirny = 0 56 | self.turns[self.head.pos[:]] = [self.dirnx, self.dirny] 57 | 58 | for i, c in enumerate(self.body): 59 | p = c.pos[:] 60 | if p in self.turns: 61 | turn = self.turns[p] 62 | c.move(turn[0],turn[1]) 63 | if i == len(self.body)-1: 64 | self.turns.pop(p) 65 | else: 66 | if c.dirnx == -1 and c.pos[0] <= 0: c.pos = (c.rows-1, c.pos[1]) 67 | elif c.dirnx == 1 and c.pos[0] >= c.rows-1: c.pos = (0,c.pos[1]) 68 | elif c.dirny == 1 and c.pos[1] >= c.rows-1: c.pos = (c.pos[0], 0) 69 | elif c.dirny == -1 and c.pos[1] <= 0: c.pos = (c.pos[0],c.rows-1) 70 | else: c.move(c.dirnx,c.dirny) 71 | 72 | def move_human(self): 73 | for event in pygame.event.get(): 74 | if event.type == pygame.QUIT: 75 | pygame.quit() 76 | 77 | keys = pygame.key.get_pressed() 78 | for key in keys: 79 | if keys[pygame.K_UP]: 80 | self.dirnx = 0 81 | self.dirny = -1 82 | self.turns[self.head.pos[:]] = [self.dirnx, self.dirny] 83 | elif keys[pygame.K_RIGHT]: 84 | self.dirnx = 1 85 | self.dirny = 0 86 | self.turns[self.head.pos[:]] = [self.dirnx, self.dirny] 87 | elif keys[pygame.K_DOWN]: 88 | self.dirnx = 0 89 | self.dirny = 1 90 | self.turns[self.head.pos[:]] = [self.dirnx, self.dirny] 91 | elif keys[pygame.K_LEFT]: 92 | self.dirnx = -1 93 | self.dirny = 0 94 | self.turns[self.head.pos[:]] = [self.dirnx, self.dirny] 95 | 96 | for i, c in enumerate(self.body): 97 | p = c.pos[:] 98 | if p in self.turns: 99 | turn = self.turns[p] 100 | c.move(turn[0],turn[1]) 101 | if i == len(self.body)-1: 102 | self.turns.pop(p) 103 | else: 104 | if c.dirnx == -1 and c.pos[0] <= 0: c.pos = (c.rows-1, c.pos[1]) 105 | elif c.dirnx == 1 and c.pos[0] >= c.rows-1: c.pos = (0,c.pos[1]) 106 | elif c.dirny == 1 and c.pos[1] >= c.rows-1: c.pos = (c.pos[0], 0) 107 | elif c.dirny == -1 and c.pos[1] <= 0: c.pos = (c.pos[0],c.rows-1) 108 | else: c.move(c.dirnx,c.dirny) 109 | 110 | def ate_itself(self): 111 | head = True 112 | for i, c in enumerate(self.body): 113 | if self.head.pos == c.pos and not head: 114 | return True 115 | head = False 116 | 117 | def reset(self, pos): 118 | self.head = Cube(pos, self.rows, self.w) 119 | self.body = [] 120 | self.body.append(self.head) 121 | self.turns = {} 122 | self.dirnx = 0 123 | self.dirny = 1 124 | self.add_cube() 125 | self.add_cube() 126 | 127 | def add_cube(self): 128 | tail = self.body[-1] 129 | dx, dy = tail.dirnx, tail.dirny 130 | 131 | if dx == 1 and dy == 0: 132 | self.body.append(Cube((tail.pos[0] -1, tail.pos[1]), self.rows, self.w)) 133 | elif dx == -1 and dy == 0: 134 | self.body.append(Cube((tail.pos[0] +1, tail.pos[1]), self.rows, self.w)) 135 | elif dx == 0 and dy == 1: 136 | self.body.append(Cube((tail.pos[0], tail.pos[1] -1), self.rows, self.w)) 137 | elif dx == 0 and dy == -1: 138 | self.body.append(Cube((tail.pos[0], tail.pos[1] +1), self.rows, self.w)) 139 | 140 | self.body[-1].dirnx = dx 141 | self.body[-1].dirny = dy 142 | 143 | def draw(self, surface): 144 | for i, c in enumerate(self.body): 145 | if i == 0: 146 | c.draw(surface, True) 147 | else: 148 | c.draw(surface) 149 | -------------------------------------------------------------------------------- /better_snake/ppo.py: 
-------------------------------------------------------------------------------- 1 | import gym 2 | import gym_snake 3 | 4 | from stable_baselines.common.vec_env import SubprocVecEnv, DummyVecEnv 5 | from stable_baselines.common.policies import MlpPolicy 6 | from stable_baselines import PPO1 7 | 8 | 9 | if __name__ == "__main__": 10 | # env = gym.make('snake-v0') 11 | env = gym.make('snake-v0') 12 | # env = SubprocVecEnv([lambda: env]) 13 | env = DummyVecEnv([lambda: env]) 14 | 15 | model = PPO1(MlpPolicy, env, verbose=1) 16 | 17 | model.learn(total_timesteps=500000) 18 | model.save('models/snake-bastard') 19 | 20 | ############################################################################### 21 | 22 | # env = gym.make('snake-v0') 23 | # # env = DummyVecEnv([lambda: env]) 24 | 25 | # # model = PPO2(MlpPolicy, env, verbose=1) 26 | # # model.load('models/snake-basterd') 27 | 28 | # obs = env.reset() 29 | # is_done = False 30 | 31 | # while not is_done: 32 | # action, _states = model.predict(obs) 33 | # obs, rewards, terminal, info = env.step(action) 34 | # is_done = terminal 35 | # env.render() 36 | -------------------------------------------------------------------------------- /flappyb/dqn_rainbow.py: -------------------------------------------------------------------------------- 1 | from environment.environment import Environment 2 | import ptan 3 | import argparse 4 | import numpy as np 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | import torch.optim as optim 11 | 12 | from tensorboardX import SummaryWriter 13 | 14 | from lib import dqn_model 15 | from lib import common 16 | 17 | 18 | MODEL_NAME = "flappyb-test-the-rainbow" 19 | NUMBER_NEURONS = 512 20 | WRITE = False 21 | 22 | # n-step 23 | REWARD_STEPS = 2 24 | 25 | # priority replay 26 | PRIO_REPLAY_ALPHA = 0.6 27 | BETA_START = 0.4 28 | BETA_FRAMES = 100000 29 | 30 | # C51 31 | Vmax = 10 32 | Vmin = -10 33 | N_ATOMS = 51 34 | DELTA_Z = (Vmax - Vmin) / (N_ATOMS - 1) 35 | 36 | 37 | class RainbowDQN(nn.Module): 38 | def __init__(self, input_shape, n_actions): 39 | super(RainbowDQN, self).__init__() 40 | 41 | self.fc_val = nn.Sequential( 42 | dqn_model.NoisyLinear(input_shape, NUMBER_NEURONS), 43 | nn.ReLU(), 44 | dqn_model.NoisyLinear(NUMBER_NEURONS, N_ATOMS) 45 | ) 46 | 47 | self.fc_adv = nn.Sequential( 48 | dqn_model.NoisyLinear(input_shape, NUMBER_NEURONS), 49 | nn.ReLU(), 50 | dqn_model.NoisyLinear(NUMBER_NEURONS, n_actions * N_ATOMS) 51 | ) 52 | 53 | self.register_buffer("supports", torch.arange(Vmin, Vmax+DELTA_Z, DELTA_Z)) 54 | self.softmax = nn.Softmax(dim=1) 55 | 56 | def forward(self, x): 57 | batch_size = x.size()[0] 58 | fx = x.float() / NUMBER_NEURONS 59 | val_out = self.fc_val(fx).view(batch_size, 1, N_ATOMS) 60 | adv_out = self.fc_adv(fx).view(batch_size, -1, N_ATOMS) 61 | adv_mean = adv_out.mean(dim=1, keepdim=True) 62 | return val_out + (adv_out - adv_mean) 63 | 64 | def both(self, x): 65 | cat_out = self(x) 66 | probs = self.apply_softmax(cat_out) 67 | weights = probs * self.supports 68 | res = weights.sum(dim=2) 69 | return cat_out, res 70 | 71 | def qvals(self, x): 72 | return self.both(x)[1] 73 | 74 | def apply_softmax(self, t): 75 | return self.softmax(t.view(-1, N_ATOMS)).view(t.size()) 76 | 77 | 78 | def calc_loss(batch, batch_weights, net, tgt_net, gamma, device="cpu"): 79 | states, actions, rewards, dones, next_states = common.unpack_batch(batch) 80 | batch_size = len(batch) 81 | 82 | states_v = torch.tensor(states).to(device) 83 | actions_v = 
torch.tensor(actions).to(device) 84 | next_states_v = torch.tensor(next_states).to(device) 85 | batch_weights_v = torch.tensor(batch_weights).to(device) 86 | 87 | # next state distribution 88 | # dueling arch -- actions from main net, distr from tgt_net 89 | 90 | # calc at once both next and cur states 91 | distr_v, qvals_v = net.both(torch.cat((states_v, next_states_v))) 92 | next_qvals_v = qvals_v[batch_size:] 93 | distr_v = distr_v[:batch_size] 94 | 95 | next_actions_v = next_qvals_v.max(1)[1] 96 | next_distr_v = tgt_net(next_states_v) 97 | next_best_distr_v = next_distr_v[range(batch_size), next_actions_v.data] 98 | next_best_distr_v = tgt_net.apply_softmax(next_best_distr_v) 99 | next_best_distr = next_best_distr_v.data.cpu().numpy() 100 | 101 | dones = dones.astype(np.bool) 102 | 103 | # project our distribution using Bellman update 104 | proj_distr = common.distr_projection(next_best_distr, rewards, dones, Vmin, Vmax, N_ATOMS, gamma) 105 | 106 | # calculate net output 107 | state_action_values = distr_v[range(batch_size), actions_v.data] 108 | state_log_sm_v = F.log_softmax(state_action_values, dim=1) 109 | proj_distr_v = torch.tensor(proj_distr).to(device) 110 | 111 | loss_v = -state_log_sm_v * proj_distr_v 112 | loss_v = batch_weights_v * loss_v.sum(dim=1) 113 | return loss_v.mean(), loss_v + 1e-5 114 | 115 | 116 | if __name__ == "__main__": 117 | params = common.HYPERPARAMS['flappyb'] 118 | params['epsilon_frames'] *= 2 119 | parser = argparse.ArgumentParser() 120 | parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda") 121 | args = parser.parse_args() 122 | device = torch.device("cuda" if args.cuda else "cpu") 123 | 124 | env = Environment(draw=False, fps=1, debug=False, 125 | dist_to_pipe=50, dist_between_pipes=180, obs_this_pipe=True) 126 | writer = None 127 | if WRITE: 128 | writer = SummaryWriter(comment="-" + params['run_name'] + "-rainbow") 129 | net = RainbowDQN(env.observation_space.n, env.action_space.n).to(device) 130 | tgt_net = ptan.agent.TargetNet(net) 131 | agent = ptan.agent.DQNAgent(lambda x: net.qvals(x), ptan.actions.ArgmaxActionSelector(), device=device) 132 | 133 | exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=REWARD_STEPS) 134 | buffer = ptan.experience.PrioritizedReplayBuffer(exp_source, params['replay_size'], PRIO_REPLAY_ALPHA) 135 | optimizer = optim.Adam(net.parameters(), lr=params['learning_rate']) 136 | 137 | frame_idx = 0 138 | beta = BETA_START 139 | 140 | with common.RewardTracker(MODEL_NAME, net, writer, params['stop_reward']) as reward_tracker: 141 | while True: 142 | frame_idx += 1 143 | buffer.populate(1) 144 | beta = min(1.0, BETA_START + frame_idx * (1.0 - BETA_START) / BETA_FRAMES) 145 | 146 | new_rewards = exp_source.pop_total_rewards() 147 | if new_rewards: 148 | if reward_tracker.reward(new_rewards[0], frame_idx): 149 | break 150 | 151 | if len(buffer) < params['replay_initial']: 152 | continue 153 | 154 | optimizer.zero_grad() 155 | batch, batch_indices, batch_weights = buffer.sample(params['batch_size'], beta) 156 | loss_v, sample_prios_v = calc_loss(batch, batch_weights, net, tgt_net.target_model, 157 | params['gamma'] ** REWARD_STEPS, device=device) 158 | loss_v.backward() 159 | optimizer.step() 160 | buffer.update_priorities(batch_indices, sample_prios_v.data.cpu().numpy()) 161 | 162 | if frame_idx % params['target_net_sync'] == 0: 163 | tgt_net.sync() 164 | -------------------------------------------------------------------------------- 
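
The loss in dqn_rainbow.py above hands the categorical Bellman projection off to common.distr_projection in lib/common.py, which is not shown in this listing. For reference, here is a minimal NumPy sketch of that standard C51 projection step; the argument order mirrors the call site, but this is an illustration, not the repository's actual implementation.

import numpy as np

def distr_projection_sketch(next_distr, rewards, dones, v_min, v_max, n_atoms, gamma):
    """Standard C51 projection: map r + gamma * z back onto the fixed support
    [v_min, v_max] with n_atoms bins (sketch of what lib/common.py is expected to do)."""
    batch_size = len(rewards)
    delta_z = (v_max - v_min) / (n_atoms - 1)
    proj_distr = np.zeros((batch_size, n_atoms), dtype=np.float32)
    not_done = 1.0 - dones.astype(np.float32)        # terminal targets collapse onto the reward
    for atom in range(n_atoms):
        z_j = v_min + atom * delta_z
        # Bellman-updated value of this atom, clipped back into the support
        tz_j = np.clip(rewards + not_done * gamma * z_j, v_min, v_max)
        b_j = np.clip((tz_j - v_min) / delta_z, 0.0, n_atoms - 1)   # fractional bin index
        lower = np.floor(b_j).astype(np.int64)
        upper = np.ceil(b_j).astype(np.int64)
        hit = lower == upper                         # lands exactly on one bin
        proj_distr[hit, lower[hit]] += next_distr[hit, atom]
        split = ~hit                                 # otherwise share mass between neighbours
        proj_distr[split, lower[split]] += next_distr[split, atom] * (upper - b_j)[split]
        proj_distr[split, upper[split]] += next_distr[split, atom] * (b_j - lower)[split]
    return proj_distr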
/flappyb/dqn_v2.py: -------------------------------------------------------------------------------- 1 | # Made with the help of 2 | # https://towardsdatascience.com/cartpole-introduction-to-reinforcement-learning-ed0eb5b58288 3 | 4 | import random 5 | import numpy as np 6 | from collections import deque 7 | 8 | import keras 9 | from keras.models import Sequential 10 | from keras.layers import Dense 11 | from keras.optimizers import Adam 12 | from keras.models import load_model 13 | 14 | from environment.environment import Environment 15 | 16 | # from tensorboardX import SummaryWriter 17 | 18 | GAMMA = 0.9 # try .99 19 | LEARNING_RATE = 0.001 # default is 0.001 20 | LEARNING_WITH_DECAY = 0.01 21 | 22 | MEMORY_SIZE = 1000000 23 | BATCH_SIZE = 32 24 | 25 | EXPLORATION_MAX = 0.5 26 | EXPLORATION_MIN = 0.01 27 | EXPLORATION_DECAY = 0.99995 28 | 29 | # PARAMETERS ################################################################## 30 | LEARN = True # False if using a trained model 31 | 32 | NAME = 'dqn-loadedHARDCORE=6300-grav=2.5-dist-pipes=220-expdecay=0.99995-gamma=.9-batchsize=20-nn=512-lr=0.001-normalization-nextPipe' 33 | WRITE = False # Only for training 34 | DRAW = False # Only for training 35 | SAVE_MODEL = True # Only for training 36 | 37 | OBS_THIS_PIPE_PLAY = False # False for HELL, changes observation 38 | OBS_THIS_PIPE_LEARN = False 39 | DIFFICULTY_PLAY = 45 # 160 is easy, 70 is hardcore, 45 is hell 40 | DIFFICULTY_LEARN = 45 41 | 42 | DIST_BETWEEN_PIPES = 220 # default is 220 43 | 44 | # Here you can load trained models: 45 | 46 | # LOAD_NAME = 'dqn-expdecay=0.99995-gamma=.9-batchsize=20-nn=512-lr=0.001-normalization-PART=6650' # 950 is oke # 1050 is oke # 6650 My baby is back <3 # 2600 is pretty good # 6300 is god 47 | # LOAD_NAME = 'dqn-expdecay=0.999995-gamma=.9-batchsize=20-nn=512-lr=0.001-normalization-HARDCORE-PART=6300' 48 | LOAD_NAME = 'dqn-expdecay=0.99995-gamma=.9-batchsize=20-nn=512-lr=0.001-normalization-LOADED=HARDCORE-6300-lrMax=0.4-nextPipe-HELL-PART=1000' # NEXT_PIPE # 1000 is really good! 49 | ##################################################################################################### 50 | 51 | 52 | class DQNSolver: 53 | 54 | def __init__(self, observation_space, action_space, model=None): 55 | self.exploration_rate = EXPLORATION_MAX 56 | 57 | self.action_space = action_space 58 | self.memory = deque(maxlen=MEMORY_SIZE) 59 | 60 | if model is None: 61 | print('new model') 62 | self.model = Sequential() 63 | # andere aktivierungs funktion 64 | self.model.add(Dense(512, input_shape=( 65 | observation_space,), activation="relu")) 66 | self.model.add(Dense(512, activation="relu")) 67 | # self.model.add(Dropout(0.85)) 68 | # self.model.add(Dense(512, activation="relu")) 69 | # Linear sucks? 
maybe try softmax 70 | self.model.add(Dense(self.action_space, activation="linear")) 71 | self.model.compile(loss="mse", optimizer=Adam( 72 | lr=LEARNING_RATE)) # Try learning rate deacy 73 | # self.model.compile(loss="mse", optimizer=Adam(lr=LEARNING_WITH_DECAY, decay=1e-6)) 74 | else: 75 | print('saved model loaded') 76 | self.model = model 77 | 78 | def remember(self, state, action, reward, next_state, done): 79 | self.memory.append((state, action, reward, next_state, done)) 80 | 81 | def act(self, state, env): 82 | if np.random.rand() < self.exploration_rate: 83 | return env.get_action_random() 84 | q_values = self.model.predict(state) 85 | return np.argmax(q_values[0]) 86 | 87 | def act_free(self, state): 88 | q_values = self.model.predict(state) 89 | return np.argmax(q_values[0]) 90 | 91 | def experience_replay(self): 92 | if len(self.memory) < BATCH_SIZE: 93 | return 94 | batch = random.sample(self.memory, BATCH_SIZE) 95 | for state, action, reward, state_next, terminal in batch: 96 | q_update = reward 97 | if not terminal: 98 | q_update = (reward + GAMMA * 99 | np.amax(self.model.predict(state_next)[0])) 100 | q_values = self.model.predict(state) 101 | q_values[0][action] = q_update 102 | self.model.fit(state, q_values, verbose=0) 103 | self.exploration_rate *= EXPLORATION_DECAY 104 | self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate) 105 | 106 | 107 | def learn_flappyb(): 108 | env = Environment(draw=DRAW, fps=1, debug=False, 109 | dist_to_pipe=DIFFICULTY_LEARN, 110 | dist_between_pipes=DIST_BETWEEN_PIPES, 111 | obs_this_pipe=OBS_THIS_PIPE_LEARN) 112 | writer = None 113 | if WRITE: 114 | writer = SummaryWriter(comment=NAME) 115 | observation_space = env.get_observation_size_buffer() 116 | action_space = env.get_action_size() 117 | 118 | model = load_model('models/dqn/{}.h5'.format(LOAD_NAME)) 119 | dqn_solver = DQNSolver(observation_space, action_space, model) 120 | run = 0 121 | 122 | if SAVE_MODEL: 123 | name = '{}-PART={}'.format(NAME, run) 124 | dqn_solver.model.save('models/dqn/{}.h5'.format(name)) 125 | while True: 126 | run += 1 127 | state = env.reset() 128 | state = np.reshape(state, [1, observation_space]) 129 | step = 0 130 | reward_score = 0 131 | 132 | while True: 133 | step += 1 134 | action = dqn_solver.act(state, env) 135 | state_next, reward, terminal, info = env.step_buffer(action) 136 | reward_score += reward 137 | state_next = np.reshape(state_next, [1, observation_space]) 138 | dqn_solver.remember(state, action, reward, state_next, terminal) 139 | state = state_next 140 | if terminal: 141 | print("Run: " + str(run) + ", exploration: " + 142 | str(dqn_solver.exploration_rate) + ", score: " + 143 | str(reward_score)) 144 | if WRITE: 145 | writer.add_scalar("reward", reward_score, run) 146 | break 147 | dqn_solver.experience_replay() 148 | if (run % 100 == 0) and SAVE_MODEL: 149 | name = '{}-PART={}'.format(NAME, run) 150 | dqn_solver.model.save('models/dqn/{}.h5'.format(name)) 151 | if WRITE: 152 | writer.close() 153 | 154 | 155 | def play_flappyb(): 156 | env = Environment(draw=True, fps=1, debug=True, 157 | dist_to_pipe=DIFFICULTY_PLAY, 158 | dist_between_pipes=DIST_BETWEEN_PIPES, 159 | obs_this_pipe=OBS_THIS_PIPE_PLAY) 160 | 161 | observation_space = env.get_observation_size_buffer() 162 | action_space = env.get_action_size() 163 | 164 | model = keras.models.load_model('models/dqn/{}.h5'.format(LOAD_NAME)) 165 | dqn_solver = DQNSolver(observation_space, action_space, model) 166 | 167 | for i in range(20): 168 | state = env.reset() 169 | state 
= np.reshape(state, [1, observation_space]) 170 | is_done = False 171 | while not is_done: 172 | action = dqn_solver.act_free(state) 173 | # action = env.get_action_random() 174 | state_next, reward, terminal, info = env.step_buffer(action) 175 | is_done = terminal 176 | state = np.reshape(state_next, [1, observation_space]) 177 | 178 | 179 | if __name__ == "__main__": 180 | if LEARN: 181 | learn_flappyb() 182 | else: 183 | play_flappyb() 184 | 185 | print('Jobe Done!') 186 | -------------------------------------------------------------------------------- /flappyb/dqn_v3.py: -------------------------------------------------------------------------------- 1 | from environment.environment import Environment 2 | import time 3 | import numpy as np 4 | import collections 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.optim as optim 9 | from tensorboardX import SummaryWriter 10 | 11 | NAME = 'flappyb-newdqn' 12 | GPU = False 13 | 14 | MEAN_REWARD_BOUND = 500000 15 | 16 | GAMMA = 0.99 17 | BATCH_SIZE = 32 18 | REPLAY_SIZE = 10000 19 | LEARNING_RATE = 1e-4 20 | SYNC_TARGET_FRAMES = 1000 21 | REPLAY_START_SIZE = 10000 22 | 23 | EPSILON_DECAY_LAST_FRAME = 10**5 24 | EPSILON_START = 1.0 25 | EPSILON_FINAL = 0.02 26 | 27 | 28 | class DQN(nn.Module): 29 | def __init__(self, input_shape, n_actions): 30 | super(DQN, self).__init__() 31 | 32 | self.fc = nn.Sequential( 33 | nn.Linear(input_shape, 512), 34 | nn.ReLU(), 35 | nn.Linear(512, n_actions) 36 | ) 37 | 38 | def forward(self, x): 39 | return self.fc(x) 40 | 41 | 42 | Experience = collections.namedtuple('Experience', field_names=[ 43 | 'state', 'action', 'reward', 'done', 'new_state']) 44 | 45 | 46 | class ExperienceBuffer: 47 | def __init__(self, capacity): 48 | self.buffer = collections.deque(maxlen=capacity) 49 | 50 | def __len__(self): 51 | return len(self.buffer) 52 | 53 | def append(self, experience): 54 | self.buffer.append(experience) 55 | 56 | def sample(self, batch_size): 57 | indices = np.random.choice(len(self.buffer), batch_size, replace=True) 58 | states, actions, rewards, dones, next_states = \ 59 | zip(*[self.buffer[idx] for idx in indices]) 60 | return np.array(states), np.array(actions), np.array( 61 | rewards, dtype=np.float32), np.array(dones, dtype=np.uint8), \ 62 | np.array(next_states) 63 | 64 | 65 | class Agent: 66 | def __init__(self, env, exp_buffer): 67 | self.env = env 68 | self.exp_buffer = exp_buffer 69 | self._reset() 70 | 71 | def _reset(self): 72 | self.state = env.reset() 73 | self.total_reward = 0.0 74 | 75 | def play_step(self, net, epsilon=0.0, device="cpu"): 76 | done_reward = None 77 | 78 | if np.random.random() < epsilon: 79 | action = env.get_action_random() 80 | else: 81 | state_a = np.array([self.state], copy=False) 82 | state_v = torch.tensor(state_a[0]).to(device) 83 | q_vals_v = net(state_v.float()) 84 | _, act_v = torch.max(q_vals_v, dim=1) 85 | action = int(act_v.item()) 86 | 87 | # do step in the environment 88 | new_state, reward, is_done, _ = self.env.step_buffer(action) 89 | self.total_reward += reward 90 | 91 | exp = Experience(self.state, action, reward, is_done, new_state) 92 | self.exp_buffer.append(exp) 93 | self.state = new_state 94 | if is_done: 95 | done_reward = self.total_reward 96 | self._reset() 97 | return done_reward 98 | 99 | 100 | def calc_loss(batch, net, tgt_net, device="cpu"): 101 | states, actions, rewards, dones, next_states = batch 102 | 103 | states_v = torch.tensor(states).to(device) 104 | next_states_v = torch.tensor(next_states).to(device) 105 | actions_v = 
torch.tensor(actions).to(device) 106 | rewards_v = torch.tensor(rewards).to(device) 107 | done_mask = torch.ByteTensor(dones).to(device) 108 | 109 | state_action_values = net(states_v.float()).gather( 110 | 1, actions_v.unsqueeze(-1)).squeeze(-1) 111 | next_state_values = tgt_net(next_states_v.float()).max(1)[0] 112 | next_state_values[done_mask] = 0.0 113 | next_state_values = next_state_values.detach() 114 | 115 | expected_state_action_values = next_state_values * GAMMA + rewards_v 116 | 117 | return nn.MSELoss()(state_action_values, expected_state_action_values) 118 | 119 | 120 | if __name__ == "__main__": 121 | device = torch.device("cuda" if GPU else "cpu") 122 | 123 | env = Environment(draw=False, fps=1, debug=False, 124 | dist_to_pipe=50, dist_between_pipes=180, 125 | obs_this_pipe=False) 126 | 127 | net = DQN(env.get_observation_size(), env.get_action_size()).to(device) 128 | tgt_net = DQN(env.get_observation_size(), env.get_action_size()).to(device) 129 | writer = SummaryWriter(comment="-" + NAME) 130 | print(net) 131 | 132 | buffer = ExperienceBuffer(REPLAY_SIZE) 133 | agent = Agent(env, buffer) 134 | epsilon = EPSILON_START 135 | 136 | optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE) 137 | total_rewards = [] 138 | frame_idx = 0 139 | ts_frame = 0 140 | ts = time.time() 141 | best_mean_reward = None 142 | 143 | while True: 144 | frame_idx += 1 145 | epsilon = max(EPSILON_FINAL, 146 | EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME) 147 | 148 | reward = agent.play_step(net, epsilon, device=device) 149 | if reward is not None: 150 | total_rewards.append(reward) 151 | speed = (frame_idx - ts_frame) / (time.time() - ts) 152 | ts_frame = frame_idx 153 | ts = time.time() 154 | mean_reward = np.mean(total_rewards[-100:]) 155 | print("%d: done %d games, mean reward %.3f, eps %.2f, speed %.2f f/s" % ( 156 | frame_idx, len(total_rewards), mean_reward, epsilon, 157 | speed 158 | )) 159 | writer.add_scalar("epsilon", epsilon, frame_idx) 160 | writer.add_scalar("speed", speed, frame_idx) 161 | writer.add_scalar("reward_100", mean_reward, frame_idx) 162 | writer.add_scalar("reward", reward, frame_idx) 163 | 164 | if best_mean_reward is None or best_mean_reward < mean_reward: 165 | torch.save(net.state_dict(), NAME + "-best.dat") 166 | if best_mean_reward is not None: 167 | print("Best mean reward updated %.3f -> %.3f, model saved" % (best_mean_reward, mean_reward)) 168 | best_mean_reward = mean_reward 169 | if mean_reward > MEAN_REWARD_BOUND: 170 | print("Solved in %d frames!" 
% frame_idx) 171 | break 172 | 173 | if len(buffer) < REPLAY_START_SIZE: 174 | continue 175 | 176 | if frame_idx % SYNC_TARGET_FRAMES == 0: 177 | tgt_net.load_state_dict(net.state_dict()) 178 | 179 | optimizer.zero_grad() 180 | batch = buffer.sample(BATCH_SIZE) 181 | loss_t = calc_loss(batch, net, tgt_net, device=device) 182 | loss_t.backward() 183 | optimizer.step() 184 | writer.close() 185 | -------------------------------------------------------------------------------- /flappyb/environment/assets/Pong-653x400.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/environment/assets/Pong-653x400.png -------------------------------------------------------------------------------- /flappyb/environment/assets/all_fonts_script.py: -------------------------------------------------------------------------------- 1 | import pygame 2 | import time 3 | 4 | pygame.init() 5 | pygame.display.set_caption('display all fonts') 6 | screen = pygame.display.set_mode((1800, 1200)) 7 | 8 | y = 150 9 | x = 0 10 | mod = 0 11 | 12 | for font in pygame.font.get_fonts(): 13 | if font == 'notocoloremoji': 14 | continue 15 | if font == 'kacstoffice': 16 | break 17 | 18 | if mod == 0: 19 | x = 200 20 | if mod == 1: 21 | x = 600 22 | if mod == 2: 23 | x = 1100 24 | if mod == 3: 25 | x = 1600 26 | y+=25 27 | mod = -1 28 | mod += 1 29 | 30 | text = pygame.font.SysFont(font, 17).render("{}: GAME 123 !".format(font), True, (255,255,255)) 31 | screen.blit(text, (x - text.get_width() // 2, y - text.get_height() // 2)) 32 | 33 | 34 | 35 | 36 | pygame.display.flip() 37 | time.sleep(200) 38 | -------------------------------------------------------------------------------- /flappyb/environment/assets/bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/environment/assets/bg.png -------------------------------------------------------------------------------- /flappyb/environment/assets/bird.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/environment/assets/bird.png -------------------------------------------------------------------------------- /flappyb/environment/assets/pipe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/environment/assets/pipe.png -------------------------------------------------------------------------------- /flappyb/environment/assets/pipe_long.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/environment/assets/pipe_long.png -------------------------------------------------------------------------------- /flappyb/environment/assets/sapcraft.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/environment/assets/sapcraft.jpg -------------------------------------------------------------------------------- 
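
The dqn_v3.py script above only trains and saves its best weights to NAME + "-best.dat". A hypothetical playback loop for those weights, reusing the names defined in dqn_v3.py and assuming that training script runs unchanged, could look like the following (nothing below is part of the repository):

# Hypothetical playback sketch for the dqn_v3.py weights -- not part of the repo.
import numpy as np
import torch

from dqn_v3 import DQN, NAME
from environment.environment import Environment

# Same environment settings as during training, so observation sizes match.
env = Environment(draw=True, fps=10, debug=False,
                  dist_to_pipe=50, dist_between_pipes=180, obs_this_pipe=False)
net = DQN(env.get_observation_size(), env.get_action_size())
net.load_state_dict(torch.load(NAME + "-best.dat", map_location="cpu"))
net.eval()

state = env.reset()
is_done = False
while not is_done:
    with torch.no_grad():
        state_v = torch.tensor(np.array(state, dtype=np.float32)).unsqueeze(0)  # batch of one
        q_vals = net(state_v)                              # shape (1, n_actions)
    action = int(torch.argmax(q_vals, dim=1).item())
    state, reward, is_done, _ = env.step_buffer(action)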
/flappyb/environment/bird.py: -------------------------------------------------------------------------------- 1 | import pygame 2 | 3 | 4 | class Bird(): 5 | 6 | def __init__(self, screen, s_width, s_height, color): 7 | 8 | self.bird_image = pygame.image.load("environment/assets/bird.png") 9 | self.rotate = 1 10 | 11 | self.screen = screen 12 | self.s_width = s_width 13 | self.s_height = s_height 14 | self.color = color 15 | 16 | self.radius = 20 17 | # self.radius = 10 18 | 19 | self.x = 50 20 | self.y = int(s_height / 2) 21 | 22 | self.vel = 0 23 | self.gravity = 2 # default is 1 24 | 25 | self.bottom = s_height - 20 26 | self.vel_cap = 20 27 | 28 | self.salto = False 29 | self.rotation = 0 30 | self.last_rotation = 0 31 | self.last_reward = 0 32 | 33 | def handle_events_human(self): 34 | keys = pygame.key.get_pressed() 35 | if keys[pygame.K_SPACE]: 36 | self._fly() 37 | 38 | def handle_events_ai(self, action): 39 | if action == 1: 40 | self._fly() 41 | else: 42 | pass 43 | 44 | def draw(self, reward): 45 | 46 | surf = None 47 | 48 | if reward % 10 == 0 and reward is not self.last_reward: 49 | self.last_reward = reward 50 | self.salto = True 51 | 52 | if self.salto: 53 | if self.last_rotation >= 0: 54 | self.rotation += 15 55 | surf = pygame.transform.rotate(self.bird_image, self.rotation) 56 | if self.rotation == 400: 57 | self.salto = False 58 | else: 59 | self.rotation -= 15 60 | surf = pygame.transform.rotate(self.bird_image, self.rotation) 61 | if self.rotation == -400: 62 | self.salto = False 63 | 64 | elif self.vel > 0: 65 | self.rotation = -40 66 | self.last_rotation = self.rotation 67 | surf = pygame.transform.rotate(self.bird_image, self.rotation) 68 | 69 | else: 70 | self.rotation = 40 71 | self.last_rotation = self.rotation 72 | surf = pygame.transform.rotate(self.bird_image, self.rotation) 73 | 74 | self.screen.blit(surf, (self.x - 25, self.y - 20)) 75 | 76 | def update(self): 77 | self.vel += self.gravity 78 | self.y += self.vel 79 | 80 | if self.y > self.s_height: 81 | return True 82 | 83 | if self.y < 0: 84 | self.y = 0 85 | 86 | if self.vel > 20: 87 | self.vel = 20 88 | 89 | return False 90 | 91 | def _fly(self): 92 | self.vel += -self.gravity * 2 93 | 94 | if self.vel > 20: 95 | self.vel = 20 96 | -------------------------------------------------------------------------------- /flappyb/environment/environment.py: -------------------------------------------------------------------------------- 1 | # Game was made with the help of https://www.youtube.com/watch?v=cXgA1d_E-jY 2 | import time 3 | import pygame 4 | import gym 5 | import gym.spaces 6 | import enum 7 | 8 | import numpy as np 9 | 10 | from environment.bird import Bird 11 | from environment.pipe import Pipe 12 | 13 | # AI PARAMETERS ########################################################## 14 | BUFFER_SIZE = 4 15 | OBSERVATION_SIZE = 5 16 | ACTIONS = [0, 1] 17 | ACTION_SIZE = 2 18 | ROUND_TO_DECIMALS = 2 19 | 20 | # GAME PARAMETERS ######################################################## 21 | SCREEN_SIZE = WIDTH, HEIGHT = (640, 880) 22 | BACKGROUND = (146, 183, 254) 23 | BIRD_COLOR = (241, 213, 19) 24 | PIPE_COLOR = (44, 176, 26) 25 | FONT = 'dyuthi' 26 | 27 | 28 | """ 29 | Interace: 30 | reset(): resets the whole environment 31 | step(action): performs one action onto the environment 32 | step_buffer(action): performs one action on the environment, 33 | returns 4 states 34 | get_action_random(): obtain an imporoved random action 35 | get_observation_size(): obtain size of observation 36 | get_action_size(): 
obtain size of action 37 | """ 38 | 39 | 40 | class Actions(enum.Enum): 41 | Skip = 0 42 | Fly = 1 43 | 44 | 45 | class Environment(gym.Env): 46 | 47 | def __init__(self, draw=True, fps=10, debug=False, 48 | dist_to_pipe=150, dist_between_pipes=220, obs_this_pipe=True): 49 | 50 | super(Environment, self).__init__() 51 | self.observation_space = gym.spaces.Discrete(n=OBSERVATION_SIZE * BUFFER_SIZE) 52 | self.action_space = gym.spaces.Discrete(n=len(Actions)) 53 | 54 | self.pipe_image_up = None 55 | self.pipe_image_down = None 56 | 57 | if draw: 58 | pygame.init() 59 | pygame.display.set_caption('NN FlappyB') 60 | 61 | self.font_game_over = pygame.font.SysFont("ani", 72) 62 | self.bg = pygame.image.load("environment/assets/bg.png") 63 | 64 | self.pipe_image_up = pygame.image.load( 65 | "environment/assets/pipe.png") # 52x808 66 | self.pipe_image_down = pygame.image.load( 67 | "environment/assets/pipe_long.png") # 52x808< 68 | 69 | self.dist_between_pipes = dist_between_pipes 70 | 71 | self.fps = fps 72 | self.debug = debug 73 | self.draw = draw 74 | self.dist_to_pipe = dist_to_pipe 75 | self.obs_this_pipe = obs_this_pipe 76 | 77 | self.clock = pygame.time.Clock() 78 | self.time_elapsed_since_last_action = 0 79 | self.global_time = 0 80 | 81 | self.screen = pygame.display.set_mode(SCREEN_SIZE) 82 | 83 | self.bird = Bird(self.screen, WIDTH, HEIGHT, BIRD_COLOR) 84 | self.pipes = [Pipe(self.screen, WIDTH, HEIGHT, 85 | PIPE_COLOR, self.dist_between_pipes, 86 | self.pipe_image_up, self.pipe_image_down)] 87 | 88 | self.reward = 0 89 | self.is_done = False 90 | self.printed_score = False 91 | 92 | # ML INTERFACE ########################################################### 93 | def reset(self): 94 | 95 | self.clock = pygame.time.Clock() 96 | self.time_elapsed_since_last_action = 0 97 | self.global_time = 0 98 | 99 | self.bird = Bird(self.screen, WIDTH, HEIGHT, BIRD_COLOR) 100 | self.pipes = [Pipe(self.screen, WIDTH, HEIGHT, 101 | PIPE_COLOR, self.dist_between_pipes, 102 | self.pipe_image_up, self.pipe_image_down)] 103 | 104 | self.reward = 0 105 | self.is_done = False 106 | self.printed_score = False 107 | 108 | obs, reward, is_done, _ = self.step(0) 109 | 110 | return obs 111 | 112 | # def step(self, action): 113 | 114 | # while not self.time_elapsed_since_last_action > self.fps: 115 | # dt = self.clock.tick() 116 | # self.time_elapsed_since_last_action += dt 117 | 118 | # self.global_time += 1 119 | 120 | # obs, rew, d, _ = self.run_ai_game_step(action) 121 | 122 | # if rew >= 1: 123 | # rew = 0.25 124 | # elif rew <= -1: 125 | # rew = -0.25 126 | # else: 127 | # rew = 0.025 128 | 129 | # return obs, rew, d, _ 130 | 131 | def step(self, action): 132 | 133 | if isinstance(action, np.ndarray): 134 | if action[0] > action[1]: 135 | action = 0 136 | else: 137 | action = 1 138 | 139 | obs = [] 140 | rew = 0 141 | 142 | for i in range(BUFFER_SIZE): 143 | while not self.time_elapsed_since_last_action > self.fps: 144 | dt = self.clock.tick() 145 | self.time_elapsed_since_last_action += dt 146 | 147 | self.global_time += 1 148 | o, r, d, _ = self.run_ai_game_step(action) 149 | rew += r 150 | 151 | for j in range(len(o)): 152 | obs.append(o[j]) 153 | 154 | if rew > 1: 155 | rew = 1 156 | elif rew < -1: 157 | rew = -1 158 | else: 159 | rew = 0.1 160 | 161 | obs = np.array(obs) 162 | 163 | return obs, rew, d, _ 164 | 165 | # The actual game step ################################################### 166 | def run_ai_game_step(self, action): 167 | 168 | current_reward = 0.1 169 | 170 | if self.global_time % 
self.dist_to_pipe == 0: 171 | self.pipes.append(Pipe(self.screen, WIDTH, HEIGHT, 172 | PIPE_COLOR, self.dist_between_pipes, 173 | self.pipe_image_up, self.pipe_image_down)) 174 | 175 | for pipe in self.pipes: 176 | pipe.update() 177 | 178 | if pipe.off_screen(): 179 | self.pipes.remove(pipe) 180 | 181 | if pipe.hits(self.bird): 182 | self.game_over() 183 | current_reward = -1 184 | # hit_pipe = True 185 | 186 | if pipe.behind(self.bird): 187 | self.reward += 1 188 | current_reward = 1 189 | 190 | self.bird.handle_events_ai(action) 191 | if self.bird.update(): 192 | self.game_over() 193 | current_reward = -1 194 | 195 | if self.draw: 196 | # self.screen.fill(BACKGROUND) 197 | self.screen.blit(self.bg, (0, 0)) 198 | for pipe in self.pipes: 199 | pipe.draw() 200 | self.bird.draw(self.reward) 201 | text = pygame.font.SysFont(FONT, 28).render( 202 | "SCORE {}".format(self.reward), True, (0, 0, 0)) 203 | self.screen.blit(text, (565 - text.get_width() // 204 | 2, 30 - text.get_height() // 2)) 205 | pygame.display.flip() 206 | 207 | obs = self.get_observation_space() 208 | 209 | if self.draw: 210 | pygame.display.update() 211 | 212 | self.time_elapsed_since_last_action = 0 213 | 214 | return obs, current_reward, self.is_done, None 215 | ########################################################################## 216 | 217 | def get_observation_space(self): 218 | 219 | my_pipe = Pipe(self.screen, WIDTH, HEIGHT, PIPE_COLOR, 220, None, None) 220 | my_pipe.x = 9999 221 | 222 | if self.obs_this_pipe: 223 | for pipe in self.pipes: 224 | if (pipe.x < my_pipe.x) and pipe.x >= (self.bird.x - pipe.width): 225 | my_pipe = pipe 226 | else: 227 | for pipe in self.pipes: 228 | if (pipe.x < my_pipe.x) and pipe.x >= (self.bird.x): # target next pipe immediately 229 | my_pipe = pipe 230 | 231 | e1 = self.bird.y # bird pos 232 | e2 = self.bird.vel # bird vel 233 | e3 = my_pipe.x - self.bird.x # dist to Pipe 234 | e4 = my_pipe.top # pipe top 235 | e5 = my_pipe.bot # pipe bot 236 | 237 | if self.draw and self.debug: 238 | e_d1 = pygame.rect.Rect(self.bird.x, e1, 2, HEIGHT - e1) 239 | pygame.draw.rect(self.screen, (255, 0, 0), e_d1) 240 | 241 | e_d2 = pygame.rect.Rect(self.bird.x - self.bird.radius, 242 | e2 * 2 + HEIGHT / 2, self.bird.x + self.bird.radius, 5) 243 | pygame.draw.rect(self.screen, (255, 0, 0), e_d2) 244 | 245 | e_d3 = pygame.rect.Rect(self.bird.x, self.bird.y, e3, 2) 246 | pygame.draw.rect(self.screen, (255, 0, 0), e_d3) 247 | 248 | e_d4 = pygame.rect.Rect(my_pipe.x - 5, e4, my_pipe.width + 10, 5) 249 | pygame.draw.rect(self.screen, (255, 0, 0), e_d4) 250 | 251 | e_d5 = pygame.rect.Rect(my_pipe.x - 5, e5, my_pipe.width + 10, 5) 252 | pygame.draw.rect(self.screen, (255, 0, 0), e_d5) 253 | 254 | # Normalization ### 255 | e1 = e1 / HEIGHT 256 | e2 = e2 / self.bird.vel_cap 257 | e3 = e3 / (WIDTH - 50) 258 | e4 = e4 / HEIGHT 259 | e5 = e5 / HEIGHT 260 | 261 | # Nomralizatoin with rounding 262 | # e1 = round(e1, ROUND_TO_DECIMALS) 263 | # e2 = round(e2, ROUND_TO_DECIMALS) 264 | # e3 = round(e3, ROUND_TO_DECIMALS) 265 | # e4 = round(e4, ROUND_TO_DECIMALS) 266 | # e5 = round(e5, ROUND_TO_DECIMALS) 267 | 268 | obs = (e1, e2, e3, e4, e5) 269 | # print(obs) 270 | 271 | return obs 272 | 273 | def get_action_random(self): 274 | action = np.random.choice((0, 1), 1, p=(0.45, 0.55)) 275 | return action.item(0) 276 | 277 | def get_observation_size(self): 278 | return OBSERVATION_SIZE 279 | 280 | def get_observation_size_buffer(self): 281 | return OBSERVATION_SIZE * BUFFER_SIZE 282 | 283 | def get_actions(self): 284 | 
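# Raw action encoding shared by all agents: 0 = skip (do nothing), 1 = fly,
# matching the Actions enum defined at the top of this file.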
return ACTIONS 285 | 286 | def get_action_size(self): 287 | return ACTION_SIZE 288 | 289 | def game_over(self): 290 | if not self.printed_score: 291 | # print('Score: {}'.format(self.reward)) 292 | self.printed_score = True 293 | 294 | if self.draw: 295 | text = pygame.font.SysFont(FONT, 28).render( 296 | "Game Over!".format(self.reward), True, (0, 0, 0)) 297 | self.screen.blit(text, (320 - text.get_width() // 298 | 2, 240 - text.get_height() // 2)) 299 | pygame.display.flip() 300 | time.sleep(0.4) 301 | self.is_done = True 302 | 303 | # HUMAN STUFF ################################################ 304 | 305 | def run_human_game(self): 306 | 307 | if self.draw: 308 | for _ in range(3,0,-1): 309 | self.screen.blit(self.bg, (0, 0)) 310 | self.bird.draw(self.reward) 311 | text_start = pygame.font.SysFont(FONT, 80).render( 312 | "Start in {}".format(_), True, (0, 0, 0)) 313 | self.screen.blit(text_start, (text_start.get_width() // 314 | 2, text_start.get_height() // 2)) 315 | pygame.display.flip() 316 | time.sleep(0.3) 317 | 318 | while not self.is_done: 319 | 320 | while not self.time_elapsed_since_last_action > self.fps: 321 | dt = self.clock.tick() 322 | self.time_elapsed_since_last_action += dt 323 | 324 | self.global_time += 1 325 | 326 | self.screen.fill(BACKGROUND) 327 | self.handle_events_human() 328 | 329 | current_reward = 0.1 330 | 331 | if self.global_time % self.dist_to_pipe == 0: 332 | self.pipes.append(Pipe( 333 | self.screen, WIDTH, HEIGHT, PIPE_COLOR, self.dist_between_pipes, self.pipe_image_up, self.pipe_image_down)) 334 | 335 | for pipe in self.pipes: 336 | pipe.update() 337 | 338 | if pipe.off_screen(): 339 | self.pipes.remove(pipe) 340 | 341 | if pipe.hits(self.bird): 342 | self.game_over() 343 | current_reward = -1 344 | 345 | if pipe.behind(self.bird): 346 | self.reward += 1 347 | current_reward = 1 348 | 349 | self.bird.handle_events_human() 350 | if self.bird.update(): 351 | self.game_over() 352 | current_reward = -1 353 | 354 | if self.draw: 355 | 356 | self.screen.blit(self.bg, (0, 0)) 357 | for pipe in self.pipes: 358 | pipe.draw() 359 | self.bird.draw(self.reward) 360 | text = pygame.font.SysFont(FONT, 28).render( 361 | "SCORE {}".format(self.reward), True, (0, 0, 0)) 362 | self.screen.blit(text, (565 - text.get_width() // 363 | 2, 30 - text.get_height() // 2)) 364 | pygame.display.flip() 365 | 366 | # if self.draw: 367 | # self.screen.fill(BACKGROUND) 368 | # for pipe in self.pipes: 369 | # pipe.draw() 370 | # self.bird.draw(self.reward) 371 | 372 | obs = self.get_observation_space() 373 | 374 | # if self.draw: 375 | # pygame.display.update() 376 | 377 | self.time_elapsed_since_last_action = 0 378 | # print(current_reward) 379 | 380 | def handle_events_human(self): 381 | for event in pygame.event.get(): 382 | if event.type == pygame.QUIT: 383 | self.is_done = False 384 | pygame.quit() 385 | -------------------------------------------------------------------------------- /flappyb/environment/pipe.py: -------------------------------------------------------------------------------- 1 | import pygame 2 | import numpy as np 3 | 4 | RANDOM_PIPES = [150, 250, 350, 450, 550] 5 | 6 | 7 | class Pipe: 8 | 9 | def __init__(self, screen, s_width, s_height, color, dist_between_pipes=220,pipe_image_up=None, pipe_image_down=None): 10 | 11 | self.pipe_image_up = pipe_image_up # 52x808 12 | self.pipe_image_down = pipe_image_down # 52x808 13 | 14 | self.screen = screen 15 | self.s_width = s_width 16 | self.s_height = s_height 17 | self.color = color 18 | 19 | self.top = 
np.random.choice(RANDOM_PIPES) 20 | self.bot = self.top + dist_between_pipes 21 | 22 | # self.top = random.randrange(120, s_height-370) 23 | # self.bot = self.top + 350 24 | 25 | self.width = 52 26 | self.speed = 3 27 | self.x = s_width 28 | self.within_pipe = False 29 | 30 | def draw(self): 31 | # rect_top = pygame.rect.Rect(self.x, 0, self.width, self.top) 32 | # rect_bot = pygame.rect.Rect(self.x, self.bot, self.width, self.s_height) 33 | # pygame.draw.rect(self.screen, self.color, rect_top) 34 | # pygame.draw.rect(self.screen, self.color, rect_bot) 35 | 36 | if self.top > 320: 37 | pipe_rotated = pygame.transform.rotate(self.pipe_image_up, 180) 38 | self.screen.blit(pipe_rotated, (self.x, self.top - 320)) 39 | pipe_rotated_long = pygame.transform.rotate( 40 | self.pipe_image_down, 180) 41 | self.screen.blit(pipe_rotated_long, (self.x, self.top - 320 - 280)) 42 | else: 43 | pipe_rotated = pygame.transform.rotate(self.pipe_image_up, 180) 44 | self.screen.blit(pipe_rotated, (self.x, self.top - 320)) 45 | 46 | if self.s_height - self.bot > 320: 47 | self.screen.blit(self.pipe_image_up, (self.x, self.bot)) 48 | self.screen.blit(self.pipe_image_down, (self.x, self.bot + 280)) 49 | else: 50 | self.screen.blit(self.pipe_image_up, (self.x, self.bot)) 51 | 52 | def update(self): 53 | self.x -= self.speed 54 | 55 | def hits(self, bird): 56 | if bird.y < self.top or bird.y > self.bot: 57 | if self.x < bird.x < self.x + self.width: 58 | return True 59 | 60 | def behind(self, bird): 61 | if bird.x > self.x + self.width and not self.within_pipe: 62 | self.within_pipe = True 63 | return True 64 | if bird.x < self.x + self.width: 65 | self.within_pipe = False 66 | 67 | def off_screen(self): 68 | return self.x + self.width + 5 < 0 69 | -------------------------------------------------------------------------------- /flappyb/lib/common.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | 7 | 8 | HYPERPARAMS = { 9 | 'flappyb': { 10 | 'stop_reward': 100.0, 11 | 'run_name': 'flappyb', 12 | 'replay_size': 100000, 13 | 'replay_initial': 10000, 14 | 'target_net_sync': 1000, 15 | 'epsilon_frames': 10**5, 16 | 'epsilon_start': 1.0, 17 | 'epsilon_final': 0.02, 18 | 'learning_rate': 0.001, 19 | 'gamma': 0.9, 20 | 'batch_size': 32 21 | } 22 | } 23 | 24 | 25 | def unpack_batch(batch): 26 | states, actions, rewards, dones, last_states = [], [], [], [], [] 27 | for exp in batch: 28 | state = np.array(exp.state, copy=False) 29 | states.append(state) 30 | actions.append(exp.action) 31 | rewards.append(exp.reward) 32 | dones.append(exp.last_state is None) 33 | if exp.last_state is None: 34 | last_states.append(state) # the result will be masked anyway 35 | else: 36 | last_states.append(np.array(exp.last_state, copy=False)) 37 | return np.array(states, copy=False), np.array(actions), np.array(rewards, dtype=np.float32), \ 38 | np.array(dones, dtype=np.uint8), np.array(last_states, copy=False) 39 | 40 | 41 | def calc_loss_dqn(batch, net, tgt_net, gamma, device="cpu"): 42 | states, actions, rewards, dones, next_states = unpack_batch(batch) 43 | 44 | states_v = torch.tensor(states).to(device) 45 | next_states_v = torch.tensor(next_states).to(device) 46 | actions_v = torch.tensor(actions).to(device) 47 | rewards_v = torch.tensor(rewards).to(device) 48 | done_mask = torch.ByteTensor(dones).to(device) 49 | 50 | state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1) 51 | 
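# The gather() call above picks Q(s, a) for the action actually taken in each
# sampled transition; the target network below supplies max_a' Q(s', a') for the
# one-step TD target, terminal next-states are masked to zero, and detach()
# keeps gradients from flowing through the target values.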
next_state_values = tgt_net(next_states_v).max(1)[0] 52 | next_state_values[done_mask] = 0.0 53 | 54 | expected_state_action_values = next_state_values.detach() * gamma + rewards_v 55 | return nn.MSELoss()(state_action_values, expected_state_action_values) 56 | 57 | 58 | class RewardTracker: 59 | def __init__(self, name, net, writer, stop_reward): 60 | self.writer = writer 61 | self.stop_reward = stop_reward 62 | self.net = net 63 | self.name = name 64 | self.best_reward = -1 65 | 66 | def __enter__(self): 67 | self.ts = time.time() 68 | self.ts_frame = 0 69 | self.total_rewards = [] 70 | return self 71 | 72 | def __exit__(self, *args): 73 | if self.writer != None: 74 | self.writer.close() 75 | 76 | def reward(self, reward, frame, epsilon=None): 77 | self.total_rewards.append(reward) 78 | speed = (frame - self.ts_frame) / (time.time() - self.ts) 79 | self.ts_frame = frame 80 | self.ts = time.time() 81 | mean_reward = np.mean(self.total_rewards[-100:]) 82 | epsilon_str = "" if epsilon is None else ", eps %.2f" % epsilon 83 | print("%d: done %d games, mean reward %.3f, speed %.2f f/s%s" % ( 84 | frame, len(self.total_rewards), mean_reward, speed, epsilon_str 85 | )) 86 | sys.stdout.flush() 87 | if self.writer != None: 88 | if epsilon is not None: 89 | self.writer.add_scalar("epsilon", epsilon, frame) 90 | self.writer.add_scalar("speed", speed, frame) 91 | # self.writer.add_scalar("reward_100", mean_reward, frame) 92 | self.writer.add_scalar("reward", reward, frame) 93 | if reward > self.best_reward: 94 | self.best_reward = reward 95 | torch.save(self.net.state_dict(), 'models/' + self.name + str(reward)) 96 | print("\tNew best reward = ", str(reward)) 97 | if mean_reward > self.stop_reward: 98 | print("Solved in %d frames!" % frame) 99 | return True 100 | return False 101 | 102 | 103 | class EpsilonTracker: 104 | def __init__(self, epsilon_greedy_selector, params): 105 | self.epsilon_greedy_selector = epsilon_greedy_selector 106 | self.epsilon_start = params['epsilon_start'] 107 | self.epsilon_final = params['epsilon_final'] 108 | self.epsilon_frames = params['epsilon_frames'] 109 | self.frame(0) 110 | 111 | def frame(self, frame): 112 | self.epsilon_greedy_selector.epsilon = \ 113 | max(self.epsilon_final, self.epsilon_start - frame / self.epsilon_frames) 114 | 115 | 116 | def distr_projection(next_distr, rewards, dones, Vmin, Vmax, n_atoms, gamma): 117 | """ 118 | Perform distribution projection aka Catergorical Algorithm from the 119 | "A Distributional Perspective on RL" paper 120 | """ 121 | batch_size = len(rewards) 122 | proj_distr = np.zeros((batch_size, n_atoms), dtype=np.float32) 123 | delta_z = (Vmax - Vmin) / (n_atoms - 1) 124 | for atom in range(n_atoms): 125 | tz_j = np.minimum(Vmax, np.maximum(Vmin, rewards + (Vmin + atom * delta_z) * gamma)) 126 | b_j = (tz_j - Vmin) / delta_z 127 | l = np.floor(b_j).astype(np.int64) 128 | u = np.ceil(b_j).astype(np.int64) 129 | eq_mask = u == l 130 | proj_distr[eq_mask, l[eq_mask]] += next_distr[eq_mask, atom] 131 | ne_mask = u != l 132 | proj_distr[ne_mask, l[ne_mask]] += next_distr[ne_mask, atom] * (u - b_j)[ne_mask] 133 | proj_distr[ne_mask, u[ne_mask]] += next_distr[ne_mask, atom] * (b_j - l)[ne_mask] 134 | if dones.any(): 135 | proj_distr[dones] = 0.0 136 | tz_j = np.minimum(Vmax, np.maximum(Vmin, rewards[dones])) 137 | b_j = (tz_j - Vmin) / delta_z 138 | l = np.floor(b_j).astype(np.int64) 139 | u = np.ceil(b_j).astype(np.int64) 140 | eq_mask = u == l 141 | eq_dones = dones.copy() 142 | eq_dones[dones] = eq_mask 143 | if eq_dones.any(): 
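# Terminal transitions: the projected target distribution collapses onto the
# atom(s) nearest the clipped reward: all mass goes to a single atom when b_j
# lands exactly on it (eq_mask), otherwise it is split between the lower and
# upper neighbouring atoms in the ne_mask branch below.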
144 | proj_distr[eq_dones, l[eq_mask]] = 1.0 145 | ne_mask = u != l 146 | ne_dones = dones.copy() 147 | ne_dones[dones] = ne_mask 148 | if ne_dones.any(): 149 | proj_distr[ne_dones, l[ne_mask]] = (u - b_j)[ne_mask] 150 | proj_distr[ne_dones, u[ne_mask]] = (b_j - l)[ne_mask] 151 | return proj_distr -------------------------------------------------------------------------------- /flappyb/lib/dqn_model.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | import numpy as np 7 | 8 | 9 | class NoisyLinear(nn.Linear): 10 | def __init__(self, in_features, out_features, sigma_init=0.017, bias=True): 11 | super(NoisyLinear, self).__init__(in_features, out_features, bias=bias) 12 | self.sigma_weight = nn.Parameter(torch.full((out_features, in_features), sigma_init)) 13 | self.register_buffer("epsilon_weight", torch.zeros(out_features, in_features)) 14 | if bias: 15 | self.sigma_bias = nn.Parameter(torch.full((out_features,), sigma_init)) 16 | self.register_buffer("epsilon_bias", torch.zeros(out_features)) 17 | self.reset_parameters() 18 | 19 | def reset_parameters(self): 20 | std = math.sqrt(3 / self.in_features) 21 | self.weight.data.uniform_(-std, std) 22 | self.bias.data.uniform_(-std, std) 23 | 24 | def forward(self, input): 25 | self.epsilon_weight.normal_() 26 | bias = self.bias 27 | if bias is not None: 28 | self.epsilon_bias.normal_() 29 | bias = bias + self.sigma_bias * self.epsilon_bias.data 30 | return F.linear(input, self.weight + self.sigma_weight * self.epsilon_weight.data, bias) 31 | 32 | 33 | class NoisyFactorizedLinear(nn.Linear): 34 | """ 35 | NoisyNet layer with factorized gaussian noise 36 | 37 | N.B. nn.Linear already initializes weight and bias to 38 | """ 39 | def __init__(self, in_features, out_features, sigma_zero=0.4, bias=True): 40 | super(NoisyFactorizedLinear, self).__init__(in_features, out_features, bias=bias) 41 | sigma_init = sigma_zero / math.sqrt(in_features) 42 | self.sigma_weight = nn.Parameter(torch.full((out_features, in_features), sigma_init)) 43 | self.register_buffer("epsilon_input", torch.zeros(1, in_features)) 44 | self.register_buffer("epsilon_output", torch.zeros(out_features, 1)) 45 | if bias: 46 | self.sigma_bias = nn.Parameter(torch.full((out_features,), sigma_init)) 47 | 48 | def forward(self, input): 49 | self.epsilon_input.normal_() 50 | self.epsilon_output.normal_() 51 | 52 | func = lambda x: torch.sign(x) * torch.sqrt(torch.abs(x)) 53 | eps_in = func(self.epsilon_input.data) 54 | eps_out = func(self.epsilon_output.data) 55 | 56 | bias = self.bias 57 | if bias is not None: 58 | bias = bias + self.sigma_bias * eps_out.t() 59 | noise_v = torch.mul(eps_in, eps_out) 60 | return F.linear(input, self.weight + self.sigma_weight * noise_v, bias) 61 | 62 | 63 | class DQN(nn.Module): 64 | def __init__(self, input_shape, n_actions): 65 | super(DQN, self).__init__() 66 | 67 | self.fc = nn.Sequential( 68 | nn.Linear(input_shape, 512), 69 | nn.ReLU(), 70 | nn.Linear(512, n_actions) 71 | ) 72 | 73 | def forward(self, x): 74 | fx = x.float() / 256 75 | return self.fc(fx) -------------------------------------------------------------------------------- /flappyb/lib/dqn_rainbow.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from environment.environment import Environment 3 | import ptan 4 | import argparse 5 | import numpy as np 6 | 7 | import torch 8 | import 
torch.nn as nn 9 | import torch.nn.functional as F 10 | 11 | import torch.optim as optim 12 | 13 | from tensorboardX import SummaryWriter 14 | 15 | import dqn as dqn_model 16 | import common 17 | 18 | # n-step 19 | REWARD_STEPS = 2 20 | 21 | # priority replay 22 | PRIO_REPLAY_ALPHA = 0.6 23 | BETA_START = 0.4 24 | BETA_FRAMES = 100000 25 | 26 | # C51 27 | Vmax = 10 28 | Vmin = -10 29 | N_ATOMS = 51 30 | DELTA_Z = (Vmax - Vmin) / (N_ATOMS - 1) 31 | 32 | 33 | class RainbowDQN(nn.Module): 34 | def __init__(self, input_shape, n_actions): 35 | super(RainbowDQN, self).__init__() 36 | 37 | self.fc_val = nn.Sequential( 38 | dqn_model.NoisyLinear(input_shape[0], 256), 39 | nn.ReLU(), 40 | dqn_model.NoisyLinear(256, N_ATOMS) 41 | ) 42 | 43 | self.fc_adv = nn.Sequential( 44 | dqn_model.NoisyLinear(input_shape[0], 256), 45 | nn.ReLU(), 46 | dqn_model.NoisyLinear(256, n_actions * N_ATOMS) 47 | ) 48 | 49 | self.register_buffer("supports", torch.arange(Vmin, Vmax+DELTA_Z, DELTA_Z)) 50 | self.softmax = nn.Softmax(dim=1) 51 | 52 | def forward(self, x): 53 | batch_size = x.size()[0] 54 | fx = x.float() / 256 55 | val_out = self.fc_val(fx).view(batch_size, 1, N_ATOMS) 56 | adv_out = self.fc_adv(fx).view(batch_size, -1, N_ATOMS) 57 | adv_mean = adv_out.mean(dim=1, keepdim=True) 58 | return val_out + (adv_out - adv_mean) 59 | 60 | def both(self, x): 61 | cat_out = self(x) 62 | probs = self.apply_softmax(cat_out) 63 | weights = probs * self.supports 64 | res = weights.sum(dim=2) 65 | return cat_out, res 66 | 67 | def qvals(self, x): 68 | return self.both(x)[1] 69 | 70 | def apply_softmax(self, t): 71 | return self.softmax(t.view(-1, N_ATOMS)).view(t.size()) 72 | 73 | 74 | def calc_loss(batch, batch_weights, net, tgt_net, gamma, device="cpu"): 75 | states, actions, rewards, dones, next_states = common.unpack_batch(batch) 76 | batch_size = len(batch) 77 | 78 | states_v = torch.tensor(states).to(device) 79 | actions_v = torch.tensor(actions).to(device) 80 | next_states_v = torch.tensor(next_states).to(device) 81 | batch_weights_v = torch.tensor(batch_weights).to(device) 82 | 83 | # next state distribution 84 | # dueling arch -- actions from main net, distr from tgt_net 85 | 86 | # calc at once both next and cur states 87 | distr_v, qvals_v = net.both(torch.cat((states_v, next_states_v))) 88 | next_qvals_v = qvals_v[batch_size:] 89 | distr_v = distr_v[:batch_size] 90 | 91 | next_actions_v = next_qvals_v.max(1)[1] 92 | next_distr_v = tgt_net(next_states_v) 93 | next_best_distr_v = next_distr_v[range(batch_size), next_actions_v.data] 94 | next_best_distr_v = tgt_net.apply_softmax(next_best_distr_v) 95 | next_best_distr = next_best_distr_v.data.cpu().numpy() 96 | 97 | dones = dones.astype(np.bool) 98 | 99 | # project our distribution using Bellman update 100 | proj_distr = common.distr_projection(next_best_distr, rewards, dones, Vmin, Vmax, N_ATOMS, gamma) 101 | 102 | # calculate net output 103 | state_action_values = distr_v[range(batch_size), actions_v.data] 104 | state_log_sm_v = F.log_softmax(state_action_values, dim=1) 105 | proj_distr_v = torch.tensor(proj_distr).to(device) 106 | 107 | loss_v = -state_log_sm_v * proj_distr_v 108 | loss_v = batch_weights_v * loss_v.sum(dim=1) 109 | return loss_v.mean(), loss_v + 1e-5 110 | 111 | 112 | if __name__ == "__main__": 113 | params = common.HYPERPARAMS['pong'] 114 | params['epsilon_frames'] *= 2 115 | parser = argparse.ArgumentParser() 116 | parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda") 117 | args = parser.parse_args() 118 | device 
= torch.device("cuda" if args.cuda else "cpu") 119 | 120 | env = gym.make(params['env_name']) 121 | env = ptan.common.wrappers.wrap_dqn(env) 122 | 123 | writer = SummaryWriter(comment="-" + params['run_name'] + "-rainbow") 124 | net = RainbowDQN(env.observation_space.shape, env.action_space.n).to(device) 125 | tgt_net = ptan.agent.TargetNet(net) 126 | agent = ptan.agent.DQNAgent(lambda x: net.qvals(x), ptan.actions.ArgmaxActionSelector(), device=device) 127 | 128 | exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=REWARD_STEPS) 129 | buffer = ptan.experience.PrioritizedReplayBuffer(exp_source, params['replay_size'], PRIO_REPLAY_ALPHA) 130 | optimizer = optim.Adam(net.parameters(), lr=params['learning_rate']) 131 | 132 | frame_idx = 0 133 | beta = BETA_START 134 | 135 | with common.RewardTracker(writer, params['stop_reward']) as reward_tracker: 136 | while True: 137 | frame_idx += 1 138 | buffer.populate(1) 139 | beta = min(1.0, BETA_START + frame_idx * (1.0 - BETA_START) / BETA_FRAMES) 140 | 141 | new_rewards = exp_source.pop_total_rewards() 142 | if new_rewards: 143 | if reward_tracker.reward(new_rewards[0], frame_idx): 144 | break 145 | 146 | if len(buffer) < params['replay_initial']: 147 | continue 148 | 149 | optimizer.zero_grad() 150 | batch, batch_indices, batch_weights = buffer.sample(params['batch_size'], beta) 151 | loss_v, sample_prios_v = calc_loss(batch, batch_weights, net, tgt_net.target_model, 152 | params['gamma'] ** REWARD_STEPS, device=device) 153 | loss_v.backward() 154 | optimizer.step() 155 | buffer.update_priorities(batch_indices, sample_prios_v.data.cpu().numpy()) 156 | 157 | if frame_idx % params['target_net_sync'] == 0: 158 | tgt_net.sync() 159 | -------------------------------------------------------------------------------- /flappyb/lib/ppo_model.py: -------------------------------------------------------------------------------- 1 | import ptan 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | 6 | HID_SIZE = 64 7 | 8 | 9 | class ModelActor(nn.Module): 10 | def __init__(self, obs_size, act_size): 11 | super(ModelActor, self).__init__() 12 | 13 | self.mu = nn.Sequential( 14 | nn.Linear(obs_size, HID_SIZE), 15 | nn.Tanh(), 16 | nn.Linear(HID_SIZE, HID_SIZE), 17 | nn.Tanh(), 18 | nn.Linear(HID_SIZE, act_size), 19 | nn.Tanh(), 20 | ) 21 | self.logstd = nn.Parameter(torch.zeros(act_size)) 22 | 23 | def forward(self, x): 24 | return self.mu(x) 25 | 26 | 27 | class ModelCritic(nn.Module): 28 | def __init__(self, obs_size): 29 | super(ModelCritic, self).__init__() 30 | 31 | self.value = nn.Sequential( 32 | nn.Linear(obs_size, HID_SIZE), 33 | nn.ReLU(), 34 | nn.Linear(HID_SIZE, HID_SIZE), 35 | nn.ReLU(), 36 | nn.Linear(HID_SIZE, 1), 37 | ) 38 | 39 | def forward(self, x): 40 | return self.value(x) 41 | 42 | 43 | class AgentA2C(ptan.agent.BaseAgent): 44 | def __init__(self, net, device="cpu"): 45 | self.net = net 46 | self.device = device 47 | 48 | def __call__(self, states, agent_states): 49 | states_v = ptan.agent.float32_preprocessor(states).to(self.device) 50 | 51 | mu_v = self.net(states_v) 52 | mu = mu_v.data.cpu().numpy() 53 | logstd = self.net.logstd.data.cpu().numpy() 54 | actions = mu + np.exp(logstd) * np.random.normal(size=logstd.shape) 55 | actions = np.clip(actions, -1, 1) 56 | return actions, agent_states 57 | -------------------------------------------------------------------------------- /flappyb/models/cross_entropy/batchsize=100-hiddensize=256-lr=0.01-gamma=.9-PART=240.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/models/cross_entropy/batchsize=100-hiddensize=256-lr=0.01-gamma=.9-PART=240.pt -------------------------------------------------------------------------------- /flappyb/models/dqn/dqn-expdecay=0.99995-gamma=.9-batchsize=20-nn=512-lr=0.001-normalization-LOADED=HARDCORE-6300-lrMax=0.4-nextPipe-HELL-PART=1000.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/models/dqn/dqn-expdecay=0.99995-gamma=.9-batchsize=20-nn=512-lr=0.001-normalization-LOADED=HARDCORE-6300-lrMax=0.4-nextPipe-HELL-PART=1000.h5 -------------------------------------------------------------------------------- /flappyb/models/dqn/dqn-expdecay=0.99995-gamma=.9-batchsize=20-nn=512-lr=0.001-normalization-PART=6650.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/models/dqn/dqn-expdecay=0.99995-gamma=.9-batchsize=20-nn=512-lr=0.001-normalization-PART=6650.h5 -------------------------------------------------------------------------------- /flappyb/models/dqn/dqn-expdecay=0.999995-gamma=.9-batchsize=20-nn=512-lr=0.001-normalization-HARDCORE-PART=6300.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/models/dqn/dqn-expdecay=0.999995-gamma=.9-batchsize=20-nn=512-lr=0.001-normalization-HARDCORE-PART=6300.h5 -------------------------------------------------------------------------------- /flappyb/models/flappyb-test-the-rainbow254: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/models/flappyb-test-the-rainbow254 -------------------------------------------------------------------------------- /flappyb/models/flappyb-test-the-rainbow350: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/models/flappyb-test-the-rainbow350 -------------------------------------------------------------------------------- /flappyb/models/flappyb-test-the-rainbow87: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/models/flappyb-test-the-rainbow87 -------------------------------------------------------------------------------- /flappyb/play_dqn_rainbow.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from environment.environment import Environment 3 | import time 4 | import numpy as np 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | import collections 10 | 11 | import ptan 12 | from lib import dqn_model 13 | 14 | 15 | MODEL_NAME = "flappyb-test-the-rainbow350" 16 | Vmax = 10 17 | Vmin = -10 18 | N_ATOMS = 51 19 | DELTA_Z = (Vmax - Vmin) / (N_ATOMS - 1) 20 | NUMBER_NEURONS = 512 21 | 22 | 23 | class 
RainbowDQN(nn.Module): 24 | def __init__(self, input_shape, n_actions): 25 | super(RainbowDQN, self).__init__() 26 | 27 | self.fc_val = nn.Sequential( 28 | dqn_model.NoisyLinear(input_shape, NUMBER_NEURONS), 29 | nn.ReLU(), 30 | dqn_model.NoisyLinear(NUMBER_NEURONS, N_ATOMS) 31 | ) 32 | 33 | self.fc_adv = nn.Sequential( 34 | dqn_model.NoisyLinear(input_shape, NUMBER_NEURONS), 35 | nn.ReLU(), 36 | dqn_model.NoisyLinear(NUMBER_NEURONS, n_actions * N_ATOMS) 37 | ) 38 | 39 | self.register_buffer("supports", torch.arange(Vmin, Vmax+DELTA_Z, DELTA_Z)) 40 | self.softmax = nn.Softmax(dim=1) 41 | 42 | def forward(self, x): 43 | batch_size = x.size()[0] 44 | fx = x.float() / NUMBER_NEURONS 45 | val_out = self.fc_val(fx).view(batch_size, 1, N_ATOMS) 46 | adv_out = self.fc_adv(fx).view(batch_size, -1, N_ATOMS) 47 | adv_mean = adv_out.mean(dim=1, keepdim=True) 48 | return val_out + (adv_out - adv_mean) 49 | 50 | def both(self, x): 51 | cat_out = self(x) 52 | probs = self.apply_softmax(cat_out) 53 | weights = probs * self.supports 54 | res = weights.sum(dim=2) 55 | return cat_out, res 56 | 57 | def qvals(self, x): 58 | return self.both(x)[1] 59 | 60 | def apply_softmax(self, t): 61 | return self.softmax(t.view(-1, N_ATOMS)).view(t.size()) 62 | 63 | 64 | if __name__ == "__main__": 65 | 66 | env = Environment(draw=True, fps=1, debug=True, 67 | dist_to_pipe=50, dist_between_pipes=180, obs_this_pipe=True) 68 | 69 | net = RainbowDQN(env.observation_space.n, env.action_space.n) 70 | net.load_state_dict(torch.load("models/" + MODEL_NAME, map_location=lambda storage, loc: storage)) 71 | 72 | agent = ptan.agent.DQNAgent(lambda x: net.qvals(x), ptan.actions.ArgmaxActionSelector()) 73 | 74 | for i in range(10): 75 | state = env.reset() 76 | total_reward = 0.0 77 | c = collections.Counter() 78 | 79 | while True: 80 | start_ts = time.time() 81 | state_v = torch.tensor(np.array([state], copy=False)) 82 | q_vals = agent(state_v) #.data.numpy()[0] 83 | action = q_vals[0][0] 84 | print(action) 85 | 86 | c[action] += 1 87 | state, reward, done, _ = env.step(action) 88 | total_reward += reward 89 | if done: 90 | break 91 | print("Total reward: %.2f" % total_reward) 92 | print("Action counts:", c) 93 | -------------------------------------------------------------------------------- /flappyb/play_ppo.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import ptan 4 | 5 | from lib import ppo_model as model 6 | from environment.environment import Environment 7 | 8 | # MODEL_NAME = "flappyb-test-the-rainbow350" 9 | MODEL_NAME = "best_+131.310_576000.dat" 10 | 11 | env = env = Environment(draw=True, fps=10, debug=False, 12 | dist_to_pipe=50, dist_between_pipes=180, obs_this_pipe=True) 13 | 14 | net_act = model.ModelActor(env.observation_space.n, 15 | env.action_space.n).to("cpu") 16 | net_act.load_state_dict(torch.load("saves/ppo-test-flappyb/" + MODEL_NAME, map_location=lambda storage, loc: storage)) 17 | 18 | rewards = 0.0 19 | steps = 0 20 | for _ in range(5): 21 | obs = env.reset() 22 | while True: 23 | obs_v = ptan.agent.float32_preprocessor([obs]).to("cpu") 24 | mu_v = net_act(obs_v)[0] 25 | action = mu_v.squeeze(dim=0).data.cpu().numpy() 26 | action = np.clip(action, -1, 1) 27 | obs, reward, done, _ = env.step(action) 28 | rewards += reward 29 | steps += 1 30 | if done: 31 | break 32 | -------------------------------------------------------------------------------- /flappyb/play_self.py: 
-------------------------------------------------------------------------------- 1 | from environment.environment import Environment 2 | 3 | 4 | class Agent: 5 | 6 | def __init__(self): 7 | self.total_reward = 0 8 | 9 | def step(self, env): 10 | env.get_observation_space() 11 | action = env.get_action_random() 12 | obs, reward, is_done, _ = env.step(action) 13 | self.total_reward += reward 14 | 15 | 16 | # HUMAN PLAYS 17 | env = Environment(draw=True, fps=20, debug=True, dist_to_pipe=40, 18 | dist_between_pipes=150, obs_this_pipe=True) 19 | env.run_human_game() 20 | 21 | 22 | # RANDOM AGENT 23 | # agent = Agent() 24 | # env = Environment(True, 10) 25 | 26 | # for i in range(10): 27 | # env.reset() 28 | # while not env.is_done: 29 | # agent.step(env) 30 | 31 | # print("Total reward = {}".format(agent.total_reward)) 32 | -------------------------------------------------------------------------------- /flappyb/ppo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import math 4 | import ptan 5 | import time 6 | import gym 7 | import argparse 8 | from tensorboardX import SummaryWriter 9 | 10 | from lib import ppo_model as model 11 | from environment.environment import Environment 12 | 13 | import numpy as np 14 | import torch 15 | import torch.optim as optim 16 | import torch.nn.functional as F 17 | 18 | 19 | GAMMA = 0.99 20 | GAE_LAMBDA = 0.95 21 | 22 | TRAJECTORY_SIZE = 2049 23 | LEARNING_RATE_ACTOR = 1e-4 24 | LEARNING_RATE_CRITIC = 1e-3 25 | 26 | PPO_EPS = 0.2 27 | PPO_EPOCHES = 10 28 | PPO_BATCH_SIZE = 64 29 | 30 | TEST_ITERS = 1000 31 | 32 | 33 | def test_net(net, env, count=10, device="cpu"): 34 | rewards = 0.0 35 | steps = 0 36 | for _ in range(count): 37 | obs = env.reset() 38 | while True: 39 | obs_v = ptan.agent.float32_preprocessor([obs]).to(device) 40 | mu_v = net(obs_v)[0] 41 | action = mu_v.squeeze(dim=0).data.cpu().numpy() 42 | action = np.clip(action, -1, 1) 43 | obs, reward, done, _ = env.step(action) 44 | rewards += reward 45 | steps += 1 46 | if done: 47 | break 48 | return rewards / count, steps / count 49 | 50 | 51 | def calc_logprob(mu_v, logstd_v, actions_v): 52 | p1 = - ((mu_v - actions_v) ** 2) / (2*torch.exp(logstd_v).clamp(min=1e-3)) 53 | p2 = - torch.log(torch.sqrt(2 * math.pi * torch.exp(logstd_v))) 54 | return p1 + p2 55 | 56 | 57 | def calc_adv_ref(trajectory, net_crt, states_v, device="cpu"): 58 | """ 59 | By trajectory calculate advantage and 1-step ref value 60 | :param trajectory: trajectory list 61 | :param net_crt: critic network 62 | :param states_v: states tensor 63 | :return: tuple with advantage numpy array and reference values 64 | """ 65 | values_v = net_crt(states_v) 66 | values = values_v.squeeze().data.cpu().numpy() 67 | # generalized advantage estimator: smoothed version of the advantage 68 | last_gae = 0.0 69 | result_adv = [] 70 | result_ref = [] 71 | for val, next_val, (exp,) in zip(reversed(values[:-1]), reversed(values[1:]), 72 | reversed(trajectory[:-1])): 73 | if exp.done: 74 | delta = exp.reward - val 75 | last_gae = delta 76 | else: 77 | delta = exp.reward + GAMMA * next_val - val 78 | last_gae = delta + GAMMA * GAE_LAMBDA * last_gae 79 | result_adv.append(last_gae) 80 | result_ref.append(last_gae + val) 81 | 82 | adv_v = torch.FloatTensor(list(reversed(result_adv))).to(device) 83 | ref_v = torch.FloatTensor(list(reversed(result_ref))).to(device) 84 | return adv_v, ref_v 85 | 86 | 87 | if __name__ == "__main__": 88 | # parser = argparse.ArgumentParser() 89 | # 
parser.add_argument("--cuda", default=False, action='store_true', help='Enable CUDA') 90 | # parser.add_argument("-n", "--name", required=True, help="Name of the run") 91 | # parser.add_argument("-e", "--env", default=ENV_ID, help="Environment id, default=" + ENV_ID) 92 | # args = parser.parse_args() 93 | 94 | name = "test-flappyb" 95 | 96 | # device = torch.device("cuda" if args.cuda else "cpu") 97 | device = torch.device("cpu") 98 | 99 | save_path = os.path.join("saves", "ppo-" + name) 100 | os.makedirs(save_path, exist_ok=True) 101 | 102 | env = env = Environment(draw=False, fps=1, debug=False, 103 | dist_to_pipe=50, dist_between_pipes=180, obs_this_pipe=True) 104 | test_env = env = Environment(draw=False, fps=1, debug=False, 105 | dist_to_pipe=50, dist_between_pipes=180, obs_this_pipe=True) 106 | 107 | net_act = model.ModelActor(env.observation_space.n, env.action_space.n).to(device) 108 | net_crt = model.ModelCritic(env.observation_space.n).to(device) 109 | print(net_act) 110 | print(net_crt) 111 | 112 | writer = SummaryWriter(comment="-ppo_" + name) 113 | agent = model.AgentA2C(net_act, device=device) 114 | exp_source = ptan.experience.ExperienceSource(env, agent, steps_count=1) 115 | 116 | opt_act = optim.Adam(net_act.parameters(), lr=LEARNING_RATE_ACTOR) 117 | opt_crt = optim.Adam(net_crt.parameters(), lr=LEARNING_RATE_CRITIC) 118 | 119 | trajectory = [] 120 | best_reward = None 121 | with ptan.common.utils.RewardTracker(writer) as tracker: 122 | for step_idx, exp in enumerate(exp_source): 123 | rewards_steps = exp_source.pop_rewards_steps() 124 | if rewards_steps: 125 | rewards, steps = zip(*rewards_steps) 126 | writer.add_scalar("episode_steps", np.mean(steps), step_idx) 127 | tracker.reward(np.mean(rewards), step_idx) 128 | 129 | if step_idx % TEST_ITERS == 0: 130 | ts = time.time() 131 | rewards, steps = test_net(net_act, test_env, device=device) 132 | print("Test done in %.2f sec, reward %.3f, steps %d" % ( 133 | time.time() - ts, rewards, steps)) 134 | writer.add_scalar("test_reward", rewards, step_idx) 135 | writer.add_scalar("test_steps", steps, step_idx) 136 | if best_reward is None or best_reward < rewards: 137 | if best_reward is not None: 138 | print("Best reward updated: %.3f -> %.3f" % (best_reward, rewards)) 139 | name = "best_%+.3f_%d.dat" % (rewards, step_idx) 140 | fname = os.path.join(save_path, name) 141 | torch.save(net_act.state_dict(), fname) 142 | best_reward = rewards 143 | 144 | trajectory.append(exp) 145 | if len(trajectory) < TRAJECTORY_SIZE: 146 | continue 147 | 148 | traj_states = [t[0].state for t in trajectory] 149 | traj_actions = [t[0].action for t in trajectory] 150 | traj_states_v = torch.FloatTensor(traj_states).to(device) 151 | traj_actions_v = torch.FloatTensor(traj_actions).to(device) 152 | traj_adv_v, traj_ref_v = calc_adv_ref(trajectory, net_crt, traj_states_v, device=device) 153 | mu_v = net_act(traj_states_v) 154 | old_logprob_v = calc_logprob(mu_v, net_act.logstd, traj_actions_v) 155 | 156 | # normalize advantages 157 | traj_adv_v = (traj_adv_v - torch.mean(traj_adv_v)) / torch.std(traj_adv_v) 158 | 159 | # drop last entry from the trajectory, an our adv and ref value calculated without it 160 | trajectory = trajectory[:-1] 161 | old_logprob_v = old_logprob_v[:-1].detach() 162 | 163 | sum_loss_value = 0.0 164 | sum_loss_policy = 0.0 165 | count_steps = 0 166 | 167 | for epoch in range(PPO_EPOCHES): 168 | for batch_ofs in range(0, len(trajectory), PPO_BATCH_SIZE): 169 | states_v = traj_states_v[batch_ofs:batch_ofs + PPO_BATCH_SIZE] 170 | 
actions_v = traj_actions_v[batch_ofs:batch_ofs + PPO_BATCH_SIZE] 171 | batch_adv_v = traj_adv_v[batch_ofs:batch_ofs + PPO_BATCH_SIZE].unsqueeze(-1) 172 | batch_ref_v = traj_ref_v[batch_ofs:batch_ofs + PPO_BATCH_SIZE] 173 | batch_old_logprob_v = old_logprob_v[batch_ofs:batch_ofs + PPO_BATCH_SIZE] 174 | 175 | # critic training 176 | opt_crt.zero_grad() 177 | value_v = net_crt(states_v) 178 | loss_value_v = F.mse_loss(value_v.squeeze(-1), batch_ref_v) 179 | loss_value_v.backward() 180 | opt_crt.step() 181 | 182 | # actor training 183 | opt_act.zero_grad() 184 | mu_v = net_act(states_v) 185 | logprob_pi_v = calc_logprob(mu_v, net_act.logstd, actions_v) 186 | ratio_v = torch.exp(logprob_pi_v - batch_old_logprob_v) 187 | surr_obj_v = batch_adv_v * ratio_v 188 | clipped_surr_v = batch_adv_v * torch.clamp(ratio_v, 1.0 - PPO_EPS, 1.0 + PPO_EPS) 189 | loss_policy_v = -torch.min(surr_obj_v, clipped_surr_v).mean() 190 | loss_policy_v.backward() 191 | opt_act.step() 192 | 193 | sum_loss_value += loss_value_v.item() 194 | sum_loss_policy += loss_policy_v.item() 195 | count_steps += 1 196 | 197 | trajectory.clear() 198 | writer.add_scalar("advantage", traj_adv_v.mean().item(), step_idx) 199 | writer.add_scalar("values", traj_ref_v.mean().item(), step_idx) 200 | writer.add_scalar("loss_policy", sum_loss_policy / count_steps, step_idx) 201 | writer.add_scalar("loss_value", sum_loss_value / count_steps, step_idx) 202 | -------------------------------------------------------------------------------- /flappyb/saves/ppo-test-flappyb/best_+10.400_555000.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/saves/ppo-test-flappyb/best_+10.400_555000.dat -------------------------------------------------------------------------------- /flappyb/saves/ppo-test-flappyb/best_+11.270_556000.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/saves/ppo-test-flappyb/best_+11.270_556000.dat -------------------------------------------------------------------------------- /flappyb/saves/ppo-test-flappyb/best_+131.310_576000.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/saves/ppo-test-flappyb/best_+131.310_576000.dat -------------------------------------------------------------------------------- /flappyb/saves/ppo-test-flappyb/best_+20.470_558000.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/saves/ppo-test-flappyb/best_+20.470_558000.dat -------------------------------------------------------------------------------- /flappyb/saves/ppo-test-flappyb/best_+4.650_165000.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/saves/ppo-test-flappyb/best_+4.650_165000.dat -------------------------------------------------------------------------------- /flappyb/saves/ppo-test-flappyb/best_+4.860_370000.dat: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/saves/ppo-test-flappyb/best_+4.860_370000.dat -------------------------------------------------------------------------------- /flappyb/saves/ppo-test-flappyb/best_+44.070_560000.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/saves/ppo-test-flappyb/best_+44.070_560000.dat -------------------------------------------------------------------------------- /flappyb/saves/ppo-test-flappyb/best_+44.560_561000.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/saves/ppo-test-flappyb/best_+44.560_561000.dat -------------------------------------------------------------------------------- /flappyb/saves/ppo-test-flappyb/best_+5.290_475000.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/saves/ppo-test-flappyb/best_+5.290_475000.dat -------------------------------------------------------------------------------- /flappyb/saves/ppo-test-flappyb/best_+5.530_495000.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/saves/ppo-test-flappyb/best_+5.530_495000.dat -------------------------------------------------------------------------------- /flappyb/saves/ppo-test-flappyb/best_+5.740_516000.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/saves/ppo-test-flappyb/best_+5.740_516000.dat -------------------------------------------------------------------------------- /flappyb/saves/ppo-test-flappyb/best_+5.820_538000.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/saves/ppo-test-flappyb/best_+5.820_538000.dat -------------------------------------------------------------------------------- /flappyb/saves/ppo-test-flappyb/best_+56.790_570000.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/saves/ppo-test-flappyb/best_+56.790_570000.dat -------------------------------------------------------------------------------- /flappyb/saves/ppo-test-flappyb/best_+6.250_539000.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/saves/ppo-test-flappyb/best_+6.250_539000.dat -------------------------------------------------------------------------------- /flappyb/saves/ppo-test-flappyb/best_+6.820_542000.dat: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/saves/ppo-test-flappyb/best_+6.820_542000.dat -------------------------------------------------------------------------------- /flappyb/saves/ppo-test-flappyb/best_+7.200_547000.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/saves/ppo-test-flappyb/best_+7.200_547000.dat -------------------------------------------------------------------------------- /flappyb/saves/ppo-test-flappyb/best_+8.690_550000.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/saves/ppo-test-flappyb/best_+8.690_550000.dat -------------------------------------------------------------------------------- /old_agents/cross_entropy.py: -------------------------------------------------------------------------------- 1 | # Nice entry, but cross entropy advanced is a whole lot better 2 | 3 | import gym 4 | from collections import namedtuple 5 | import numpy as np 6 | from tensorboardX import SummaryWriter 7 | 8 | import torch 9 | import torch.nn as nn 10 | import torch.optim as optim 11 | 12 | from flappyb.environment import Environment 13 | 14 | NAME = 'cross-entropy-batchsize:16-hiddensize1:128' 15 | WRITE = False 16 | DRAW = True 17 | 18 | HIDDEN_SIZE = 128 # num of neurons in hidden layer 19 | BATCH_SIZE = 16 # number of episodes in a batch 20 | PERCENTILE_THROW_AWAY = 70 # percentage of episodes in batch to not learn from 21 | 22 | 23 | class Net(nn.Module): 24 | def __init__(self, obs_size, hidden_size, n_actions): 25 | super(Net, self).__init__() 26 | self.net = nn.Sequential( 27 | nn.Linear(obs_size, hidden_size), 28 | nn.ReLU(), 29 | nn.Linear(hidden_size, n_actions) 30 | ) 31 | # output is a probability distribution 32 | def forward(self, x): # ... over the actions 33 | return self.net(x) 34 | 35 | 36 | # helpers to represent single steps and episodes from the actor 37 | Episode = namedtuple('Episode', field_names=['reward', 'steps']) 38 | EpisodeStep = namedtuple('EpisodeStep', field_names=['observation', 'action']) 39 | 40 | 41 | def iterate_batches(env, net, batch_size): 42 | 43 | batch = [] 44 | episode_reward = 0.0 45 | episode_steps = [] 46 | obs = env.reset() 47 | sm = nn.Softmax(dim=1) # converts net output (raw action score) 48 | # ... 
to probability distribution 49 | while True: 50 | obs_v = torch.FloatTensor([obs]) # converts observation to tensor 51 | act_probs_v = sm(net(obs_v)) # then generate action probability policy 52 | act_probs = act_probs_v.data.numpy()[0] # convert tensor back to array 53 | 54 | # choose an action according to the available probability 55 | action = np.random.choice(len(act_probs), p=act_probs) 56 | next_obs, reward, is_done, _ = env.step(action) 57 | 58 | # use obs that we started whith in this episode 59 | episode_reward += reward 60 | episode_steps.append(EpisodeStep(observation=obs, action=action)) 61 | 62 | # when episode (one single game) ends 63 | if is_done: 64 | # remember episode steps and clear environment 65 | batch.append(Episode(reward=episode_reward, steps=episode_steps)) 66 | episode_reward = 0.0 67 | episode_steps = [] 68 | next_obs = env.reset() 69 | 70 | # when batch is complete (multiple episodes) pass it to the learning loop 71 | if len(batch) == batch_size: 72 | yield batch 73 | batch = [] 74 | 75 | obs = next_obs 76 | 77 | 78 | def filter_batch(batch, percentile): 79 | rewards = list(map(lambda s: s.reward, batch)) 80 | reward_bound = np.percentile(rewards, percentile) 81 | reward_mean = float(np.mean(rewards)) 82 | 83 | train_obs = [] 84 | train_act = [] 85 | 86 | for example in batch: 87 | if example.reward < reward_bound: 88 | continue # filters episodes 89 | train_obs.extend(map(lambda step: step.observation, example.steps)) 90 | train_act.extend(map(lambda step: step.action, example.steps)) 91 | 92 | train_obs_v = torch.FloatTensor(train_obs) 93 | train_act_v = torch.LongTensor(train_act) 94 | 95 | # return elite episodes as tensors 96 | return train_obs_v, train_act_v, reward_bound, reward_mean 97 | 98 | 99 | if __name__ == "__main__": 100 | 101 | env = Environment(DRAW) 102 | 103 | obs_size = env.get_observation_size() 104 | n_actions = env.get_action_size() 105 | 106 | net = Net(obs_size, HIDDEN_SIZE, n_actions) 107 | 108 | objective = nn.CrossEntropyLoss() # main function to teach net 109 | optimizer = optim.Adam(params=net.parameters(), lr=0.01) 110 | writer = None 111 | if WRITE: 112 | writer = SummaryWriter(comment=NAME) 113 | 114 | # actual training loop 115 | for iter_no, batch in enumerate(iterate_batches(env, net, BATCH_SIZE)): 116 | # filter bad episodes so only the best episodes of a batch remain 117 | obs_v, acts_v, reward_b, reward_m = filter_batch(batch, PERCENTILE_THROW_AWAY) 118 | optimizer.zero_grad() 119 | 120 | action_scores_v = net(obs_v) # pass obs to network again and retreive score 121 | # calculate cross entropy between net output and actions 122 | # ... 
the agent took inorder to learn the good actions 123 | loss_v = objective(action_scores_v, acts_v) # calculate loss function 124 | loss_v.backward() # apply gradient descent (not sure if this statement is correct) 125 | optimizer.step() # optimize network 126 | 127 | print("%d: loss=%.3f, reward_mean=%.1f, reward_bound=%.1f" % (iter_no, loss_v.item(), reward_m, reward_b)) 128 | if WRITE: 129 | writer.add_scalar("reward_mean", reward_m, iter_no) 130 | if iter_no > 1000: 131 | print("500 steps should be sufficient") 132 | break 133 | if WRITE: 134 | writer.close() 135 | -------------------------------------------------------------------------------- /old_agents/cross_entropy_advanced.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple 3 | import numpy as np 4 | from tensorboardX import SummaryWriter 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.optim as optim 9 | 10 | from flappyb.environment import Environment 11 | 12 | HIDDEN_SIZE_1 = 256 13 | BATCH_SIZE = 100 14 | PERCENTILE = 30 15 | LEARNING_RATE = 0.01 16 | GAMMA = .99 17 | 18 | NAME = 'batchsize=100-hiddensize=256-lr=0.01-gamma=.9' 19 | NAME = 'batchsize=100-hiddensize=256-lr=0.01-gamma=.99' 20 | WRITE = False 21 | DRAW = False 22 | SAVE_MODEL = False 23 | 24 | 25 | class Net(nn.Module): 26 | def __init__(self, obs_size, n_actions): 27 | super(Net, self).__init__() 28 | self.net = nn.Sequential( 29 | nn.Linear(obs_size, HIDDEN_SIZE_1), 30 | nn.ReLU(), 31 | nn.Linear(HIDDEN_SIZE_1, HIDDEN_SIZE_1), 32 | nn.ReLU(), 33 | nn.Linear(HIDDEN_SIZE_1, n_actions) 34 | ) 35 | 36 | def forward(self, x): 37 | return self.net(x) 38 | 39 | 40 | Episode = namedtuple('Episode', field_names=['reward', 'steps']) 41 | EpisodeStep = namedtuple('EpisodeStep', field_names=['observation', 'action']) 42 | 43 | 44 | def iterate_batches(env, net, batch_size): 45 | batch = [] 46 | episode_reward = 0.0 47 | episode_steps = [] 48 | obs = env.reset() 49 | sm = nn.Softmax(dim=1) 50 | while True: 51 | obs_v = torch.FloatTensor([obs]) 52 | act_probs_v = sm(net(obs_v)) 53 | act_probs = act_probs_v.data.numpy()[0] 54 | # action = np.random.choice(len(act_probs), p=act_probs) 55 | action = env.get_action_random() 56 | 57 | next_obs, reward, is_done, _ = env.step(action) 58 | episode_reward += reward 59 | episode_steps.append(EpisodeStep(observation=obs, action=action)) 60 | if is_done: 61 | batch.append(Episode(reward=episode_reward, steps=episode_steps)) 62 | episode_reward = 0.0 63 | episode_steps = [] 64 | next_obs = env.reset() 65 | if len(batch) == batch_size: 66 | yield batch 67 | batch = [] 68 | obs = next_obs 69 | 70 | 71 | def filter_batch(batch, percentile): 72 | disc_rewards = list(map(lambda s: s.reward * (GAMMA ** len(s.steps)), batch)) 73 | # disc_rewards = list(map(lambda s: s.reward * (len(s.steps)), batch)) 74 | reward_bound = np.percentile(disc_rewards, percentile) 75 | 76 | train_obs = [] 77 | train_act = [] 78 | elite_batch = [] 79 | for example, discounted_reward in zip(batch, disc_rewards): 80 | if discounted_reward > reward_bound: 81 | train_obs.extend(map(lambda step: step.observation, example.steps)) 82 | train_act.extend(map(lambda step: step.action, example.steps)) 83 | elite_batch.append(example) 84 | 85 | return elite_batch, train_obs, train_act, reward_bound 86 | 87 | 88 | if __name__ == "__main__": 89 | random.seed(12345) 90 | env = Environment(DRAW) # activate save 91 | 92 | obs_size = env.get_observation_size() 93 | n_actions = 
env.get_action_size() 94 | 95 | net = Net(obs_size, n_actions) 96 | net.load_state_dict(torch.load('models/cross_entropy/{}-PART=240.pt'.format(NAME))) 97 | net.eval() 98 | 99 | # torch.save(net.state_dict(), 'models/cross_entropy/{}-PART=0.pt'.format(NAME)) 100 | 101 | objective = nn.CrossEntropyLoss() 102 | optimizer = optim.Adam(params=net.parameters(), lr=LEARNING_RATE) 103 | 104 | writer = None 105 | if WRITE: 106 | writer = SummaryWriter(comment=NAME) 107 | 108 | full_batch = [] 109 | for iter_no, batch in enumerate(iterate_batches(env, net, BATCH_SIZE)): 110 | reward_mean = float(np.mean(list(map(lambda s: s.reward, batch)))) 111 | full_batch, obs, acts, reward_bound = filter_batch(full_batch + batch, PERCENTILE) 112 | if not full_batch: 113 | continue 114 | obs_v = torch.FloatTensor(obs) 115 | acts_v = torch.LongTensor(acts) 116 | full_batch = full_batch[-500:] 117 | 118 | optimizer.zero_grad() 119 | action_scores_v = net(obs_v) 120 | loss_v = objective(action_scores_v, acts_v) 121 | loss_v.backward() 122 | optimizer.step() 123 | print("%d: loss=%.3f, reward_mean=%.3f, reward_bound=%.3f, batch=%d" % ( 124 | iter_no, loss_v.item(), reward_mean, reward_bound, len(full_batch))) 125 | if WRITE: 126 | writer.add_scalar("reward", reward_mean, iter_no) 127 | if (iter_no % 30 == 0) and SAVE_MODEL : 128 | torch.save(net.state_dict(), 'models/cross_entropy/{}-PART={}.pt'.format(NAME, iter_no)) 129 | pass 130 | if iter_no > 10000: 131 | print("That should be enough!") 132 | break 133 | 134 | if WRITE: 135 | writer.close() 136 | -------------------------------------------------------------------------------- /old_agents/dqn_snake_v2.py: -------------------------------------------------------------------------------- 1 | # READ ME PEASE 2 | 3 | # https://towardsdatascience.com/cartpole-introduction-to-reinforcement-learning-ed0eb5b58288 4 | # LOSSS FUNCTIONS: https://medium.com/udacity-pytorch-challengers/a-brief-overview-of-loss-functions-in-pytorch-c0ddb78068f7 5 | # BEST OPTIMIZER: https://medium.com/octavian-ai/which-optimizer-and-learning-rate-should-i-use-for-deep-learning-5acb418f9b2 6 | 7 | import random 8 | import numpy as np 9 | from collections import deque 10 | 11 | import keras 12 | from keras.models import Sequential 13 | from keras.layers import Dense 14 | from keras.optimizers import Adam 15 | from keras.models import load_model 16 | from keras.layers import Dense, Dropout, Activation 17 | 18 | from snake.environment import Environment 19 | 20 | from tensorboardX import SummaryWriter 21 | 22 | GAMMA = 0.9 # try .99 23 | LEARNING_RATE = 0.001 # deafult was 0.001 24 | LEARNING_WITH_DECAY = 0.01 25 | 26 | MEMORY_SIZE = 1000000 27 | BATCH_SIZE = 20 28 | 29 | EXPLORATION_MAX = 1 30 | EXPLORATION_MIN = 0.01 31 | EXPLORATION_DECAY = 0.99995 32 | 33 | ##################################################################################################### 34 | NAME = 'snake_dqn/-expdecay=0.99995-gamma=.9-batchsize=20-nn=512-lr=0.001-normalization' 35 | WRITE = True 36 | DRAW = False 37 | SAVE_MODEL = True 38 | LOAD_NAME = 'snake_dqn/-expdecay=0.99995-gamma=.9-batchsize=20-nn=512-lr=0.001-normalization-PART=0' 39 | ##################################################################################################### 40 | 41 | 42 | 43 | class DQNSolver: 44 | 45 | def __init__(self, observation_space, action_space, model = None): 46 | self.exploration_rate = EXPLORATION_MAX 47 | 48 | self.action_space = action_space 49 | self.memory = deque(maxlen=MEMORY_SIZE) 50 | 51 | if model is None: 52 | 
print('new model') 53 | self.model = Sequential() 54 | self.model.add(Dense(512, input_shape=(observation_space,), activation="relu")) # andere aktivierungs funktion 55 | self.model.add(Dense(512, activation="relu")) 56 | # self.model.add(Dropout(0.85)) 57 | # self.model.add(Dense(512, activation="relu")) 58 | self.model.add(Dense(self.action_space, activation="linear")) # Linear sucks? maybe try softmax 59 | self.model.compile(loss="mse", optimizer=Adam(lr=LEARNING_RATE)) # Try learning rate deacy 60 | # self.model.compile(loss="mse", optimizer=Adam(lr=LEARNING_WITH_DECAY, decay=1e-6)) 61 | else: 62 | print('saved model loaded') 63 | self.model = model 64 | 65 | def remember(self, state, action, reward, next_state, done): 66 | self.memory.append((state, action, reward, next_state, done)) 67 | 68 | def act(self, state, env): 69 | if np.random.rand() < self.exploration_rate: 70 | return env.get_action_random() 71 | q_values = self.model.predict(state) 72 | return np.argmax(q_values[0]) 73 | 74 | def act_free(self, state): 75 | q_values = self.model.predict(state) 76 | return np.argmax(q_values[0]) 77 | 78 | def experience_replay(self): 79 | if len(self.memory) < BATCH_SIZE: 80 | return 81 | batch = random.sample(self.memory, BATCH_SIZE) 82 | for state, action, reward, state_next, terminal in batch: 83 | q_update = reward 84 | if not terminal: 85 | q_update = (reward + GAMMA * np.amax(self.model.predict(state_next)[0])) 86 | q_values = self.model.predict(state) 87 | q_values[0][action] = q_update 88 | self.model.fit(state, q_values, verbose=0) 89 | self.exploration_rate *= EXPLORATION_DECAY 90 | self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate) 91 | 92 | 93 | def learn_snake(): 94 | env = Environment(DRAW, 1, False) 95 | writer = None 96 | if WRITE: 97 | writer = SummaryWriter(comment=NAME) 98 | observation_space = env.get_observation_size_buffer() 99 | action_space = env.get_action_size() 100 | 101 | #model = load_model('models/dqn/newenv/{}.h5'.format(LOAD_NAME)) 102 | dqn_solver = DQNSolver(observation_space, action_space) #, model) 103 | run = 0 104 | if SAVE_MODEL: 105 | name = '{}-PART={}'.format(NAME, run) 106 | dqn_solver.model.save('models/dqn/{}.h5'.format(name)) 107 | while True: 108 | run += 1 109 | state = env.reset() 110 | state = np.reshape(state, [1, observation_space]) 111 | step = 0 112 | reward_score = 0 113 | while True: 114 | step += 1 115 | action = dqn_solver.act(state, env) 116 | state_next, reward, terminal, info = env.step_buffer(action) 117 | reward_score += reward 118 | state_next = np.reshape(state_next, [1, observation_space]) 119 | dqn_solver.remember(state, action, reward, state_next, terminal) 120 | state = state_next 121 | if terminal: 122 | print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(reward_score)) 123 | if WRITE: 124 | writer.add_scalar("reward", reward_score, run) 125 | break 126 | dqn_solver.experience_replay() 127 | if (run % 100 == 0) and SAVE_MODEL: 128 | name = '{}-PART={}'.format(NAME, run) 129 | dqn_solver.model.save('models/dqn/{}.h5'.format(name)) 130 | if WRITE: 131 | writer.close() 132 | 133 | 134 | 135 | def play_snake(): 136 | env = Environment(True, 1, False) 137 | 138 | observation_space = env.get_observation_size_buffer() 139 | action_space = env.get_action_size() 140 | 141 | model = keras.models.load_model('models/dqn/{}.h5'.format(LOAD_NAME)) 142 | dqn_solver = DQNSolver(observation_space, action_space, model) 143 | 144 | for i in range(20): 145 | state = 
env.reset() 146 | state = np.reshape(state, [1, observation_space]) 147 | is_done = False 148 | while not is_done: 149 | action = dqn_solver.act_free(state) 150 | # action = env.get_action_random() 151 | state_next, reward, terminal, info = env.step_buffer(action) 152 | is_done = terminal 153 | state = np.reshape(state_next, [1, observation_space]) 154 | 155 | 156 | 157 | if __name__ == "__main__": 158 | # learn_snake() 159 | play_snake() 160 | 161 | print('Jobe Done!') 162 | -------------------------------------------------------------------------------- /old_agents/q_iteration.py: -------------------------------------------------------------------------------- 1 | # value_iteration does the same but converges faster 2 | 3 | import gym 4 | from flappyb.environment import Environment 5 | import collections 6 | from tensorboardX import SummaryWriter 7 | import random 8 | import numpy as np 9 | 10 | GAMMA = 0.9 11 | TEST_EPISODES = 5 12 | TRAINING_STEPS = 3000 13 | 14 | WRITE = False 15 | DRAW_TRAINING = False 16 | DRAW = False 17 | NAME = 'q-iteration-gamma:0.2-trainingsteps:3000-newenv-roundto:1' 18 | 19 | 20 | class Agent: 21 | def __init__(self): 22 | # self.env = gym.make(ENV_NAME) 23 | self.env = Environment(DRAW_TRAINING) 24 | self.state = self.env.reset() 25 | self.rewards = collections.defaultdict(float) 26 | self.transits = collections.defaultdict(collections.Counter) 27 | self.values = collections.defaultdict(float) 28 | 29 | def play_n_random_steps(self, count): 30 | # rand = random.uniform(0.2, 0.8) # more or less and he does nothing 31 | for _ in range(count): 32 | # if _ % 1000 == 0: 33 | # rand = random.uniform(0.2, 0.8) 34 | # print(rand) 35 | # action = np.random.choice((0, 1), 1, p=(rand, 1 - rand)) 36 | # action = action.item(0) 37 | action = self.env.get_action_random() 38 | new_state, reward, is_done, _ = self.env.step(action) 39 | self.rewards[(self.state, action, new_state)] = reward 40 | self.transits[(self.state, action)][new_state] += 1 41 | self.state = self.env.reset() if is_done else new_state 42 | # print(len(self.transits)) 43 | 44 | def select_action(self, state): 45 | best_action, best_value = None, None 46 | for action in range(self.env.get_action_size()): 47 | action_value = self.values[(state, action)] 48 | if best_value is None or best_value < action_value: 49 | best_value = action_value 50 | best_action = action 51 | return best_action 52 | 53 | def play_episode(self, env): 54 | total_reward = 0.0 55 | state = env.reset() 56 | while True: 57 | action = self.select_action(state) 58 | new_state, reward, is_done, _ = env.step(action) 59 | self.rewards[(state, action, new_state)] = reward 60 | self.transits[(state, action)][new_state] += 1 61 | total_reward += reward 62 | if is_done: 63 | break 64 | state = new_state 65 | return total_reward 66 | 67 | def value_iteration(self): 68 | for state in range(self.env.get_observation_size()): 69 | for action in range(self.env.get_action_size()): 70 | action_value = 0.0 71 | target_counts = self.transits[(state, action)] 72 | total = sum(target_counts.values()) 73 | for tgt_state, count in target_counts.items(): 74 | reward = self.rewards[(state, action, tgt_state)] 75 | best_action = self.select_action(tgt_state) 76 | action_value += (count / total) * (reward + GAMMA * self.values[(tgt_state, best_action)]) 77 | self.values[(state, action)] = action_value 78 | 79 | 80 | if __name__ == "__main__": 81 | test_env = Environment(DRAW) 82 | agent = Agent() 83 | writer = None 84 | if WRITE: 85 | writer = 
SummaryWriter(comment=NAME) 86 | 87 | iter_no = 0 88 | best_reward = 0.0 89 | while True: 90 | iter_no += 1 91 | print('#', iter_no) 92 | agent.play_n_random_steps(TRAINING_STEPS) 93 | agent.value_iteration() 94 | 95 | reward = 0.0 96 | for _ in range(TEST_EPISODES): 97 | reward += agent.play_episode(test_env) 98 | reward /= TEST_EPISODES 99 | if WRITE: 100 | writer.add_scalar("reward", reward, iter_no) 101 | if reward > best_reward: 102 | print("Best reward updated %.3f -> %.3f" % (best_reward, reward)) 103 | best_reward = reward 104 | if reward > 0.80: 105 | print("Solved in %d iterations!" % iter_no) 106 | break 107 | if WRITE: 108 | writer.close() 109 | -------------------------------------------------------------------------------- /old_agents/q_learning.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import collections 3 | from tensorboardX import SummaryWriter 4 | from flappyb.environment import Environment 5 | 6 | GAMMA = 0.9 7 | ALPHA = 0.2 8 | TEST_EPISODES = 20 9 | 10 | NAME = 'q-learning' 11 | WRITE = False 12 | DRAW_TRAINING = False 13 | DRAW = False 14 | 15 | class Agent: 16 | def __init__(self): 17 | self.env = Environment(DRAW_TRAINING) 18 | self.state = self.env.reset() 19 | self.values = collections.defaultdict(float) # less memory wasted, only store q-values 20 | 21 | # get s, a, r ,ns 22 | def sample_env(self): 23 | action = self.env.get_action_random() 24 | old_state = self.state 25 | new_state, reward, is_done, _ = self.env.step(action) 26 | self.state = self.env.reset() if is_done else new_state 27 | return old_state, action, reward, new_state 28 | 29 | # iterate over all action values and return the best one 30 | def best_value_and_action(self, state): 31 | best_value, best_action = None, None 32 | for action in range(self.env.get_action_size()): 33 | action_value = self.values[(state, action)] 34 | if best_value is None or best_value < action_value: 35 | best_value = action_value 36 | best_action = action 37 | return best_value, best_action 38 | 39 | # q-value is calculated for s, a and stored in table 40 | def value_update(self, s, a, r, next_s): 41 | best_v, _ = self.best_value_and_action(next_s) 42 | new_val = r + GAMMA * best_v 43 | old_val = self.values[(s, a)] 44 | self.values[(s, a)] = old_val * (1 - ALPHA) + new_val * ALPHA 45 | 46 | # value table is not altered, only measures agent 47 | def play_episode(self, env): 48 | total_reward = 0.0 49 | state = env.reset() 50 | while True: 51 | _, action = self.best_value_and_action(state) 52 | new_state, reward, is_done, _ = env.step(action) 53 | total_reward += reward 54 | if is_done: 55 | break 56 | state = new_state 57 | return total_reward 58 | 59 | if __name__ == "__main__": 60 | test_env = Environment(DRAW) 61 | agent = Agent() 62 | writer = None 63 | if WRITE: 64 | writer = SummaryWriter(comment=NAME) 65 | iter_no = 0 66 | best_reward = 0.0 67 | while True: 68 | iter_no += 1 69 | print('#', iter_no) 70 | s, a, r, next_s = agent.sample_env() 71 | agent.value_update(s, a, r, next_s) 72 | 73 | reward = 0.0 74 | for _ in range(TEST_EPISODES): 75 | reward += agent.play_episode(test_env) 76 | reward /= TEST_EPISODES 77 | 78 | if WRITE: 79 | writer.add_scalar('reward', reward, iter_no) 80 | if reward > best_reward: 81 | print('Best reward updated %.3f => %.3f' %(best_reward, reward)) 82 | best_reward = reward 83 | if reward > 0.9: 84 | print('Solved in %d iterations' %iter_no) 85 | break 86 | if WRITE: 87 | writer.close() 
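The tabular agents in old_agents (q_iteration.py and q_learning.py above, value_iteration.py below) all revolve around the same Bellman backup over a defaultdict of values; q_learning.py applies it one sampled transition at a time in Agent.value_update. The following is a minimal, self-contained sketch of that update for reference: the names td_update, q_values and n_actions are illustrative and not part of the repo, while GAMMA and ALPHA mirror the constants used in q_learning.py.

import collections

GAMMA = 0.9   # discount factor, as in q_learning.py
ALPHA = 0.2   # blending factor for the running Q estimate, as in q_learning.py

# Q-table keyed by (state, action), same layout as Agent.values
q_values = collections.defaultdict(float)

def td_update(state, action, reward, next_state, n_actions):
    # One tabular Q-learning backup:
    # Q(s,a) <- (1 - ALPHA) * Q(s,a) + ALPHA * (r + GAMMA * max_a' Q(s', a'))
    best_next = max(q_values[(next_state, a)] for a in range(n_actions))
    target = reward + GAMMA * best_next
    q_values[(state, action)] = (1 - ALPHA) * q_values[(state, action)] + ALPHA * target
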
-------------------------------------------------------------------------------- /old_agents/value_iteration.py: -------------------------------------------------------------------------------- 1 | # This sucks for flappy B 2 | # CURRENTLY NO ROUNDING 3 | # saving not needed because it just sucks 4 | # Take this for presentation, not q-learning or q-iteration 5 | 6 | import collections 7 | from tensorboardX import SummaryWriter 8 | from flappyb.environment import Environment 9 | import random 10 | import numpy as np 11 | 12 | GAMMA = .9 13 | TEST_EPISODES = 5 14 | TRAINING_STEPS = 3000 15 | 16 | # NAME = 'gamma=0.9-trainingsteps:3000-rounding=None' 17 | NAME = 'gamma=0.9-trainingsteps:3000-rounding=2' 18 | WRITE = True 19 | DRAW_TRAINING = False 20 | DRAW = False 21 | 22 | 23 | class Agent: 24 | def __init__(self): 25 | self.env = Environment(DRAW_TRAINING) 26 | self.state = self.env.reset() 27 | self.rewards = collections.defaultdict(float) 28 | self.transits = collections.defaultdict(collections.Counter) 29 | self.values = collections.defaultdict(float) 30 | 31 | def play_n_random_steps(self, count): 32 | for _ in range(count): 33 | action = self.env.get_action_random() 34 | new_state, reward, is_done, _ = self.env.step(action) 35 | self.rewards[(self.state, action, new_state)] = reward 36 | self.transits[(self.state, action)][new_state] += 1 37 | self.state = self.env.reset() if is_done else new_state 38 | 39 | def calc_action_value(self, state, action): 40 | target_counts = self.transits[(state, action)] 41 | total = sum(target_counts.values()) 42 | action_value = 0.0 43 | for tgt_state, count in target_counts.items(): 44 | reward = self.rewards[(state, action, tgt_state)] 45 | action_value += (count / total) * (reward + GAMMA * self.values[tgt_state]) 46 | return action_value 47 | 48 | def select_action(self, state): 49 | best_action, best_value = None, None 50 | # for action in range(self.env.action_space.n): 51 | for action in range(self.env.get_action_size()): 52 | action_value = self.calc_action_value(state, action) 53 | if best_value is None or best_value < action_value: 54 | best_value = action_value 55 | best_action = action 56 | return best_action 57 | 58 | def play_episode(self, env): 59 | total_reward = 0.0 60 | state = env.reset() 61 | while True: 62 | action = self.select_action(state) 63 | new_state, reward, is_done, _ = env.step(action) 64 | self.rewards[(state, action, new_state)] = reward 65 | self.transits[(state, action)][new_state] += 1 66 | total_reward += reward 67 | if is_done: 68 | break 69 | state = new_state 70 | return total_reward 71 | 72 | def value_iteration(self): 73 | for state in range(self.env.get_observation_size()): 74 | state_values = [self.calc_action_value(state, action) for action in range(self.env.get_action_size())] 75 | self.values[state] = max(state_values) 76 | 77 | 78 | if __name__ == "__main__": 79 | test_env = Environment(DRAW) 80 | agent = Agent() 81 | writer = None 82 | if WRITE: 83 | writer = SummaryWriter(comment='v_iteration/{}'.format(NAME)) 84 | 85 | iter_no = 0 86 | best_reward = 0.0 87 | while True: 88 | iter_no += 1 89 | print('#', iter_no) 90 | agent.play_n_random_steps(TRAINING_STEPS) 91 | agent.value_iteration() 92 | 93 | reward = 0.0 94 | for _ in range(TEST_EPISODES): 95 | reward += agent.play_episode(test_env) 96 | reward /= TEST_EPISODES 97 | if WRITE: 98 | writer.add_scalar("reward", reward, iter_no) 99 | if reward > best_reward: 100 | print("Best reward updated %.3f -> %.3f" % (best_reward, reward)) 101 | best_reward = 
reward 102 | if reward > 500: 103 | print("Solved in %d iterations!" % iter_no) 104 | break 105 | if WRITE: 106 | writer.close() 107 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.15.4 2 | atari-py==0.1.6 3 | gym==0.10.9 4 | ptan==0.3 5 | opencv-python==3.4.3.18 6 | scipy==1.1.0 7 | torch==0.4.1 8 | torchvision==0.2.1 9 | tensorboard-pytorch==0.7.1 10 | tensorflow==1.12.0 11 | tensorboard==1.12.0 12 | pybullet==2.3.6 13 | matplotlib==3.0.2 14 | pygame 15 | 16 | Some things are missing, sorry 17 | -------------------------------------------------------------------------------- /runTensorBoard: -------------------------------------------------------------------------------- 1 | tensorboard --logdir $1 --host localhost 2 | -------------------------------------------------------------------------------- /snake/base_ppo.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import random 3 | 4 | from environment.environment import SnakeEnvironment 5 | 6 | from stable_baselines.common.vec_env import SubprocVecEnv, DummyVecEnv 7 | from stable_baselines.common.policies import MlpPolicy 8 | from stable_baselines import PPO1 9 | 10 | 11 | if __name__ == "__main__": 12 | 13 | env = SnakeEnvironment(draw=True, fps=100, debug=False, animation=False) 14 | # env = SubprocVecEnv([lambda: env]) 15 | env = DummyVecEnv([lambda: env]) 16 | 17 | # model = PPO1(MlpPolicy, env, verbose=1) 18 | 19 | # model.learn(total_timesteps=500000) 20 | # model.save('models/snake-bastard') 21 | 22 | ############################################################################### 23 | 24 | # env = gym.make('snake-v0') 25 | # # env = DummyVecEnv([lambda: env]) 26 | 27 | # # model = PPO2(MlpPolicy, env, verbose=1) 28 | # # model.load('models/snake-basterd') 29 | 30 | obs = env.reset() 31 | is_done = False 32 | 33 | while not is_done: 34 | # action, _states = model.predict(obs) 35 | action = random.randint(0, 4) 36 | obs, rewards, terminal, info = env.step(action) 37 | is_done = terminal 38 | -------------------------------------------------------------------------------- /snake/env_new/cube.py: -------------------------------------------------------------------------------- 1 | import pygame 2 | 3 | 4 | class Cube(object): 5 | 6 | def __init__(self, pos, rows, w, dirnx=1, dirny=0, color=(255, 0, 0)): 7 | 8 | self.pos = pos 9 | self.dirnx = dirnx 10 | self.dirny = dirny 11 | 12 | self.rows = rows 13 | self.w = w 14 | 15 | self.color = color 16 | 17 | def move(self, dirnx, dirny): 18 | self.dirnx = dirnx 19 | self.dirny = dirny 20 | self.pos = (self.pos[0] + self.dirnx, self.pos[1] + self.dirny) 21 | 22 | def draw(self, surface, eyes=False): 23 | dis = self.w // self.rows 24 | i = self.pos[0] 25 | j = self.pos[1] 26 | 27 | pygame.draw.rect(surface, self.color, (i*dis+1,j*dis+1, dis-2, dis-2)) 28 | if eyes: 29 | centre = dis//2 30 | radius = 3 31 | circleMiddle = (i*dis+centre-radius,j*dis+8) 32 | circleMiddle2 = (i*dis + dis -radius*2, j*dis+8) 33 | pygame.draw.circle(surface, (0,0,0), circleMiddle, radius) 34 | pygame.draw.circle(surface, (0,0,0), circleMiddle2, radius) -------------------------------------------------------------------------------- /snake/env_new/environment.py: -------------------------------------------------------------------------------- 1 | from environment.cube import Cube 2 | from environment.snake import Snake 
3 | 4 | import gym 5 | import pygame 6 | 7 | import numpy as np 8 | import random 9 | import enum 10 | import time 11 | 12 | #import tkinker as tk 13 | #from tkinter import messagebox 14 | 15 | # snake obs 16 | # body = head 0.9, b[0] = 0.8, b[1] = 0.79 ... 17 | 18 | 19 | W = 500 20 | H = 500 21 | BUFFER_SIZE = 1 22 | 23 | 24 | class Actions(enum.Enum): 25 | Up = 0 26 | Right = 1 27 | Down = 2 28 | Left = 3 29 | 30 | 31 | class SnakeEnvironment(gym.Env): 32 | 33 | def __init__(self, draw=True, speed=10000, rows=20, animation=True): 34 | super(SnakeEnvironment, self).__init__() 35 | 36 | self.observation_space = gym.spaces.Discrete(n=rows * rows) 37 | self.action_space = gym.spaces.Discrete(n=len(Actions)) 38 | 39 | self.draw = draw 40 | self.speed = speed 41 | self.rows = rows 42 | self.animation = animation 43 | 44 | self.snake = Snake((255, 0, 0), (2, 2), self.rows, W) 45 | self.snack = Cube(self.random_snack(), self.rows, W, color=(0, 255, 0)) 46 | 47 | self.is_done = False 48 | self.reward = 0 49 | self.step_without_apple = 0 50 | 51 | self.surf = pygame.display.set_mode((W, H)) 52 | self.clock = pygame.time.Clock() 53 | 54 | if draw: 55 | pygame.init() 56 | self.font_game_over = pygame.font.SysFont("ani", 72) 57 | 58 | """ Must alwasy be calles in the beginning. """ 59 | def reset(self): 60 | self.countdown() 61 | 62 | self.snake.reset((2, 2)) 63 | self.snack = Cube(self.random_snack(), self.rows, W, color=(0, 255, 0)) 64 | self.is_done = False 65 | self.reward = 0 66 | self.step_without_apple = 0 67 | 68 | self.surf = pygame.display.set_mode((W, H)) 69 | self.clock = pygame.time.Clock() 70 | 71 | obs, reward, is_done, _ = self.step(1) 72 | 73 | return obs 74 | 75 | def step(self, action): 76 | pygame.time.delay(50) # lower is faster 77 | self.clock.tick(self.speed) # lower is slower 78 | 79 | if isinstance(action, np.ndarray): 80 | idx = -1 81 | highest_idx = 0 82 | highest_val = -1 83 | for i in action: 84 | idx += 1 85 | if i > highest_val: 86 | highest_idx = idx 87 | highest_val = i 88 | action = highest_idx 89 | 90 | current_reward = 0 91 | 92 | self.snake.move_ai(action) 93 | # self.snake.move_human() 94 | 95 | if self.snake.ate_itself(): 96 | current_reward = -1 97 | self.game_over() 98 | 99 | self.step_without_apple += 1 100 | if self.step_without_apple == 250: 101 | self.game_over() 102 | 103 | if self.snake.body[0].pos == self.snack.pos: 104 | self.snake.add_cube() 105 | self.snack = Cube(self.random_snack(), self.rows, W, color=(0, 255, 0)) 106 | self.reward += 1 107 | current_reward = 1 108 | self.step_without_apple = 0 109 | 110 | self.redraw_window() 111 | 112 | obs = self.get_observation_space() 113 | 114 | return obs, current_reward, self.is_done, None 115 | 116 | def get_observation_space(self): 117 | 118 | new_obs = [] 119 | 120 | # create 2d matrix 121 | for i in range(self.rows): 122 | new_obs.append([]) 123 | for j in range(self.rows): 124 | new_obs[i].append(-1) 125 | 126 | # add apple 127 | x_apple = self.snack.pos[0] 128 | y_apple = self.snack.pos[1] 129 | new_obs[y_apple][x_apple] = 1 130 | 131 | # add snake 132 | x_snake = self.snake.head.pos[0] 133 | y_snake = self.snake.head.pos[1] 134 | if x_snake == -1 or x_snake == self.rows: 135 | print('Wtf, this error occured!') 136 | self.game_over() 137 | return 138 | if y_snake == -1 or y_snake == self.rows: 139 | print('Wtf, this error occured!') 140 | self.game_over() 141 | return 142 | new_obs[y_snake][x_snake] = 0.8 143 | 144 | # tail 145 | for i, c in enumerate(self.snake.body): 146 | x_snake = c.pos[0] 147 | 
y_snake = c.pos[1] 148 | 149 | if x_snake == -1 or x_snake == self.rows: 150 | print('Wtf, this error occured!') 151 | self.game_over() 152 | return 153 | if y_snake == -1 or y_snake == self.rows: 154 | print('Wtf, this error occured!') 155 | self.game_over() 156 | return 157 | 158 | new_obs[y_snake][x_snake] = 0.5 159 | 160 | current_obs = [] 161 | for i in new_obs: 162 | for j in i: 163 | current_obs.append(j) 164 | 165 | # cnt = 0 166 | # for i in current_obs: 167 | # cnt += 1 168 | # print(' ', i, ' ', end='') 169 | # if cnt % self.rows == 0: 170 | # print('') 171 | # print('') 172 | 173 | return_obs = np.array(current_obs) 174 | 175 | return return_obs 176 | 177 | def draw_grid(self): 178 | size_btwn = W // self.rows 179 | 180 | x = 0 181 | y = 0 182 | 183 | for i in range(self.rows): 184 | x = x + size_btwn 185 | y = y + size_btwn 186 | 187 | pygame.draw.line(self.surf, (255, 255, 255), (x, 0), (x, W)) 188 | pygame.draw.line(self.surf, (255, 255, 255), (0, y), (W, y)) 189 | 190 | def redraw_window(self): 191 | if not self.draw: 192 | return 193 | 194 | self.surf.fill((0, 0, 0)) 195 | self.draw_grid() 196 | self.snake.draw(self.surf) 197 | self.snack.draw(self.surf) 198 | 199 | pygame.display.update() 200 | 201 | def random_snack(self): 202 | positions = self.snake.body 203 | 204 | while True: 205 | x = random.randrange(self.rows) 206 | y = random.randrange(self.rows) 207 | if len(list(filter(lambda z:z.pos == (x,y), positions))) > 0: 208 | continue 209 | else: 210 | break 211 | return (x,y) 212 | 213 | def countdown(self): 214 | if not self.draw or not self.animation: 215 | return 216 | for _ in range(3, 0, -1): 217 | self.write_text("Start in {}".format(_)) 218 | time.sleep(0.3) 219 | 220 | def game_over(self): 221 | self.is_done = True 222 | if not self.draw or not self.animation: 223 | return 224 | self.write_text("Score {}".format(self.reward)) 225 | time.sleep(1.5) 226 | 227 | def write_text(self, text): 228 | self.redraw_window() 229 | text_start = pygame.font.SysFont('dyuthi', 80). 
\ 230 | render(text, True, (255, 255, 255)) 231 | self.surf.blit(text_start, 232 | (text_start.get_width() // 233 | 2, text_start.get_height() // 2)) 234 | pygame.display.flip() 235 | 236 | def play_human(self): 237 | self.countdown() 238 | 239 | while(not self.is_done): 240 | pygame.time.delay(50) # lower is faster 241 | self.clock.tick(self.speed) # lower is slower 242 | 243 | self.snake.move_human() 244 | 245 | if self.snake.ate_itself(): 246 | self.game_over() 247 | 248 | if self.snake.body[0].pos == self.snack.pos: 249 | self.snake.add_cube() 250 | self.snack = Cube(self.random_snack(), self.rows, W, color=(0, 255, 0)) 251 | self.reward += 1 252 | 253 | self.redraw_window() 254 | self.get_observation_space() 255 | 256 | 257 | if __name__ == "__main__": 258 | env = SnakeEnvironment(draw=True, speed=100, rows=5) 259 | env.play_human() 260 | 261 | 262 | 263 | 264 | ####### 265 | # if self.last_observation == None: 266 | # self.last_observation = current_obs 267 | 268 | # return_obs = [] 269 | 270 | # for i in self.last_observation: 271 | # return_obs.append(i) 272 | # for i in current_obs: 273 | # return_obs.append(i) 274 | 275 | # return_obs = np.array(return_obs) 276 | 277 | # cnt = 0 278 | # for i in return_obs: 279 | # cnt += 1 280 | # print(' ', i, ' ', end='') 281 | # if cnt % 10 == 0: 282 | # print('') 283 | # if cnt % 100 == 0: 284 | # print('') 285 | # print('') 286 | # print('') 287 | 288 | # self.last_observation = current_obs 289 | ####### 290 | -------------------------------------------------------------------------------- /snake/env_new/self_play.py: -------------------------------------------------------------------------------- 1 | # from environment.environment import SnakeEnvironment 2 | 3 | env = SnakeEnvironment(draw=True, speed=100000, rows=5) 4 | 5 | env.reset() 6 | terminal = False 7 | 8 | while not terminal: 9 | action = random.randint(0, 4) 10 | next_state, reward, is_done, _ = env.step(action) 11 | terminal = is_done 12 | -------------------------------------------------------------------------------- /snake/env_new/snake.py: -------------------------------------------------------------------------------- 1 | from environment.cube import Cube 2 | 3 | import pygame 4 | 5 | 6 | class Snake(object): 7 | 8 | body = [] 9 | turns = {} 10 | 11 | def __init__(self, color, pos, rows, w): 12 | self.head = Cube(pos, rows, w) 13 | self.body.append(self.head) 14 | 15 | self.rows = rows 16 | self.w = w 17 | 18 | self.color = color 19 | 20 | self.dirnx = 0 21 | self.dirny = 0 22 | 23 | self.add_cube() 24 | self.add_cube() 25 | 26 | def move_ai(self, action): 27 | x = self.head.pos[0] 28 | y = self.head.pos[1] 29 | 30 | if y == 0 and action == 0: 31 | action = -1 32 | elif x == self.rows -1 and action == 1: 33 | action = -1 34 | elif y == self.rows -1 and action == 2: 35 | action = -1 36 | elif x == 0 and action == 3: 37 | action = -1 38 | 39 | if action == -1: 40 | pass 41 | elif action == 0: 42 | self.dirnx = 0 43 | self.dirny = -1 44 | self.turns[self.head.pos[:]] = [self.dirnx, self.dirny] 45 | elif action == 1: 46 | self.dirnx = 1 47 | self.dirny = 0 48 | self.turns[self.head.pos[:]] = [self.dirnx, self.dirny] 49 | elif action == 2: 50 | self.dirnx = 0 51 | self.dirny = 1 52 | self.turns[self.head.pos[:]] = [self.dirnx, self.dirny] 53 | elif action == 3: 54 | self.dirnx = -1 55 | self.dirny = 0 56 | self.turns[self.head.pos[:]] = [self.dirnx, self.dirny] 57 | 58 | for i, c in enumerate(self.body): 59 | p = c.pos[:] 60 | if p in self.turns: 61 | turn = self.turns[p] 62 | 
c.move(turn[0],turn[1]) 63 | if i == len(self.body)-1: 64 | self.turns.pop(p) 65 | else: 66 | if c.dirnx == -1 and c.pos[0] <= 0: c.pos = (c.rows-1, c.pos[1]) 67 | elif c.dirnx == 1 and c.pos[0] >= c.rows-1: c.pos = (0,c.pos[1]) 68 | elif c.dirny == 1 and c.pos[1] >= c.rows-1: c.pos = (c.pos[0], 0) 69 | elif c.dirny == -1 and c.pos[1] <= 0: c.pos = (c.pos[0],c.rows-1) 70 | else: c.move(c.dirnx,c.dirny) 71 | 72 | def move_human(self): 73 | for event in pygame.event.get(): 74 | if event.type == pygame.QUIT: 75 | pygame.quit() 76 | 77 | keys = pygame.key.get_pressed() 78 | for key in keys: 79 | if keys[pygame.K_UP]: 80 | self.dirnx = 0 81 | self.dirny = -1 82 | self.turns[self.head.pos[:]] = [self.dirnx, self.dirny] 83 | elif keys[pygame.K_RIGHT]: 84 | self.dirnx = 1 85 | self.dirny = 0 86 | self.turns[self.head.pos[:]] = [self.dirnx, self.dirny] 87 | elif keys[pygame.K_DOWN]: 88 | self.dirnx = 0 89 | self.dirny = 1 90 | self.turns[self.head.pos[:]] = [self.dirnx, self.dirny] 91 | elif keys[pygame.K_LEFT]: 92 | self.dirnx = -1 93 | self.dirny = 0 94 | self.turns[self.head.pos[:]] = [self.dirnx, self.dirny] 95 | 96 | for i, c in enumerate(self.body): 97 | p = c.pos[:] 98 | if p in self.turns: 99 | turn = self.turns[p] 100 | c.move(turn[0],turn[1]) 101 | if i == len(self.body)-1: 102 | self.turns.pop(p) 103 | else: 104 | if c.dirnx == -1 and c.pos[0] <= 0: c.pos = (c.rows-1, c.pos[1]) 105 | elif c.dirnx == 1 and c.pos[0] >= c.rows-1: c.pos = (0,c.pos[1]) 106 | elif c.dirny == 1 and c.pos[1] >= c.rows-1: c.pos = (c.pos[0], 0) 107 | elif c.dirny == -1 and c.pos[1] <= 0: c.pos = (c.pos[0],c.rows-1) 108 | else: c.move(c.dirnx,c.dirny) 109 | 110 | def ate_itself(self): 111 | head = True 112 | for i, c in enumerate(self.body): 113 | if self.head.pos == c.pos and not head: 114 | return True 115 | head = False 116 | 117 | def reset(self, pos): 118 | self.head = Cube(pos, self.rows, self.w) 119 | self.body = [] 120 | self.body.append(self.head) 121 | self.turns = {} 122 | self.dirnx = 0 123 | self.dirny = 1 124 | self.add_cube() 125 | self.add_cube() 126 | 127 | def add_cube(self): 128 | tail = self.body[-1] 129 | dx, dy = tail.dirnx, tail.dirny 130 | 131 | if dx == 1 and dy == 0: 132 | self.body.append(Cube((tail.pos[0] -1, tail.pos[1]), self.rows, self.w)) 133 | elif dx == -1 and dy == 0: 134 | self.body.append(Cube((tail.pos[0] +1, tail.pos[1]), self.rows, self.w)) 135 | elif dx == 0 and dy == 1: 136 | self.body.append(Cube((tail.pos[0], tail.pos[1] -1), self.rows, self.w)) 137 | elif dx == 0 and dy == -1: 138 | self.body.append(Cube((tail.pos[0], tail.pos[1] +1), self.rows, self.w)) 139 | 140 | self.body[-1].dirnx = dx 141 | self.body[-1].dirny = dy 142 | 143 | def draw(self, surface): 144 | for i, c in enumerate(self.body): 145 | if i == 0: 146 | c.draw(surface, True) 147 | else: 148 | c.draw(surface) 149 | -------------------------------------------------------------------------------- /snake/environment/apple.py: -------------------------------------------------------------------------------- 1 | 2 | # https://www.youtube.com/watch?v=AaGK-fj-BAM&t=630s 3 | import pygame 4 | import random 5 | 6 | 7 | class Apple: 8 | 9 | def __init__(self, screen, s_width, s_height, color, scale): 10 | 11 | self.screen = screen 12 | self.s_width = s_width 13 | self.s_height = s_height 14 | self.color = color 15 | self.scale = scale 16 | 17 | self.place_apple(None) 18 | 19 | def draw(self): 20 | rect = pygame.rect.Rect(self.x, self.y, self.scale, self.scale) 21 | pygame.draw.rect(self.screen, self.color, rect) 22 | 
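# eat(): if the snake head is on the apple, respawn the apple on a cell not occupied by the tail and return True; otherwise return False.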
23 | def eat(self, snake_x, snake_y, tail): 24 | if self.x == snake_x and self.y == snake_y: 25 | self.place_apple(tail) 26 | return True 27 | return False 28 | 29 | def place_apple(self, tail): 30 | 31 | cols = (self.s_width - self.scale) / self.scale 32 | rows = (self.s_height - self.scale) / self.scale 33 | 34 | rand_x = 0 35 | rand_y = 0 36 | 37 | bad_position = True 38 | 39 | if tail is None: 40 | bad_position = False 41 | rand_x = random.randint(0, cols) 42 | rand_y = random.randint(0, rows) 43 | 44 | while bad_position: 45 | bad_position = False 46 | 47 | rand_x = random.randint(0, cols) 48 | rand_y = random.randint(0, rows) 49 | 50 | for i in tail: 51 | if rand_x == int(i.x / self.scale) and rand_y == int(i.y / self.scale): 52 | bad_position = True 53 | break 54 | 55 | self.x = rand_x * self.scale 56 | self.y = rand_y * self.scale 57 | -------------------------------------------------------------------------------- /snake/environment/environment.py: -------------------------------------------------------------------------------- 1 | # Game was made with the help of https://www.youtube.com/watch?v=cXgA1d_E-jY 2 | import gym 3 | import gym.spaces 4 | import time 5 | import pygame 6 | import random 7 | import enum 8 | 9 | import numpy as np 10 | 11 | from environment.snake import Snake 12 | from environment.apple import Apple 13 | 14 | # AI PARAMETERS ############################################################### 15 | BUFFER_SIZE = 1 16 | OBSERVATION_SIZE = 10 * 10 17 | ACTIONS = [0, 1, 2, 3] 18 | ACTION_SIZE = 4 19 | 20 | # GAME PARAMETERS ############################################################# 21 | SCALE = 60 22 | SCREEN_SIZE = WIDTH, HEIGHT = (600, 600) # for 5*5 go 300*300, 60 23 | # for 10*10 go 600*600, 60 24 | BACKGROUND = (72, 72, 72) 25 | SNAKE_COLOR = (57, 255, 20) 26 | APPPLE_COLOR = (255, 8, 0) 27 | FONT = 'dyuthi' 28 | 29 | """ Rewards 30 | 1. first apple +1 31 | 2. every next apple n+1 32 | 3. hit wall -1 33 | 4. ate self -2 34 | 5. does nothing 0.1 35 | """ 36 | """ Observations 37 | 1. apple +1 38 | 3. snake head = 0.5 39 | 4. every snake body -0.01 40 | 5. 
emtpy cell = -1 41 | """ 42 | """ 43 | Interace: 44 | reset(): resets the whole environment 45 | step(action): performs one action onto the environment 46 | step_buffer(action): performs one action onto the environment, 47 | returns 4 states for experience replay 48 | get_action_random(): obtain an imporoved random action 49 | get_observation_size(): obtain size of observation 50 | get_action_size(): obtain size of action 51 | """ 52 | 53 | 54 | class Actions(enum.Enum): 55 | Up = 0 56 | Right = 1 57 | Down = 2 58 | Left = 3 59 | 60 | 61 | class SnakeEnvironment(gym.Env): 62 | 63 | def __init__(self, draw=True, fps=100, debug=False, animation=False): 64 | 65 | super(SnakeEnvironment, self).__init__() 66 | self.observation_space = gym.spaces.Discrete(n=OBSERVATION_SIZE*BUFFER_SIZE) 67 | self.action_space = gym.spaces.Discrete(n=len(Actions)) 68 | 69 | if draw: 70 | pygame.init() 71 | pygame.display.set_caption('NN Snake') 72 | self.font_game_over = pygame.font.SysFont("ani", 72) 73 | 74 | self.draw = draw 75 | self.fps = fps 76 | self.debug = debug 77 | self.animation = animation 78 | self.screen = pygame.display.set_mode(SCREEN_SIZE) 79 | 80 | self.reward = 0 81 | self.score = 0 82 | self.is_done = False 83 | self.steps_without_apple = 0 84 | 85 | self.current_observation = None 86 | self.last_observation = None 87 | 88 | # ML INTERFACE ############################################################ 89 | def reset(self): 90 | """ Resets the whole environment. Must be called in the beginning. """ 91 | 92 | self.snake = Snake(self.screen, WIDTH, HEIGHT, SNAKE_COLOR, 93 | BACKGROUND, SCALE) 94 | self.apple = Apple(self.screen, WIDTH, HEIGHT, APPPLE_COLOR, SCALE) 95 | 96 | self.reward = 0 97 | self.score = 0 98 | self.is_done = False 99 | self.steps_without_apple = 0 100 | 101 | self.current_observation = None 102 | self.last_observation = None 103 | 104 | obs, reward, is_done, _ = self.step(1) 105 | 106 | if self.draw: 107 | self.countdown() 108 | 109 | return obs 110 | 111 | # The actual game step #################################################### 112 | def step(self, action): 113 | 114 | print(action) 115 | 116 | # if isinstance(action, np.ndarray): 117 | # idx = -1 118 | # highest_idx = 0 119 | # highest_val = -1 120 | # for i in action: 121 | # idx += 1 122 | # if i > highest_val: 123 | # highest_idx = idx 124 | # highest_val = i 125 | # action = highest_idx 126 | 127 | current_reward = 0 128 | 129 | self.snake.handle_events_ai(action) 130 | 131 | if self.apple.eat(self.snake.x, self.snake.y, self.snake.tail): 132 | self.snake.update(True) 133 | self.steps_without_apple = 0 134 | self.score += 1 135 | current_reward = 1 136 | # if self.score == 10: 137 | # current_reward = 1 138 | # else: 139 | # current_reward = self.score / 10 140 | else: 141 | self.snake.update(False) 142 | current_reward = 0.1 143 | self.steps_without_apple += 1 144 | # if self.steps_without_apple > 20: 145 | # current_reward = 0 146 | if self.steps_without_apple > 500: 147 | current_reward = -1 148 | self.game_over() 149 | 150 | if self.snake.check_if_hit_wall(): 151 | current_reward = -1 152 | self.game_over() 153 | 154 | if self.snake.check_if_ate_self(): 155 | current_reward = -1 156 | self.game_over() 157 | 158 | if self.draw: 159 | self.screen.fill(BACKGROUND) 160 | self.snake.draw() 161 | self.apple.draw() 162 | pygame.display.update() 163 | 164 | obs = self.get_observation_space() 165 | time.sleep(self.fps / 1000.0) 166 | 167 | return obs, current_reward, self.is_done, None 168 | 169 | def 
get_observation_space(self): 170 | 171 | new_obs = [] 172 | 173 | # create 2d matrix 174 | for i in range(int(WIDTH / SCALE)): 175 | new_obs.append([]) 176 | for j in range(int(WIDTH / SCALE)): 177 | new_obs[i].append(-1) 178 | 179 | # add apple 180 | x_apple = int(self.apple.x / SCALE) 181 | y_apple = int(self.apple.y / SCALE) 182 | new_obs[y_apple][x_apple] = 1 183 | 184 | # add snake 185 | x_snake = int(self.snake.x / SCALE) 186 | y_snake = int(self.snake.y / SCALE) 187 | new_obs[y_snake][x_snake] = 0.8 188 | 189 | # tail 190 | for i in self.snake.tail: 191 | x_snake = int(i.x / SCALE) 192 | y_snake = int(i.y / SCALE) 193 | new_obs[y_snake][x_snake] = 0.5 194 | 195 | current_obs = [] 196 | for i in new_obs: 197 | for j in i: 198 | current_obs.append(j) 199 | 200 | if self.draw and self.debug: 201 | for i in new_obs: 202 | print(i, '\n') 203 | print('\n') 204 | 205 | return_obs = np.array(current_obs) 206 | 207 | ####### 208 | # if self.last_observation == None: 209 | # self.last_observation = current_obs 210 | 211 | # return_obs = [] 212 | 213 | # for i in self.last_observation: 214 | # return_obs.append(i) 215 | # for i in current_obs: 216 | # return_obs.append(i) 217 | 218 | # return_obs = np.array(return_obs) 219 | 220 | # cnt = 0 221 | # for i in return_obs: 222 | # cnt += 1 223 | # print(' ', i, ' ', end='') 224 | # if cnt % 10 == 0: 225 | # print('') 226 | # if cnt % 100 == 0: 227 | # print('') 228 | # print('') 229 | # print('') 230 | 231 | # self.last_observation = current_obs 232 | ####### 233 | 234 | return return_obs 235 | 236 | def get_action_random(self): 237 | return random.randint(0, 3) 238 | 239 | # HUMAN STUFF ############################################################ 240 | 241 | def reset_human_game(self): 242 | """ Resets the whole environment. Must be called in the beginning. """ 243 | 244 | self.clock = pygame.time.Clock() 245 | self.time_elapsed_since_last_action = 0 246 | self.global_time = 0 247 | 248 | self.screen = pygame.display.set_mode(SCREEN_SIZE) 249 | self.snake = Snake(self.screen, WIDTH, HEIGHT, SNAKE_COLOR, 250 | BACKGROUND, SCALE) 251 | self.apple = Apple(self.screen, WIDTH, HEIGHT, APPPLE_COLOR, SCALE) 252 | 253 | self.reward = 0 254 | self.score = 0 255 | self.is_done = False 256 | self.steps_without_apple = 0 257 | 258 | self.current_observation = None 259 | self.last_observation = None 260 | 261 | if self.draw: 262 | self.countdown() 263 | 264 | def run_human_game(self): 265 | 266 | while not self.is_done: 267 | 268 | self.handle_events_human() 269 | self.snake.handle_events_human() 270 | 271 | if self.apple.eat(self.snake.x, self.snake.y, self.snake.tail): 272 | self.snake.update(True) 273 | else: 274 | self.snake.update(False) 275 | 276 | if self.snake.check_if_hit_wall(): 277 | self.game_over() 278 | 279 | if self.snake.check_if_ate_self(): 280 | self.game_over() 281 | 282 | if self.draw: 283 | self.screen.fill(BACKGROUND) 284 | self.snake.draw() 285 | self.apple.draw() 286 | pygame.display.update() 287 | 288 | time.sleep (self.fps / 1000.0); 289 | 290 | def handle_events_human(self): 291 | for event in pygame.event.get(): 292 | if event.type == pygame.QUIT: 293 | self.is_done = False 294 | pygame.quit() 295 | 296 | def countdown(self): 297 | if not self.animation: 298 | return 299 | for _ in range(3, 0, -1): 300 | self.screen.fill(BACKGROUND) 301 | self.snake.draw() 302 | self.apple.draw() 303 | text_start = pygame.font.SysFont(FONT, 80). 
\ 304 | render("Start in {}".format(_), True, (0, 0, 0)) 305 | self.screen.blit(text_start, 306 | (text_start.get_width() // 307 | 2, text_start.get_height() // 2)) 308 | pygame.display.flip() 309 | time.sleep(0.5) 310 | 311 | def game_over(self): 312 | self.is_done = True 313 | if not self.animation: 314 | return 315 | if self.draw: 316 | text = pygame.font.SysFont(FONT, 28).render( 317 | "Game Over!".format(self.reward), True, (0, 0, 0)) 318 | self.screen.blit(text, (320 - text.get_width() // 319 | 2, 240 - text.get_height() // 2)) 320 | pygame.display.flip() 321 | time.sleep(0.5) 322 | 323 | 324 | 325 | 326 | 327 | 328 | # if self.last_observation == None: 329 | # self.current_observation = current_obs 330 | 331 | # self.last_observation = self.current_observation 332 | # self.current_observation = current_obs 333 | 334 | # return_obs = [] 335 | 336 | # for i in self.last_observation: 337 | # return_obs.append(i) 338 | 339 | # for i in self.current_observation: 340 | # return_obs.append(i) 341 | 342 | # current_obs = np.array(current_obs) 343 | 344 | # for i in range(25): 345 | # if i%5==0: 346 | # print('') 347 | # print(' ' , self.last_observation[i] , ' ' , end='') 348 | 349 | # print('') 350 | # for i in range(25): 351 | # if i%5==0: 352 | # print('') 353 | # print(' ' ,self.current_observation[i], ' ' , end='') 354 | -------------------------------------------------------------------------------- /snake/environment/snake.py: -------------------------------------------------------------------------------- 1 | # https://www.youtube.com/watch?v=AaGK-fj-BAM&t=630s 2 | import pygame 3 | 4 | 5 | class Snake: 6 | 7 | def __init__(self, screen, s_width, s_height, color, body_color, scale): 8 | 9 | self.screen = screen 10 | self.s_width = s_width 11 | self.s_height = s_height 12 | self.color = color 13 | self.body_color = body_color 14 | self.scale = scale 15 | 16 | self.scale = scale 17 | 18 | self.x = 2 * scale 19 | self.y = 2 * scale 20 | 21 | self.x_speed = 1 22 | self.y_speed = 0 23 | 24 | self.tail = [Vector(self.x, self.y)] 25 | 26 | def handle_events_human(self): 27 | keys = pygame.key.get_pressed() 28 | if keys[pygame.K_UP]: 29 | self.move(0, -1) 30 | if keys[pygame.K_RIGHT]: 31 | self.move(1, 0) 32 | if keys[pygame.K_DOWN]: 33 | self.move(0, 1) 34 | if keys[pygame.K_LEFT]: 35 | self.move(-1, 0) 36 | 37 | def handle_events_ai(self, action): 38 | # print(action) 39 | if action == 0: 40 | self.move(0, -1) 41 | if action == 1: 42 | self.move(1, 0) 43 | if action == 2: 44 | self.move(0, 1) 45 | if action == 3: 46 | self.move(-1, 0) 47 | 48 | def draw(self): 49 | 50 | for i in self.tail: 51 | rect = pygame.rect.Rect( 52 | i.x + 1, i.y + 1, self.scale - 2, self.scale - 2) 53 | pygame.draw.rect(self.screen, self.color, rect) 54 | rect = pygame.rect.Rect( 55 | i.x + 16, i.y + 16, self.scale - 32, self.scale - 32) 56 | pygame.draw.rect(self.screen, self.body_color, rect) 57 | 58 | rect = pygame.rect.Rect( 59 | self.x, self.y, self.scale, self.scale) 60 | pygame.draw.rect(self.screen, self.color, rect) 61 | 62 | def update(self, ate_apple): 63 | 64 | length = len(self.tail) 65 | 66 | if ate_apple: 67 | self.tail.append(Vector(self.x, self.y)) 68 | else: 69 | for i in range(length - 1): 70 | self.tail[i] = self.tail[i + 1] 71 | self.tail[length - 1] = Vector(self.x, self.y) 72 | 73 | self.x = self.x + self.x_speed * self.scale 74 | self.y = self.y + self.y_speed * self.scale 75 | 76 | if self.x < 0: 77 | self.x = 0 78 | if self.x > self.s_width - self.scale: 79 | self.x = self.s_width - 
self.scale 80 | if self.y < 0: 81 | self.y = 0 82 | if self.y > self.s_height - self.scale: 83 | self.y = self.s_height - self.scale 84 | 85 | def move(self, x, y): 86 | self.x_speed = x 87 | self.y_speed = y 88 | 89 | def check_if_hit_wall(self): 90 | if self.x == -1: 91 | return True 92 | if self.x == self.s_width: 93 | return True 94 | if self.y == -1: 95 | return True 96 | if self.y == self.s_height: 97 | return True 98 | 99 | def check_if_ate_self(self): 100 | for i in self.tail: 101 | if (self.x == i.x) and (self.y == i.y): 102 | return True 103 | 104 | 105 | class Vector: 106 | 107 | def __init__(self, x, y): 108 | self.x = x 109 | self.y = y 110 | -------------------------------------------------------------------------------- /snake/lib/common.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | 7 | 8 | HYPERPARAMS = { 9 | 'snake': { 10 | 'stop_reward': 40.0, 11 | 'run_name': 'snake', 12 | 'replay_size': 1000000, 13 | 'replay_initial': 100000, 14 | 'target_net_sync': 1000, 15 | 'epsilon_frames': 10**6, 16 | 'epsilon_start': 1.0, 17 | 'epsilon_final': 0.02, 18 | 'learning_rate': 0.002, 19 | 'gamma': 0.9, 20 | 'batch_size': 32 21 | } 22 | } 23 | 24 | 25 | def unpack_batch(batch): 26 | states, actions, rewards, dones, last_states = [], [], [], [], [] 27 | for exp in batch: 28 | state = np.array(exp.state, copy=False) 29 | states.append(state) 30 | actions.append(exp.action) 31 | rewards.append(exp.reward) 32 | dones.append(exp.last_state is None) 33 | if exp.last_state is None: 34 | last_states.append(state) # the result will be masked anyway 35 | else: 36 | last_states.append(np.array(exp.last_state, copy=False)) 37 | return np.array(states, copy=False), np.array(actions), np.array(rewards, dtype=np.float32), \ 38 | np.array(dones, dtype=np.uint8), np.array(last_states, copy=False) 39 | 40 | 41 | def calc_loss_dqn(batch, net, tgt_net, gamma, device="cpu"): 42 | states, actions, rewards, dones, next_states = unpack_batch(batch) 43 | 44 | states_v = torch.tensor(states).to(device) 45 | next_states_v = torch.tensor(next_states).to(device) 46 | actions_v = torch.tensor(actions).to(device) 47 | rewards_v = torch.tensor(rewards).to(device) 48 | done_mask = torch.ByteTensor(dones).to(device) 49 | 50 | state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1) 51 | next_state_values = tgt_net(next_states_v).max(1)[0] 52 | next_state_values[done_mask] = 0.0 53 | 54 | expected_state_action_values = next_state_values.detach() * gamma + rewards_v 55 | return nn.MSELoss()(state_action_values, expected_state_action_values) 56 | 57 | 58 | class RewardTracker: 59 | def __init__(self, net, writer, stop_reward): 60 | self.writer = writer 61 | self.stop_reward = stop_reward 62 | self.net = net 63 | self.best_reward = -1 64 | 65 | def __enter__(self): 66 | self.ts = time.time() 67 | self.ts_frame = 0 68 | self.total_rewards = [] 69 | return self 70 | 71 | def __exit__(self, *args): 72 | if self.writer != None: 73 | self.writer.close() 74 | 75 | def reward(self, reward, frame, epsilon=None): 76 | self.total_rewards.append(reward) 77 | speed = (frame - self.ts_frame) / (time.time() - self.ts) 78 | self.ts_frame = frame 79 | self.ts = time.time() 80 | mean_reward = np.mean(self.total_rewards[-100:]) 81 | epsilon_str = "" if epsilon is None else ", eps %.2f" % epsilon 82 | print("%d: done %d games, mean reward %.3f, speed %.2f f/s%s" % ( 83 | frame, 
len(self.total_rewards), mean_reward, speed, epsilon_str 84 | )) 85 | sys.stdout.flush() 86 | if self.writer != None: 87 | if epsilon is not None: 88 | self.writer.add_scalar("epsilon", epsilon, frame) 89 | self.writer.add_scalar("speed", speed, frame) 90 | self.writer.add_scalar("reward_100", mean_reward, frame) 91 | self.writer.add_scalar("reward", reward, frame) 92 | if reward > self.best_reward: 93 | self.best_reward = reward 94 | torch.save(self.net.state_dict(), 'models/best-snake-model-' + str(int(reward))) 95 | print("\tNew best reward = ", str(reward)) 96 | if mean_reward > self.stop_reward: 97 | print("Solved in %d frames!" % frame) 98 | return True 99 | return False 100 | 101 | 102 | class EpsilonTracker: 103 | def __init__(self, epsilon_greedy_selector, params): 104 | self.epsilon_greedy_selector = epsilon_greedy_selector 105 | self.epsilon_start = params['epsilon_start'] 106 | self.epsilon_final = params['epsilon_final'] 107 | self.epsilon_frames = params['epsilon_frames'] 108 | self.frame(0) 109 | 110 | def frame(self, frame): 111 | self.epsilon_greedy_selector.epsilon = \ 112 | max(self.epsilon_final, self.epsilon_start - frame / self.epsilon_frames) 113 | 114 | 115 | def distr_projection(next_distr, rewards, dones, Vmin, Vmax, n_atoms, gamma): 116 | """ 117 | Perform distribution projection aka Catergorical Algorithm from the 118 | "A Distributional Perspective on RL" paper 119 | """ 120 | batch_size = len(rewards) 121 | proj_distr = np.zeros((batch_size, n_atoms), dtype=np.float32) 122 | delta_z = (Vmax - Vmin) / (n_atoms - 1) 123 | for atom in range(n_atoms): 124 | tz_j = np.minimum(Vmax, np.maximum(Vmin, rewards + (Vmin + atom * delta_z) * gamma)) 125 | b_j = (tz_j - Vmin) / delta_z 126 | l = np.floor(b_j).astype(np.int64) 127 | u = np.ceil(b_j).astype(np.int64) 128 | eq_mask = u == l 129 | proj_distr[eq_mask, l[eq_mask]] += next_distr[eq_mask, atom] 130 | ne_mask = u != l 131 | proj_distr[ne_mask, l[ne_mask]] += next_distr[ne_mask, atom] * (u - b_j)[ne_mask] 132 | proj_distr[ne_mask, u[ne_mask]] += next_distr[ne_mask, atom] * (b_j - l)[ne_mask] 133 | if dones.any(): 134 | proj_distr[dones] = 0.0 135 | tz_j = np.minimum(Vmax, np.maximum(Vmin, rewards[dones])) 136 | b_j = (tz_j - Vmin) / delta_z 137 | l = np.floor(b_j).astype(np.int64) 138 | u = np.ceil(b_j).astype(np.int64) 139 | eq_mask = u == l 140 | eq_dones = dones.copy() 141 | eq_dones[dones] = eq_mask 142 | if eq_dones.any(): 143 | proj_distr[eq_dones, l[eq_mask]] = 1.0 144 | ne_mask = u != l 145 | ne_dones = dones.copy() 146 | ne_dones[dones] = ne_mask 147 | if ne_dones.any(): 148 | proj_distr[ne_dones, l[ne_mask]] = (u - b_j)[ne_mask] 149 | proj_distr[ne_dones, u[ne_mask]] = (b_j - l)[ne_mask] 150 | return proj_distr -------------------------------------------------------------------------------- /snake/lib/dqn_model.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | import numpy as np 7 | 8 | 9 | class NoisyLinear(nn.Linear): 10 | def __init__(self, in_features, out_features, sigma_init=0.017, bias=True): 11 | super(NoisyLinear, self).__init__(in_features, out_features, bias=bias) 12 | self.sigma_weight = nn.Parameter(torch.full((out_features, in_features), sigma_init)) 13 | self.register_buffer("epsilon_weight", torch.zeros(out_features, in_features)) 14 | if bias: 15 | self.sigma_bias = nn.Parameter(torch.full((out_features,), sigma_init)) 16 | 
self.register_buffer("epsilon_bias", torch.zeros(out_features)) 17 | self.reset_parameters() 18 | 19 | def reset_parameters(self): 20 | std = math.sqrt(3 / self.in_features) 21 | self.weight.data.uniform_(-std, std) 22 | self.bias.data.uniform_(-std, std) 23 | 24 | def forward(self, input): 25 | self.epsilon_weight.normal_() 26 | bias = self.bias 27 | if bias is not None: 28 | self.epsilon_bias.normal_() 29 | bias = bias + self.sigma_bias * self.epsilon_bias.data 30 | return F.linear(input, self.weight + self.sigma_weight * self.epsilon_weight.data, bias) 31 | 32 | 33 | class NoisyFactorizedLinear(nn.Linear): 34 | """ 35 | NoisyNet layer with factorized gaussian noise 36 | 37 | N.B. nn.Linear already initializes weight and bias to 38 | """ 39 | def __init__(self, in_features, out_features, sigma_zero=0.4, bias=True): 40 | super(NoisyFactorizedLinear, self).__init__(in_features, out_features, bias=bias) 41 | sigma_init = sigma_zero / math.sqrt(in_features) 42 | self.sigma_weight = nn.Parameter(torch.full((out_features, in_features), sigma_init)) 43 | self.register_buffer("epsilon_input", torch.zeros(1, in_features)) 44 | self.register_buffer("epsilon_output", torch.zeros(out_features, 1)) 45 | if bias: 46 | self.sigma_bias = nn.Parameter(torch.full((out_features,), sigma_init)) 47 | 48 | def forward(self, input): 49 | self.epsilon_input.normal_() 50 | self.epsilon_output.normal_() 51 | 52 | func = lambda x: torch.sign(x) * torch.sqrt(torch.abs(x)) 53 | eps_in = func(self.epsilon_input.data) 54 | eps_out = func(self.epsilon_output.data) 55 | 56 | bias = self.bias 57 | if bias is not None: 58 | bias = bias + self.sigma_bias * eps_out.t() 59 | noise_v = torch.mul(eps_in, eps_out) 60 | return F.linear(input, self.weight + self.sigma_weight * noise_v, bias) 61 | 62 | 63 | class DQN(nn.Module): 64 | def __init__(self, input_shape, n_actions): 65 | super(DQN, self).__init__() 66 | 67 | self.fc = nn.Sequential( 68 | nn.Linear(input_shape, 512), 69 | nn.ReLU(), 70 | nn.Linear(512, n_actions) 71 | ) 72 | 73 | def forward(self, x): 74 | fx = x.float() / 256 75 | return self.fc(fx) -------------------------------------------------------------------------------- /snake/lib/dqn_rainbow.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from environment.environment import Environment 3 | import ptan 4 | import argparse 5 | import numpy as np 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | 11 | import torch.optim as optim 12 | 13 | from tensorboardX import SummaryWriter 14 | 15 | import dqn as dqn_model 16 | import common 17 | 18 | # n-step 19 | REWARD_STEPS = 2 20 | 21 | # priority replay 22 | PRIO_REPLAY_ALPHA = 0.6 23 | BETA_START = 0.4 24 | BETA_FRAMES = 100000 25 | 26 | # C51 27 | Vmax = 10 28 | Vmin = -10 29 | N_ATOMS = 51 30 | DELTA_Z = (Vmax - Vmin) / (N_ATOMS - 1) 31 | 32 | 33 | class RainbowDQN(nn.Module): 34 | def __init__(self, input_shape, n_actions): 35 | super(RainbowDQN, self).__init__() 36 | 37 | self.fc_val = nn.Sequential( 38 | dqn_model.NoisyLinear(input_shape[0], 256), 39 | nn.ReLU(), 40 | dqn_model.NoisyLinear(256, N_ATOMS) 41 | ) 42 | 43 | self.fc_adv = nn.Sequential( 44 | dqn_model.NoisyLinear(input_shape[0], 256), 45 | nn.ReLU(), 46 | dqn_model.NoisyLinear(256, n_actions * N_ATOMS) 47 | ) 48 | 49 | self.register_buffer("supports", torch.arange(Vmin, Vmax+DELTA_Z, DELTA_Z)) 50 | self.softmax = nn.Softmax(dim=1) 51 | 52 | def forward(self, x): 53 | batch_size = x.size()[0] 54 | fx 
= x.float() / 256 55 | val_out = self.fc_val(fx).view(batch_size, 1, N_ATOMS) 56 | adv_out = self.fc_adv(fx).view(batch_size, -1, N_ATOMS) 57 | adv_mean = adv_out.mean(dim=1, keepdim=True) 58 | return val_out + (adv_out - adv_mean) 59 | 60 | def both(self, x): 61 | cat_out = self(x) 62 | probs = self.apply_softmax(cat_out) 63 | weights = probs * self.supports 64 | res = weights.sum(dim=2) 65 | return cat_out, res 66 | 67 | def qvals(self, x): 68 | return self.both(x)[1] 69 | 70 | def apply_softmax(self, t): 71 | return self.softmax(t.view(-1, N_ATOMS)).view(t.size()) 72 | 73 | 74 | def calc_loss(batch, batch_weights, net, tgt_net, gamma, device="cpu"): 75 | states, actions, rewards, dones, next_states = common.unpack_batch(batch) 76 | batch_size = len(batch) 77 | 78 | states_v = torch.tensor(states).to(device) 79 | actions_v = torch.tensor(actions).to(device) 80 | next_states_v = torch.tensor(next_states).to(device) 81 | batch_weights_v = torch.tensor(batch_weights).to(device) 82 | 83 | # next state distribution 84 | # dueling arch -- actions from main net, distr from tgt_net 85 | 86 | # calc at once both next and cur states 87 | distr_v, qvals_v = net.both(torch.cat((states_v, next_states_v))) 88 | next_qvals_v = qvals_v[batch_size:] 89 | distr_v = distr_v[:batch_size] 90 | 91 | next_actions_v = next_qvals_v.max(1)[1] 92 | next_distr_v = tgt_net(next_states_v) 93 | next_best_distr_v = next_distr_v[range(batch_size), next_actions_v.data] 94 | next_best_distr_v = tgt_net.apply_softmax(next_best_distr_v) 95 | next_best_distr = next_best_distr_v.data.cpu().numpy() 96 | 97 | dones = dones.astype(np.bool) 98 | 99 | # project our distribution using Bellman update 100 | proj_distr = common.distr_projection(next_best_distr, rewards, dones, Vmin, Vmax, N_ATOMS, gamma) 101 | 102 | # calculate net output 103 | state_action_values = distr_v[range(batch_size), actions_v.data] 104 | state_log_sm_v = F.log_softmax(state_action_values, dim=1) 105 | proj_distr_v = torch.tensor(proj_distr).to(device) 106 | 107 | loss_v = -state_log_sm_v * proj_distr_v 108 | loss_v = batch_weights_v * loss_v.sum(dim=1) 109 | return loss_v.mean(), loss_v + 1e-5 110 | 111 | 112 | if __name__ == "__main__": 113 | params = common.HYPERPARAMS['pong'] 114 | params['epsilon_frames'] *= 2 115 | parser = argparse.ArgumentParser() 116 | parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda") 117 | args = parser.parse_args() 118 | device = torch.device("cuda" if args.cuda else "cpu") 119 | 120 | env = gym.make(params['env_name']) 121 | env = ptan.common.wrappers.wrap_dqn(env) 122 | 123 | writer = SummaryWriter(comment="-" + params['run_name'] + "-rainbow") 124 | net = RainbowDQN(env.observation_space.shape, env.action_space.n).to(device) 125 | tgt_net = ptan.agent.TargetNet(net) 126 | agent = ptan.agent.DQNAgent(lambda x: net.qvals(x), ptan.actions.ArgmaxActionSelector(), device=device) 127 | 128 | exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=REWARD_STEPS) 129 | buffer = ptan.experience.PrioritizedReplayBuffer(exp_source, params['replay_size'], PRIO_REPLAY_ALPHA) 130 | optimizer = optim.Adam(net.parameters(), lr=params['learning_rate']) 131 | 132 | frame_idx = 0 133 | beta = BETA_START 134 | 135 | with common.RewardTracker(writer, params['stop_reward']) as reward_tracker: 136 | while True: 137 | frame_idx += 1 138 | buffer.populate(1) 139 | beta = min(1.0, BETA_START + frame_idx * (1.0 - BETA_START) / BETA_FRAMES) 140 | 141 | new_rewards = 
exp_source.pop_total_rewards() 142 | if new_rewards: 143 | if reward_tracker.reward(new_rewards[0], frame_idx): 144 | break 145 | 146 | if len(buffer) < params['replay_initial']: 147 | continue 148 | 149 | optimizer.zero_grad() 150 | batch, batch_indices, batch_weights = buffer.sample(params['batch_size'], beta) 151 | loss_v, sample_prios_v = calc_loss(batch, batch_weights, net, tgt_net.target_model, 152 | params['gamma'] ** REWARD_STEPS, device=device) 153 | loss_v.backward() 154 | optimizer.step() 155 | buffer.update_priorities(batch_indices, sample_prios_v.data.cpu().numpy()) 156 | 157 | if frame_idx % params['target_net_sync'] == 0: 158 | tgt_net.sync() 159 | -------------------------------------------------------------------------------- /snake/lib/ppo_model.py: -------------------------------------------------------------------------------- 1 | import ptan 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | 6 | HID_SIZE = 1024 7 | 8 | 9 | class ModelActor(nn.Module): 10 | def __init__(self, obs_size, act_size): 11 | super(ModelActor, self).__init__() 12 | 13 | self.mu = nn.Sequential( 14 | nn.Linear(obs_size, HID_SIZE), 15 | nn.Tanh(), 16 | nn.Linear(HID_SIZE, HID_SIZE), 17 | nn.Tanh(), 18 | nn.Linear(HID_SIZE, act_size), 19 | nn.Tanh(), 20 | ) 21 | self.logstd = nn.Parameter(torch.zeros(act_size)) 22 | 23 | def forward(self, x): 24 | return self.mu(x) 25 | 26 | 27 | class ModelCritic(nn.Module): 28 | def __init__(self, obs_size): 29 | super(ModelCritic, self).__init__() 30 | 31 | self.value = nn.Sequential( 32 | nn.Linear(obs_size, HID_SIZE), 33 | nn.ReLU(), 34 | nn.Linear(HID_SIZE, HID_SIZE), 35 | nn.ReLU(), 36 | nn.Linear(HID_SIZE, 1), 37 | ) 38 | 39 | def forward(self, x): 40 | return self.value(x) 41 | 42 | 43 | class AgentA2C(ptan.agent.BaseAgent): 44 | def __init__(self, net, device="cpu"): 45 | self.net = net 46 | self.device = device 47 | 48 | def __call__(self, states, agent_states): 49 | states_v = ptan.agent.float32_preprocessor(states).to(self.device) 50 | 51 | mu_v = self.net(states_v) 52 | mu = mu_v.data.cpu().numpy() 53 | logstd = self.net.logstd.data.cpu().numpy() 54 | actions = mu + np.exp(logstd) * np.random.normal(size=logstd.shape) 55 | actions = np.clip(actions, -1, 1) 56 | return actions, agent_states 57 | -------------------------------------------------------------------------------- /snake/play_ppo.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import ptan 4 | 5 | from lib import ppo_model as model 6 | from environment.environment import SnakeEnvironment 7 | 8 | MODEL_NAME = "best_+1.000_153000.dat" 9 | 10 | env = SnakeEnvironment(draw=True, speed=15, rows=5, animation=True) 11 | 12 | net_act = model.ModelActor(env.observation_space.n, 13 | env.action_space.n).to("cpu") 14 | net_act.load_state_dict(torch.load("saves/ppo-test-snake/" + MODEL_NAME, map_location=lambda storage, loc: storage)) 15 | 16 | rewards = 0.0 17 | steps = 0 18 | for _ in range(5): 19 | obs = env.reset() 20 | while True: 21 | obs_v = ptan.agent.float32_preprocessor([obs]).to("cpu") 22 | mu_v = net_act(obs_v)[0] 23 | action = mu_v.squeeze(dim=0).data.cpu().numpy() 24 | action = np.clip(action, -1, 1) 25 | obs, reward, done, _ = env.step(action) 26 | rewards += reward 27 | steps += 1 28 | if done: 29 | break 30 | -------------------------------------------------------------------------------- /snake/ppo.py: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env python3 2 | import os 3 | import math 4 | import ptan 5 | import time 6 | import gym 7 | import argparse 8 | from tensorboardX import SummaryWriter 9 | 10 | from lib import ppo_model as model 11 | from environment.environment import SnakeEnvironment 12 | 13 | import numpy as np 14 | import torch 15 | import torch.optim as optim 16 | import torch.nn.functional as F 17 | 18 | ENV = SnakeEnvironment(draw=True, fps=100, debug=False, animation=False) 19 | TEST_ENV = SnakeEnvironment( draw=False, fps=100, debug=False, animation=False) 20 | 21 | GAMMA = 0.99 22 | GAE_LAMBDA = 0.95 23 | 24 | TRAJECTORY_SIZE = 2049 25 | LEARNING_RATE_ACTOR = 1e-3 26 | LEARNING_RATE_CRITIC = 1e-2 27 | 28 | PPO_EPS = 0.2 29 | PPO_EPOCHES = 10 30 | PPO_BATCH_SIZE = 64 31 | 32 | TEST_ITERS = 1000 33 | 34 | 35 | def test_net(net, env, count=10, device="cpu"): 36 | rewards = 0.0 37 | steps = 0 38 | for _ in range(count): 39 | obs = env.reset() 40 | while True: 41 | obs_v = ptan.agent.float32_preprocessor([obs]).to(device) 42 | mu_v = net(obs_v)[0] 43 | action = mu_v.squeeze(dim=0).data.cpu().numpy() 44 | action = np.clip(action, -1, 1) 45 | obs, reward, done, _ = env.step(action) 46 | rewards += reward 47 | steps += 1 48 | if done: 49 | break 50 | return rewards / count, steps / count 51 | 52 | 53 | def calc_logprob(mu_v, logstd_v, actions_v): 54 | p1 = - ((mu_v - actions_v) ** 2) / (2*torch.exp(logstd_v).clamp(min=1e-3)) 55 | p2 = - torch.log(torch.sqrt(2 * math.pi * torch.exp(logstd_v))) 56 | return p1 + p2 57 | 58 | 59 | def calc_adv_ref(trajectory, net_crt, states_v, device="cpu"): 60 | """ 61 | By trajectory calculate advantage and 1-step ref value 62 | :param trajectory: trajectory list 63 | :param net_crt: critic network 64 | :param states_v: states tensor 65 | :return: tuple with advantage numpy array and reference values 66 | """ 67 | values_v = net_crt(states_v) 68 | values = values_v.squeeze().data.cpu().numpy() 69 | # generalized advantage estimator: smoothed version of the advantage 70 | last_gae = 0.0 71 | result_adv = [] 72 | result_ref = [] 73 | for val, next_val, (exp,) in zip(reversed(values[:-1]), reversed(values[1:]), 74 | reversed(trajectory[:-1])): 75 | if exp.done: 76 | delta = exp.reward - val 77 | last_gae = delta 78 | else: 79 | delta = exp.reward + GAMMA * next_val - val 80 | last_gae = delta + GAMMA * GAE_LAMBDA * last_gae 81 | result_adv.append(last_gae) 82 | result_ref.append(last_gae + val) 83 | 84 | adv_v = torch.FloatTensor(list(reversed(result_adv))).to(device) 85 | ref_v = torch.FloatTensor(list(reversed(result_ref))).to(device) 86 | return adv_v, ref_v 87 | 88 | 89 | if __name__ == "__main__": 90 | # parser = argparse.ArgumentParser() 91 | # parser.add_argument("--cuda", default=False, action='store_true', help='Enable CUDA') 92 | # parser.add_argument("-n", "--name", required=True, help="Name of the run") 93 | # parser.add_argument("-e", "--env", default=ENV_ID, help="Environment id, default=" + ENV_ID) 94 | # args = parser.parse_args() 95 | 96 | name = "test-snake" 97 | 98 | # device = torch.device("cuda" if args.cuda else "cpu") 99 | device = torch.device("cpu") 100 | 101 | save_path = os.path.join("saves", "ppo-" + name) 102 | os.makedirs(save_path, exist_ok=True) 103 | 104 | env = ENV 105 | test_env = TEST_ENV 106 | 107 | net_act = model.ModelActor(env.observation_space.n, env.action_space.n).to(device) 108 | net_crt = model.ModelCritic(env.observation_space.n).to(device) 109 | print(net_act) 110 | print(net_crt) 111 | 112 | writer = SummaryWriter(comment="-ppo_" + 
name) 113 | agent = model.AgentA2C(net_act, device=device) 114 | exp_source = ptan.experience.ExperienceSource(env, agent, steps_count=1) 115 | 116 | opt_act = optim.Adam(net_act.parameters(), lr=LEARNING_RATE_ACTOR) 117 | opt_crt = optim.Adam(net_crt.parameters(), lr=LEARNING_RATE_CRITIC) 118 | 119 | trajectory = [] 120 | best_reward = None 121 | with ptan.common.utils.RewardTracker(writer) as tracker: 122 | for step_idx, exp in enumerate(exp_source): 123 | rewards_steps = exp_source.pop_rewards_steps() 124 | if rewards_steps: 125 | rewards, steps = zip(*rewards_steps) 126 | writer.add_scalar("episode_steps", np.mean(steps), step_idx) 127 | tracker.reward(np.mean(rewards), step_idx) 128 | 129 | if step_idx % TEST_ITERS == 0: 130 | ts = time.time() 131 | rewards, steps = test_net(net_act, test_env, device=device) 132 | print("Test done in %.2f sec, reward %.3f, steps %d" % ( 133 | time.time() - ts, rewards, steps)) 134 | writer.add_scalar("test_reward", rewards, step_idx) 135 | writer.add_scalar("test_steps", steps, step_idx) 136 | if best_reward is None or best_reward < rewards: 137 | if best_reward is not None: 138 | print("Best reward updated: %.3f -> %.3f" % (best_reward, rewards)) 139 | name = "best_%+.3f_%d.dat" % (rewards, step_idx) 140 | fname = os.path.join(save_path, name) 141 | torch.save(net_act.state_dict(), fname) 142 | best_reward = rewards 143 | 144 | trajectory.append(exp) 145 | if len(trajectory) < TRAJECTORY_SIZE: 146 | continue 147 | 148 | traj_states = [t[0].state for t in trajectory] 149 | traj_actions = [t[0].action for t in trajectory] 150 | traj_states_v = torch.FloatTensor(traj_states).to(device) 151 | traj_actions_v = torch.FloatTensor(traj_actions).to(device) 152 | traj_adv_v, traj_ref_v = calc_adv_ref(trajectory, net_crt, traj_states_v, device=device) 153 | mu_v = net_act(traj_states_v) 154 | old_logprob_v = calc_logprob(mu_v, net_act.logstd, traj_actions_v) 155 | 156 | # normalize advantages 157 | traj_adv_v = (traj_adv_v - torch.mean(traj_adv_v)) / torch.std(traj_adv_v) 158 | 159 | # drop last entry from the trajectory, an our adv and ref value calculated without it 160 | trajectory = trajectory[:-1] 161 | old_logprob_v = old_logprob_v[:-1].detach() 162 | 163 | sum_loss_value = 0.0 164 | sum_loss_policy = 0.0 165 | count_steps = 0 166 | 167 | for epoch in range(PPO_EPOCHES): 168 | for batch_ofs in range(0, len(trajectory), PPO_BATCH_SIZE): 169 | states_v = traj_states_v[batch_ofs:batch_ofs + PPO_BATCH_SIZE] 170 | actions_v = traj_actions_v[batch_ofs:batch_ofs + PPO_BATCH_SIZE] 171 | batch_adv_v = traj_adv_v[batch_ofs:batch_ofs + PPO_BATCH_SIZE].unsqueeze(-1) 172 | batch_ref_v = traj_ref_v[batch_ofs:batch_ofs + PPO_BATCH_SIZE] 173 | batch_old_logprob_v = old_logprob_v[batch_ofs:batch_ofs + PPO_BATCH_SIZE] 174 | 175 | # critic training 176 | opt_crt.zero_grad() 177 | value_v = net_crt(states_v) 178 | loss_value_v = F.mse_loss(value_v.squeeze(-1), batch_ref_v) 179 | loss_value_v.backward() 180 | opt_crt.step() 181 | 182 | # actor training 183 | opt_act.zero_grad() 184 | mu_v = net_act(states_v) 185 | logprob_pi_v = calc_logprob(mu_v, net_act.logstd, actions_v) 186 | ratio_v = torch.exp(logprob_pi_v - batch_old_logprob_v) 187 | surr_obj_v = batch_adv_v * ratio_v 188 | clipped_surr_v = batch_adv_v * torch.clamp(ratio_v, 1.0 - PPO_EPS, 1.0 + PPO_EPS) 189 | loss_policy_v = -torch.min(surr_obj_v, clipped_surr_v).mean() 190 | loss_policy_v.backward() 191 | opt_act.step() 192 | 193 | sum_loss_value += loss_value_v.item() 194 | sum_loss_policy += loss_policy_v.item() 
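                    # Running totals: loss_value_v is the critic's MSE against the
                    # GAE-based reference values from calc_adv_ref, and loss_policy_v
                    # is the negated clipped PPO surrogate (ratio of new to old action
                    # log-probabilities, clipped to [1 - PPO_EPS, 1 + PPO_EPS]). Both
                    # are averaged over all minibatches and epochs below before being
                    # written to TensorBoard.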
195 | count_steps += 1 196 | 197 | trajectory.clear() 198 | writer.add_scalar("advantage", traj_adv_v.mean().item(), step_idx) 199 | writer.add_scalar("values", traj_ref_v.mean().item(), step_idx) 200 | writer.add_scalar("loss_policy", sum_loss_policy / count_steps, step_idx) 201 | writer.add_scalar("loss_value", sum_loss_value / count_steps, step_idx) 202 | -------------------------------------------------------------------------------- /snake/self_play.py: -------------------------------------------------------------------------------- 1 | from environment.environment import SnakeEnvironment 2 | import random 3 | 4 | env = SnakeEnvironment(draw=True, speed=100, rows=5, animation=False) 5 | 6 | # env.play_human() 7 | 8 | while True: 9 | env.reset() 10 | terminal = False 11 | while not terminal: 12 | action = random.randint(0, 4) 13 | next_state, reward, is_done, _ = env.step(action) 14 | terminal = is_done 15 | --------------------------------------------------------------------------------
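
The __main__ block of snake/lib/dqn_rainbow.py still carries the Atari template it was adapted from: it looks up common.HYPERPARAMS['pong'] although common.py only defines a 'snake' entry, it calls gym.make() without importing gym or defining params['env_name'], it imports "dqn" as dqn_model although the module in lib/ is named dqn_model.py, and it constructs common.RewardTracker(writer, stop_reward) although that class takes (net, writer, stop_reward). A minimal sketch of the setup those lines appear to intend follows; the SnakeEnvironment import, its constructor arguments, and the use of observation_space.n / action_space.n are assumptions borrowed from snake/ppo.py and play_ppo.py, not something dqn_rainbow.py itself confirms.

# Sketch only: a possible replacement for the setup portion of the __main__ block in
# snake/lib/dqn_rainbow.py. RainbowDQN, REWARD_STEPS and PRIO_REPLAY_ALPHA are the
# class and constants already defined earlier in that same file.
import torch
import torch.optim as optim
import ptan
from tensorboardX import SummaryWriter

import common
import dqn_model                                       # the module in lib/ is dqn_model.py, not dqn.py
from environment.environment import SnakeEnvironment   # assumed import, mirroring snake/ppo.py

params = common.HYPERPARAMS['snake']                   # the only entry defined in common.py
device = torch.device("cpu")

# Constructor arguments assumed from snake/ppo.py; adjust to the actual environment signature.
env = SnakeEnvironment(draw=False, fps=100, debug=False, animation=False)

writer = SummaryWriter(comment="-" + params['run_name'] + "-rainbow")
# RainbowDQN indexes input_shape[0], so the observation size is passed as a 1-tuple.
net = RainbowDQN((env.observation_space.n,), env.action_space.n).to(device)
tgt_net = ptan.agent.TargetNet(net)
agent = ptan.agent.DQNAgent(lambda x: net.qvals(x), ptan.actions.ArgmaxActionSelector(), device=device)

exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'],
                                                       steps_count=REWARD_STEPS)
buffer = ptan.experience.PrioritizedReplayBuffer(exp_source, params['replay_size'], PRIO_REPLAY_ALPHA)
optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])

# RewardTracker in common.py takes the network first so it can checkpoint new best models.
with common.RewardTracker(net, writer, params['stop_reward']) as reward_tracker:
    pass  # the training loop from dqn_rainbow.py would continue here unchanged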
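
The categorical (C51) projection in snake/lib/common.py is the easiest piece to exercise in isolation. A small self-contained check — assuming only the distr_projection signature shown above and that it is run from the snake/ directory, matching that package's import style — projects uniform next-state distributions through one Bellman update and verifies that every projected row is still a probability distribution:

import numpy as np
from lib.common import distr_projection   # assumed to be run from the snake/ directory

Vmin, Vmax, N_ATOMS, GAMMA = -10, 10, 51, 0.9   # values used in dqn_rainbow.py / HYPERPARAMS

batch_size = 4
next_distr = np.full((batch_size, N_ATOMS), 1.0 / N_ATOMS, dtype=np.float32)  # uniform distributions
rewards = np.array([0.0, 1.0, -1.0, 5.0], dtype=np.float32)
dones = np.array([False, False, True, False])   # third transition is terminal

proj = distr_projection(next_distr, rewards, dones, Vmin, Vmax, N_ATOMS, GAMMA)

assert proj.shape == (batch_size, N_ATOMS)
# Each projected row must still sum to 1 (the projection only moves mass between atoms).
assert np.allclose(proj.sum(axis=1), 1.0, atol=1e-5)

# A terminal transition collapses all mass onto the atoms bracketing its clipped reward:
# -1.0 lies midway between the atoms -1.2 (index 22) and -0.8 (index 23), so each gets 0.5.
print(proj[2].argmax(), proj[2][proj[2] > 0])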
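
calc_logprob in snake/ppo.py supplies the Gaussian log-densities for both the old and the new policy in the PPO ratio. Note that its formula uses torch.exp(logstd_v) where the Gaussian density has the variance (both in the quadratic term and under the square root), so the parameter effectively acts as a log-variance there, whereas AgentA2C in lib/ppo_model.py samples with np.exp(logstd) as the standard deviation; at the initial logstd = 0 the two readings coincide. A self-contained check against torch.distributions.Normal under the log-variance reading (the function body is copied verbatim from ppo.py so the snippet stands alone):

import math
import torch

def calc_logprob(mu_v, logstd_v, actions_v):   # copied verbatim from snake/ppo.py
    p1 = - ((mu_v - actions_v) ** 2) / (2*torch.exp(logstd_v).clamp(min=1e-3))
    p2 = - torch.log(torch.sqrt(2 * math.pi * torch.exp(logstd_v)))
    return p1 + p2

mu = torch.tensor([[0.2, -0.4]])
logstd = torch.tensor([0.5, -0.3])             # read as log-variance by calc_logprob
actions = torch.tensor([[0.5, -0.1]])

ours = calc_logprob(mu, logstd, actions)
# torch.distributions.Normal takes a standard deviation, i.e. the square root of exp(logstd).
ref = torch.distributions.Normal(mu, torch.exp(0.5 * logstd)).log_prob(actions)
print(torch.allclose(ours, ref, atol=1e-5))    # True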