├── .gitignore ├── README.md ├── better_snake ├── models │ └── snake-basterd.zip ├── old_env │ ├── apple.py │ ├── environment.py │ └── snake.py ├── old_env_2 │ ├── cube.py │ ├── environment.py │ ├── self_play.py │ └── snake.py └── ppo.py ├── flappyb ├── dqn_rainbow.py ├── dqn_v2.py ├── dqn_v3.py ├── environment │ ├── assets │ │ ├── Pong-653x400.png │ │ ├── all_fonts_script.py │ │ ├── bg.png │ │ ├── bird.png │ │ ├── pipe.png │ │ ├── pipe_long.png │ │ └── sapcraft.jpg │ ├── bird.py │ ├── environment.py │ └── pipe.py ├── lib │ ├── common.py │ ├── dqn_model.py │ ├── dqn_rainbow.py │ └── ppo_model.py ├── models │ ├── cross_entropy │ │ └── batchsize=100-hiddensize=256-lr=0.01-gamma=.9-PART=240.pt │ ├── dqn │ │ ├── dqn-expdecay=0.99995-gamma=.9-batchsize=20-nn=512-lr=0.001-normalization-LOADED=HARDCORE-6300-lrMax=0.4-nextPipe-HELL-PART=1000.h5 │ │ ├── dqn-expdecay=0.99995-gamma=.9-batchsize=20-nn=512-lr=0.001-normalization-PART=6650.h5 │ │ └── dqn-expdecay=0.999995-gamma=.9-batchsize=20-nn=512-lr=0.001-normalization-HARDCORE-PART=6300.h5 │ ├── flappyb-test-the-rainbow254 │ ├── flappyb-test-the-rainbow350 │ └── flappyb-test-the-rainbow87 ├── play_dqn_rainbow.py ├── play_ppo.py ├── play_self.py ├── ppo.py └── saves │ └── ppo-test-flappyb │ ├── best_+10.400_555000.dat │ ├── best_+11.270_556000.dat │ ├── best_+131.310_576000.dat │ ├── best_+20.470_558000.dat │ ├── best_+4.650_165000.dat │ ├── best_+4.860_370000.dat │ ├── best_+44.070_560000.dat │ ├── best_+44.560_561000.dat │ ├── best_+5.290_475000.dat │ ├── best_+5.530_495000.dat │ ├── best_+5.740_516000.dat │ ├── best_+5.820_538000.dat │ ├── best_+56.790_570000.dat │ ├── best_+6.250_539000.dat │ ├── best_+6.820_542000.dat │ ├── best_+7.200_547000.dat │ └── best_+8.690_550000.dat ├── old_agents ├── cross_entropy.py ├── cross_entropy_advanced.py ├── dqn_snake_v2.py ├── q_iteration.py ├── q_learning.py └── value_iteration.py ├── requirements.txt ├── runTensorBoard └── snake ├── base_ppo.py ├── env_new ├── cube.py ├── environment.py ├── self_play.py └── snake.py ├── environment ├── apple.py ├── environment.py └── snake.py ├── lib ├── common.py ├── dqn_model.py ├── dqn_rainbow.py └── ppo_model.py ├── play_ppo.py ├── ppo.py └── self_play.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.gitignore.io/api/python,pycharm,sublimetext 2 | # Edit at https://www.gitignore.io/?templates=python,pycharm,sublimetext 3 | 4 | ### PyCharm ### 5 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 6 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 7 | 8 | # User-specific stuff 9 | .idea/**/workspace.xml 10 | .idea/**/tasks.xml 11 | .idea/**/usage.statistics.xml 12 | .idea/**/dictionaries 13 | .idea/**/shelf 14 | 15 | # Generated files 16 | .idea/**/contentModel.xml 17 | 18 | # Sensitive or high-churn files 19 | .idea/**/dataSources/ 20 | .idea/**/dataSources.ids 21 | .idea/**/dataSources.local.xml 22 | .idea/**/sqlDataSources.xml 23 | .idea/**/dynamic.xml 24 | .idea/**/uiDesigner.xml 25 | .idea/**/dbnavigator.xml 26 | 27 | # Gradle 28 | .idea/**/gradle.xml 29 | .idea/**/libraries 30 | 31 | # Gradle and Maven with auto-import 32 | # When using Gradle or Maven with auto-import, you should exclude module files, 33 | # since they will be recreated, and may cause churn. Uncomment if using 34 | # auto-import. 
35 | # .idea/modules.xml 36 | # .idea/*.iml 37 | # .idea/modules 38 | 39 | # CMake 40 | cmake-build-*/ 41 | 42 | # Mongo Explorer plugin 43 | .idea/**/mongoSettings.xml 44 | 45 | # File-based project format 46 | *.iws 47 | 48 | # IntelliJ 49 | out/ 50 | 51 | # mpeltonen/sbt-idea plugin 52 | .idea_modules/ 53 | 54 | # JIRA plugin 55 | atlassian-ide-plugin.xml 56 | 57 | # Cursive Clojure plugin 58 | .idea/replstate.xml 59 | 60 | # Crashlytics plugin (for Android Studio and IntelliJ) 61 | com_crashlytics_export_strings.xml 62 | crashlytics.properties 63 | crashlytics-build.properties 64 | fabric.properties 65 | 66 | # Editor-based Rest Client 67 | .idea/httpRequests 68 | 69 | # Android studio 3.1+ serialized cache file 70 | .idea/caches/build_file_checksums.ser 71 | 72 | # JetBrains templates 73 | **___jb_tmp___ 74 | 75 | ### PyCharm Patch ### 76 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 77 | 78 | # *.iml 79 | # modules.xml 80 | # .idea/misc.xml 81 | # *.ipr 82 | 83 | # Sonarlint plugin 84 | .idea/sonarlint 85 | 86 | ### Python ### 87 | # Byte-compiled / optimized / DLL files 88 | __pycache__/ 89 | *.py[cod] 90 | *$py.class 91 | 92 | # C extensions 93 | *.so 94 | 95 | # Distribution / packaging 96 | .Python 97 | build/ 98 | develop-eggs/ 99 | dist/ 100 | downloads/ 101 | eggs/ 102 | .eggs/ 103 | lib64/ 104 | parts/ 105 | sdist/ 106 | var/ 107 | wheels/ 108 | pip-wheel-metadata/ 109 | share/python-wheels/ 110 | *.egg-info/ 111 | .installed.cfg 112 | *.egg 113 | MANIFEST 114 | 115 | # PyInstaller 116 | # Usually these files are written by a python script from a template 117 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 118 | *.manifest 119 | *.spec 120 | 121 | # Installer logs 122 | pip-log.txt 123 | pip-delete-this-directory.txt 124 | 125 | # Unit test / coverage reports 126 | htmlcov/ 127 | .tox/ 128 | .nox/ 129 | .coverage 130 | .coverage.* 131 | .cache 132 | nosetests.xml 133 | coverage.xml 134 | *.cover 135 | .hypothesis/ 136 | .pytest_cache/ 137 | 138 | # Translations 139 | *.mo 140 | *.pot 141 | 142 | # Django stuff: 143 | *.log 144 | local_settings.py 145 | db.sqlite3 146 | 147 | # Flask stuff: 148 | instance/ 149 | .webassets-cache 150 | 151 | # Scrapy stuff: 152 | .scrapy 153 | 154 | # Sphinx documentation 155 | docs/_build/ 156 | 157 | # PyBuilder 158 | target/ 159 | 160 | # Jupyter Notebook 161 | .ipynb_checkpoints 162 | 163 | # IPython 164 | profile_default/ 165 | ipython_config.py 166 | 167 | # pyenv 168 | .python-version 169 | 170 | # pipenv 171 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 172 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 173 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 174 | # install all needed dependencies. 
175 | #Pipfile.lock 176 | 177 | # celery beat schedule file 178 | celerybeat-schedule 179 | 180 | # SageMath parsed files 181 | *.sage.py 182 | 183 | # Environments 184 | .env 185 | .venv 186 | env/ 187 | venv/ 188 | ENV/ 189 | env.bak/ 190 | venv.bak/ 191 | 192 | # Spyder project settings 193 | .spyderproject 194 | .spyproject 195 | 196 | # Rope project settings 197 | .ropeproject 198 | 199 | # mkdocs documentation 200 | /site 201 | 202 | # mypy 203 | .mypy_cache/ 204 | .dmypy.json 205 | dmypy.json 206 | 207 | # Pyre type checker 208 | .pyre/ 209 | 210 | ### SublimeText ### 211 | # Cache files for Sublime Text 212 | *.tmlanguage.cache 213 | *.tmPreferences.cache 214 | *.stTheme.cache 215 | 216 | # Workspace files are user-specific 217 | *.sublime-workspace 218 | 219 | # Project files should be checked into the repository, unless a significant 220 | # proportion of contributors will probably not be using Sublime Text 221 | # *.sublime-project 222 | 223 | # SFTP configuration file 224 | sftp-config.json 225 | 226 | # Package control specific files 227 | Package Control.last-run 228 | Package Control.ca-list 229 | Package Control.ca-bundle 230 | Package Control.system-ca-bundle 231 | Package Control.cache/ 232 | Package Control.ca-certs/ 233 | Package Control.merged-ca-bundle 234 | Package Control.user-ca-bundle 235 | oscrypto-ca-bundle.crt 236 | bh_unicode_properties.cache 237 | 238 | # Sublime-github package stores a github token in this file 239 | # https://packagecontrol.io/packages/sublime-github 240 | GitHub.sublime-settings 241 | 242 | # End of https://www.gitignore.io/api/python,pycharm,sublimetext 243 | 244 | # Custom 245 | .runs/** 246 | .idea/** 247 | runs/ 248 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reinforcement Learning on games 2 | 3 | The algorithms were implemented using the book "Deep Reinforcement Learning Hands-On" by Maxim Lapan. 4 | He provides a GitHub repo with multiple implementations, which can be found here: 5 | https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On 6 | 7 | ## Project structure 8 | 9 | The root folder consists of: 10 | 1. Different games; every game has an RL algorithm, models and graphs 11 | 2. requirements.txt (probably not up to date) 12 | 3. runTensorBoard [dir], runs TensorBoard on a chosen directory 13 | 4. old_agents, implementations of weaker RL algorithms 14 | 15 | When you want to try out a trained model, you have to set the LEARN flag in the agent file to False (see the example below). 16 | Different models are trained on different observations, so not every combination will work. 17 | The model's name indicates the settings used for its parameters. 18 | 19 | ## Current Algorithms 20 | 21 | * DQN, a simple DQN implementation with experience replay. This is currently the best algorithm in this repository. 22 | * PPO, the algorithm everyone currently uses. 23 | * Value iteration, a good starting point. 24 | * cross_entropy, another starting point. 25 | * others ...
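
For example, to watch one of the pretrained FlappyB DQN agents, flip the switches at the top of the agent file (the names below are taken from `flappyb/dqn_v2.py`; other agent files follow the same pattern):

```python
# flappyb/dqn_v2.py -- switches at the top of the file
LEARN = False   # False: load a trained model and play, True: train from scratch
LOAD_NAME = 'dqn-expdecay=0.99995-gamma=.9-batchsize=20-nn=512-lr=0.001-normalization-PART=6650'
```

Then start the agent from inside its game folder, e.g. `cd flappyb && python dqn_v2.py`. Training runs write their TensorBoard logs to `runs/` by default, which you can inspect with `tensorboard --logdir runs` or the `runTensorBoard` helper.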
26 | -------------------------------------------------------------------------------- /better_snake/models/snake-basterd.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/better_snake/models/snake-basterd.zip -------------------------------------------------------------------------------- /better_snake/old_env/apple.py: -------------------------------------------------------------------------------- 1 | 2 | # https://www.youtube.com/watch?v=AaGK-fj-BAM&t=630s 3 | import pygame 4 | import random 5 | 6 | 7 | class Apple: 8 | 9 | def __init__(self, screen, s_width, s_height, color, scale): 10 | 11 | self.screen = screen 12 | self.s_width = s_width 13 | self.s_height = s_height 14 | self.color = color 15 | self.scale = scale 16 | 17 | self.place_apple(None) 18 | 19 | def draw(self): 20 | rect = pygame.rect.Rect(self.x, self.y, self.scale, self.scale) 21 | pygame.draw.rect(self.screen, self.color, rect) 22 | 23 | def eat(self, snake_x, snake_y, tail): 24 | if self.x == snake_x and self.y == snake_y: 25 | self.place_apple(tail) 26 | return True 27 | return False 28 | 29 | def place_apple(self, tail): 30 | 31 | cols = (self.s_width - self.scale) / self.scale 32 | rows = (self.s_height - self.scale) / self.scale 33 | 34 | rand_x = 0 35 | rand_y = 0 36 | 37 | bad_position = True 38 | 39 | if tail is None: 40 | bad_position = False 41 | rand_x = random.randint(0, cols) 42 | rand_y = random.randint(0, rows) 43 | 44 | while bad_position: 45 | bad_position = False 46 | 47 | rand_x = random.randint(0, cols) 48 | rand_y = random.randint(0, rows) 49 | 50 | for i in tail: 51 | if rand_x == int(i.x / self.scale) and rand_y == int(i.y / self.scale): 52 | bad_position = True 53 | break 54 | 55 | self.x = rand_x * self.scale 56 | self.y = rand_y * self.scale 57 | -------------------------------------------------------------------------------- /better_snake/old_env/environment.py: -------------------------------------------------------------------------------- 1 | # Game was made with the help of https://www.youtube.com/watch?v=cXgA1d_E-jY 2 | import gym 3 | import gym.spaces 4 | import time 5 | import pygame 6 | import random 7 | import enum 8 | 9 | import numpy as np 10 | 11 | from snakeenv.snake import Snake 12 | from snakeenv.apple import Apple 13 | 14 | # AI PARAMETERS ############################################################### 15 | BUFFER_SIZE = 2 16 | OBSERVATION_SIZE = 10 * 10 17 | ACTIONS = [0, 1, 2, 3] 18 | ACTION_SIZE = 4 19 | 20 | # GAME PARAMETERS ############################################################# 21 | SCALE = 60 22 | SCREEN_SIZE = WIDTH, HEIGHT = (600, 600) # for 5*5 go 300*300, 60 23 | # for 10*10 go 600*600, 60 24 | BACKGROUND = (72, 72, 72) 25 | SNAKE_COLOR = (57, 255, 20) 26 | APPPLE_COLOR = (255, 8, 0) 27 | FONT = 'dyuthi' 28 | 29 | """ Rewards 30 | 1. first apple +1 31 | 2. every next apple n+1 32 | 3. hit wall -1 33 | 4. ate self -2 34 | 5. does nothing 0.1 35 | """ 36 | """ Observations 37 | 1. apple +1 38 | 3. snake head = 0.5 39 | 4. every snake body -0.01 40 | 5. 
emtpy cell = -1 41 | """ 42 | """ 43 | Interace: 44 | reset(): resets the whole environment 45 | step(action): performs one action onto the environment 46 | step_buffer(action): performs one action onto the environment, 47 | returns 4 states for experience replay 48 | get_action_random(): obtain an imporoved random action 49 | get_observation_size(): obtain size of observation 50 | get_action_size(): obtain size of action 51 | """ 52 | 53 | 54 | class Actions(enum.Enum): 55 | Up = 0 56 | Right = 1 57 | Down = 2 58 | Left = 3 59 | 60 | 61 | class SnakeEnvironment(gym.Env): 62 | 63 | def __init__(self, draw=True, fps=100, debug=False, animation=False): 64 | 65 | super(SnakeEnvironment, self).__init__() 66 | self.observation_space = gym.spaces.Discrete(n=OBSERVATION_SIZE*BUFFER_SIZE) 67 | self.action_space = gym.spaces.Discrete(n=len(Actions)) 68 | 69 | if draw: 70 | pygame.init() 71 | pygame.display.set_caption('NN Snake') 72 | self.font_game_over = pygame.font.SysFont("ani", 72) 73 | 74 | self.draw = draw 75 | self.fps = fps 76 | self.debug = debug 77 | self.animation = animation 78 | self.screen = pygame.display.set_mode(SCREEN_SIZE) 79 | 80 | self.reward = 0 81 | self.score = 0 82 | self.is_done = False 83 | self.steps_without_apple = 0 84 | 85 | self.current_observation = None 86 | self.last_observation = None 87 | 88 | # ML INTERFACE ############################################################ 89 | def reset(self): 90 | """ Resets the whole environment. Must be called in the beginning. """ 91 | 92 | self.snake = Snake(self.screen, WIDTH, HEIGHT, SNAKE_COLOR, 93 | BACKGROUND, SCALE) 94 | self.apple = Apple(self.screen, WIDTH, HEIGHT, APPPLE_COLOR, SCALE) 95 | 96 | self.reward = 0 97 | self.score = 0 98 | self.is_done = False 99 | self.steps_without_apple = 0 100 | 101 | self.current_observation = None 102 | self.last_observation = None 103 | 104 | obs, reward, is_done, _ = self.step(1) 105 | 106 | if self.draw: 107 | self.countdown() 108 | 109 | return obs 110 | 111 | # The actual game step #################################################### 112 | def step(self, action): 113 | 114 | if isinstance(action, np.ndarray): 115 | idx = -1 116 | highest_idx = 0 117 | highest_val = -1 118 | for i in action: 119 | idx += 1 120 | if i > highest_val: 121 | highest_idx = idx 122 | highest_val = i 123 | action = highest_idx 124 | 125 | current_reward = 0 126 | 127 | self.snake.handle_events_ai(action) 128 | 129 | if self.apple.eat(self.snake.x, self.snake.y, self.snake.tail): 130 | self.snake.update(True) 131 | self.steps_without_apple = 0 132 | self.score += 1 133 | current_reward = 1 134 | # if self.score == 10: 135 | # current_reward = 1 136 | # else: 137 | # current_reward = self.score / 10 138 | else: 139 | self.snake.update(False) 140 | current_reward = 0.1 141 | self.steps_without_apple += 1 142 | # if self.steps_without_apple > 20: 143 | # current_reward = 0 144 | if self.steps_without_apple > 500: 145 | current_reward = -1 146 | self.game_over() 147 | 148 | if self.snake.check_if_hit_wall(): 149 | current_reward = -1 150 | self.game_over() 151 | 152 | if self.snake.check_if_ate_self(): 153 | current_reward = -1 154 | self.game_over() 155 | 156 | if self.draw: 157 | self.screen.fill(BACKGROUND) 158 | self.snake.draw() 159 | self.apple.draw() 160 | pygame.display.update() 161 | 162 | obs = self.get_observation_space() 163 | time.sleep(self.fps / 1000.0) 164 | 165 | return obs, current_reward, self.is_done, None 166 | 167 | def get_observation_space(self): 168 | 169 | new_obs = [] 170 | 171 | # 
create 2d matrix 172 | for i in range(int(WIDTH / SCALE)): 173 | new_obs.append([]) 174 | for j in range(int(WIDTH / SCALE)): 175 | new_obs[i].append(-1) 176 | 177 | # add apple 178 | x_apple = int(self.apple.x / SCALE) 179 | y_apple = int(self.apple.y / SCALE) 180 | new_obs[y_apple][x_apple] = 1 181 | 182 | # add snake 183 | x_snake = int(self.snake.x / SCALE) 184 | y_snake = int(self.snake.y / SCALE) 185 | new_obs[y_snake][x_snake] = 0.8 186 | 187 | # tail 188 | for i in self.snake.tail: 189 | x_snake = int(i.x / SCALE) 190 | y_snake = int(i.y / SCALE) 191 | new_obs[y_snake][x_snake] = 0.5 192 | 193 | current_obs = [] 194 | for i in new_obs: 195 | for j in i: 196 | current_obs.append(j) 197 | 198 | if self.draw and self.debug: 199 | for i in new_obs: 200 | print(i, '\n') 201 | print('\n') 202 | 203 | return_obs = np.array(current_obs) 204 | 205 | ####### 206 | # if self.last_observation == None: 207 | # self.last_observation = current_obs 208 | 209 | # return_obs = [] 210 | 211 | # for i in self.last_observation: 212 | # return_obs.append(i) 213 | # for i in current_obs: 214 | # return_obs.append(i) 215 | 216 | # return_obs = np.array(return_obs) 217 | 218 | # cnt = 0 219 | # for i in return_obs: 220 | # cnt += 1 221 | # print(' ', i, ' ', end='') 222 | # if cnt % 10 == 0: 223 | # print('') 224 | # if cnt % 100 == 0: 225 | # print('') 226 | # print('') 227 | # print('') 228 | 229 | # self.last_observation = current_obs 230 | ####### 231 | 232 | return return_obs 233 | 234 | def get_action_random(self): 235 | return random.randint(0, 3) 236 | 237 | # HUMAN STUFF ############################################################ 238 | 239 | def reset_human_game(self): 240 | """ Resets the whole environment. Must be called in the beginning. """ 241 | 242 | self.clock = pygame.time.Clock() 243 | self.time_elapsed_since_last_action = 0 244 | self.global_time = 0 245 | 246 | self.screen = pygame.display.set_mode(SCREEN_SIZE) 247 | self.snake = Snake(self.screen, WIDTH, HEIGHT, SNAKE_COLOR, 248 | BACKGROUND, SCALE) 249 | self.apple = Apple(self.screen, WIDTH, HEIGHT, APPPLE_COLOR, SCALE) 250 | 251 | self.reward = 0 252 | self.score = 0 253 | self.is_done = False 254 | self.steps_without_apple = 0 255 | 256 | self.current_observation = None 257 | self.last_observation = None 258 | 259 | if self.draw: 260 | self.countdown() 261 | 262 | def run_human_game(self): 263 | 264 | while not self.is_done: 265 | 266 | self.handle_events_human() 267 | self.snake.handle_events_human() 268 | 269 | if self.apple.eat(self.snake.x, self.snake.y, self.snake.tail): 270 | self.snake.update(True) 271 | else: 272 | self.snake.update(False) 273 | 274 | if self.snake.check_if_hit_wall(): 275 | self.game_over() 276 | 277 | if self.snake.check_if_ate_self(): 278 | self.game_over() 279 | 280 | if self.draw: 281 | self.screen.fill(BACKGROUND) 282 | self.snake.draw() 283 | self.apple.draw() 284 | pygame.display.update() 285 | 286 | time.sleep (self.fps / 1000.0); 287 | 288 | def handle_events_human(self): 289 | for event in pygame.event.get(): 290 | if event.type == pygame.QUIT: 291 | self.is_done = False 292 | pygame.quit() 293 | 294 | def countdown(self): 295 | if not self.animation: 296 | return 297 | for _ in range(3, 0, -1): 298 | self.screen.fill(BACKGROUND) 299 | self.snake.draw() 300 | self.apple.draw() 301 | text_start = pygame.font.SysFont(FONT, 80). 
\ 302 | render("Start in {}".format(_), True, (0, 0, 0)) 303 | self.screen.blit(text_start, 304 | (text_start.get_width() // 305 | 2, text_start.get_height() // 2)) 306 | pygame.display.flip() 307 | time.sleep(0.5) 308 | 309 | def game_over(self): 310 | self.is_done = True 311 | if not self.animation: 312 | return 313 | if self.draw: 314 | text = pygame.font.SysFont(FONT, 28).render( 315 | "Game Over!".format(self.reward), True, (0, 0, 0)) 316 | self.screen.blit(text, (320 - text.get_width() // 317 | 2, 240 - text.get_height() // 2)) 318 | pygame.display.flip() 319 | time.sleep(0.5) 320 | 321 | 322 | 323 | 324 | 325 | 326 | # if self.last_observation == None: 327 | # self.current_observation = current_obs 328 | 329 | # self.last_observation = self.current_observation 330 | # self.current_observation = current_obs 331 | 332 | # return_obs = [] 333 | 334 | # for i in self.last_observation: 335 | # return_obs.append(i) 336 | 337 | # for i in self.current_observation: 338 | # return_obs.append(i) 339 | 340 | # current_obs = np.array(current_obs) 341 | 342 | # for i in range(25): 343 | # if i%5==0: 344 | # print('') 345 | # print(' ' , self.last_observation[i] , ' ' , end='') 346 | 347 | # print('') 348 | # for i in range(25): 349 | # if i%5==0: 350 | # print('') 351 | # print(' ' ,self.current_observation[i], ' ' , end='') 352 | -------------------------------------------------------------------------------- /better_snake/old_env/snake.py: -------------------------------------------------------------------------------- 1 | # https://www.youtube.com/watch?v=AaGK-fj-BAM&t=630s 2 | import pygame 3 | 4 | 5 | class Snake: 6 | 7 | def __init__(self, screen, s_width, s_height, color, body_color, scale): 8 | 9 | self.screen = screen 10 | self.s_width = s_width 11 | self.s_height = s_height 12 | self.color = color 13 | self.body_color = body_color 14 | self.scale = scale 15 | 16 | self.scale = scale 17 | 18 | self.x = 2 * scale 19 | self.y = 2 * scale 20 | 21 | self.x_speed = 1 22 | self.y_speed = 0 23 | 24 | self.tail = [Vector(self.x, self.y)] 25 | 26 | def handle_events_human(self): 27 | keys = pygame.key.get_pressed() 28 | if keys[pygame.K_UP]: 29 | self.move(0, -1) 30 | if keys[pygame.K_RIGHT]: 31 | self.move(1, 0) 32 | if keys[pygame.K_DOWN]: 33 | self.move(0, 1) 34 | if keys[pygame.K_LEFT]: 35 | self.move(-1, 0) 36 | 37 | def handle_events_ai(self, action): 38 | # print(action) 39 | if action == 0: 40 | self.move(0, -1) 41 | if action == 1: 42 | self.move(1, 0) 43 | if action == 2: 44 | self.move(0, 1) 45 | if action == 3: 46 | self.move(-1, 0) 47 | 48 | def draw(self): 49 | 50 | for i in self.tail: 51 | rect = pygame.rect.Rect( 52 | i.x + 1, i.y + 1, self.scale - 2, self.scale - 2) 53 | pygame.draw.rect(self.screen, self.color, rect) 54 | rect = pygame.rect.Rect( 55 | i.x + 16, i.y + 16, self.scale - 32, self.scale - 32) 56 | pygame.draw.rect(self.screen, self.body_color, rect) 57 | 58 | rect = pygame.rect.Rect( 59 | self.x, self.y, self.scale, self.scale) 60 | pygame.draw.rect(self.screen, self.color, rect) 61 | 62 | def update(self, ate_apple): 63 | 64 | length = len(self.tail) 65 | 66 | if ate_apple: 67 | self.tail.append(Vector(self.x, self.y)) 68 | else: 69 | for i in range(length - 1): 70 | self.tail[i] = self.tail[i + 1] 71 | self.tail[length - 1] = Vector(self.x, self.y) 72 | 73 | self.x = self.x + self.x_speed * self.scale 74 | self.y = self.y + self.y_speed * self.scale 75 | 76 | if self.x < 0: 77 | self.x = 0 78 | if self.x > self.s_width - self.scale: 79 | self.x = self.s_width - 
self.scale 80 | if self.y < 0: 81 | self.y = 0 82 | if self.y > self.s_height - self.scale: 83 | self.y = self.s_height - self.scale 84 | 85 | def move(self, x, y): 86 | self.x_speed = x 87 | self.y_speed = y 88 | 89 | def check_if_hit_wall(self): 90 | if self.x == -1: 91 | return True 92 | if self.x == self.s_width: 93 | return True 94 | if self.y == -1: 95 | return True 96 | if self.y == self.s_height: 97 | return True 98 | 99 | def check_if_ate_self(self): 100 | for i in self.tail: 101 | if (self.x == i.x) and (self.y == i.y): 102 | return True 103 | 104 | 105 | class Vector: 106 | 107 | def __init__(self, x, y): 108 | self.x = x 109 | self.y = y 110 | -------------------------------------------------------------------------------- /better_snake/old_env_2/cube.py: -------------------------------------------------------------------------------- 1 | import pygame 2 | 3 | 4 | class Cube(object): 5 | 6 | def __init__(self, pos, rows, w, dirnx=1, dirny=0, color=(255, 0, 0)): 7 | 8 | self.pos = pos 9 | self.dirnx = dirnx 10 | self.dirny = dirny 11 | 12 | self.rows = rows 13 | self.w = w 14 | 15 | self.color = color 16 | 17 | def move(self, dirnx, dirny): 18 | self.dirnx = dirnx 19 | self.dirny = dirny 20 | self.pos = (self.pos[0] + self.dirnx, self.pos[1] + self.dirny) 21 | 22 | def draw(self, surface, eyes=False): 23 | dis = self.w // self.rows 24 | i = self.pos[0] 25 | j = self.pos[1] 26 | 27 | pygame.draw.rect(surface, self.color, (i*dis+1,j*dis+1, dis-2, dis-2)) 28 | if eyes: 29 | centre = dis//2 30 | radius = 3 31 | circleMiddle = (i*dis+centre-radius,j*dis+8) 32 | circleMiddle2 = (i*dis + dis -radius*2, j*dis+8) 33 | pygame.draw.circle(surface, (0,0,0), circleMiddle, radius) 34 | pygame.draw.circle(surface, (0,0,0), circleMiddle2, radius) -------------------------------------------------------------------------------- /better_snake/old_env_2/environment.py: -------------------------------------------------------------------------------- 1 | from environment.cube import Cube 2 | from environment.snake import Snake 3 | 4 | import gym 5 | import pygame 6 | 7 | import numpy as np 8 | import random 9 | import enum 10 | import time 11 | 12 | #import tkinker as tk 13 | #from tkinter import messagebox 14 | 15 | # snake obs 16 | # body = head 0.9, b[0] = 0.8, b[1] = 0.79 ... 17 | 18 | 19 | W = 500 20 | H = 500 21 | BUFFER_SIZE = 1 22 | 23 | 24 | class Actions(enum.Enum): 25 | Up = 0 26 | Right = 1 27 | Down = 2 28 | Left = 3 29 | 30 | 31 | class SnakeEnvironment(gym.Env): 32 | 33 | def __init__(self, draw=True, speed=10000, rows=20, animation=True): 34 | super(SnakeEnvironment, self).__init__() 35 | 36 | 37 | self.observation_space = gym.spaces.Box(low=-1, high=1, shape=(rows, rows), dtype=np.uint8) 38 | self.action_space = gym.spaces.Discrete(n=len(Actions)) 39 | 40 | self.draw = draw 41 | self.speed = speed 42 | self.rows = rows 43 | self.animation = animation 44 | 45 | self.snake = Snake((255, 0, 0), (2, 2), self.rows, W) 46 | self.snack = Cube(self.random_snack(), self.rows, W, color=(0, 255, 0)) 47 | 48 | self.is_done = False 49 | self.reward = 0 50 | self.step_without_apple = 0 51 | 52 | self.surf = pygame.display.set_mode((W, H)) 53 | self.clock = pygame.time.Clock() 54 | 55 | if draw: 56 | pygame.init() 57 | self.font_game_over = pygame.font.SysFont("ani", 72) 58 | 59 | """ Must alwasy be calles in the beginning. 
""" 60 | def reset(self): 61 | self.countdown() 62 | 63 | self.snake.reset((2, 2)) 64 | self.snack = Cube(self.random_snack(), self.rows, W, color=(0, 255, 0)) 65 | self.is_done = False 66 | self.reward = 0 67 | self.step_without_apple = 0 68 | 69 | self.surf = pygame.display.set_mode((W, H)) 70 | self.clock = pygame.time.Clock() 71 | 72 | obs, reward, is_done, _ = self.step(1) 73 | 74 | return obs 75 | 76 | def step(self, action): 77 | pygame.time.delay(50) # lower is faster 78 | self.clock.tick(self.speed) # lower is slower 79 | 80 | current_reward = 0 81 | 82 | self.snake.move_ai(action) 83 | # self.snake.move_human() 84 | 85 | if self.snake.ate_itself(): 86 | current_reward = -1 87 | self.game_over() 88 | 89 | self.step_without_apple += 1 90 | if self.step_without_apple == 250: 91 | self.game_over() 92 | 93 | if self.snake.body[0].pos == self.snack.pos: 94 | self.snake.add_cube() 95 | self.snack = Cube(self.random_snack(), self.rows, W, color=(0, 255, 0)) 96 | self.reward += 1 97 | current_reward = 1 98 | self.step_without_apple = 0 99 | 100 | self.redraw_window() 101 | 102 | obs = self.get_observation_space() 103 | 104 | return obs, current_reward, self.is_done, {} 105 | 106 | def get_observation_space(self): 107 | 108 | new_obs = [] 109 | 110 | # create 2d matrix 111 | for i in range(self.rows): 112 | new_obs.append([]) 113 | for j in range(self.rows): 114 | new_obs[i].append(-1) 115 | 116 | # add apple 117 | x_apple = self.snack.pos[0] 118 | y_apple = self.snack.pos[1] 119 | new_obs[y_apple][x_apple] = 1 120 | 121 | # tail 122 | for i, c in enumerate(self.snake.body): 123 | x_snake = c.pos[0] 124 | y_snake = c.pos[1] 125 | 126 | if x_snake == -1 or x_snake == self.rows: 127 | print('Wtf, this error occured!') 128 | self.game_over() 129 | return 130 | if y_snake == -1 or y_snake == self.rows: 131 | print('Wtf, this error occured!') 132 | self.game_over() 133 | return 134 | 135 | new_obs[y_snake][x_snake] = 0.5 136 | 137 | # add snake head 138 | x_snake = self.snake.head.pos[0] 139 | y_snake = self.snake.head.pos[1] 140 | if x_snake == -1 or x_snake == self.rows: 141 | print('Wtf, this error occured!') 142 | self.game_over() 143 | return 144 | if y_snake == -1 or y_snake == self.rows: 145 | print('Wtf, this error occured!') 146 | self.game_over() 147 | return 148 | new_obs[y_snake][x_snake] = 0.8 149 | 150 | # current_obs = [] 151 | # for i in new_obs: 152 | # for j in i: 153 | # current_obs.append(j) 154 | 155 | # cnt = 0 156 | # for i in current_obs: 157 | # cnt += 1 158 | # print(' ', i, ' ', end='') 159 | # if cnt % self.rows == 0: 160 | # print('') 161 | # print('') 162 | 163 | # return_obs = np.array(current_obs) 164 | 165 | # print(new_obs) 166 | 167 | # time.sleep(10) 168 | 169 | return new_obs 170 | 171 | def draw_grid(self): 172 | size_btwn = W // self.rows 173 | 174 | x = 0 175 | y = 0 176 | 177 | for i in range(self.rows): 178 | x = x + size_btwn 179 | y = y + size_btwn 180 | 181 | pygame.draw.line(self.surf, (255, 255, 255), (x, 0), (x, W)) 182 | pygame.draw.line(self.surf, (255, 255, 255), (0, y), (W, y)) 183 | 184 | def redraw_window(self): 185 | if not self.draw: 186 | return 187 | 188 | self.surf.fill((0, 0, 0)) 189 | self.draw_grid() 190 | self.snake.draw(self.surf) 191 | self.snack.draw(self.surf) 192 | 193 | pygame.display.update() 194 | 195 | def random_snack(self): 196 | positions = self.snake.body 197 | 198 | while True: 199 | x = random.randrange(self.rows) 200 | y = random.randrange(self.rows) 201 | if len(list(filter(lambda z:z.pos == (x,y), positions))) > 0: 
202 | continue 203 | else: 204 | break 205 | return (x,y) 206 | 207 | def countdown(self): 208 | if not self.draw or not self.animation: 209 | return 210 | for _ in range(3, 0, -1): 211 | self.write_text("Start in {}".format(_)) 212 | time.sleep(0.3) 213 | 214 | def game_over(self): 215 | self.is_done = True 216 | if not self.draw or not self.animation: 217 | return 218 | self.write_text("Score {}".format(self.reward)) 219 | time.sleep(1.5) 220 | 221 | def write_text(self, text): 222 | self.redraw_window() 223 | text_start = pygame.font.SysFont('dyuthi', 80). \ 224 | render(text, True, (255, 255, 255)) 225 | self.surf.blit(text_start, 226 | (text_start.get_width() // 227 | 2, text_start.get_height() // 2)) 228 | pygame.display.flip() 229 | 230 | def play_human(self): 231 | self.countdown() 232 | 233 | while(not self.is_done): 234 | pygame.time.delay(50) # lower is faster 235 | self.clock.tick(self.speed) # lower is slower 236 | 237 | self.snake.move_human() 238 | 239 | if self.snake.ate_itself(): 240 | self.game_over() 241 | 242 | if self.snake.body[0].pos == self.snack.pos: 243 | self.snake.add_cube() 244 | self.snack = Cube(self.random_snack(), self.rows, W, color=(0, 255, 0)) 245 | self.reward += 1 246 | 247 | self.redraw_window() 248 | self.get_observation_space() 249 | 250 | 251 | if __name__ == "__main__": 252 | env = SnakeEnvironment(draw=True, speed=100, rows=5) 253 | env.play_human() 254 | 255 | 256 | 257 | 258 | ####### 259 | # if self.last_observation == None: 260 | # self.last_observation = current_obs 261 | 262 | # return_obs = [] 263 | 264 | # for i in self.last_observation: 265 | # return_obs.append(i) 266 | # for i in current_obs: 267 | # return_obs.append(i) 268 | 269 | # return_obs = np.array(return_obs) 270 | 271 | # cnt = 0 272 | # for i in return_obs: 273 | # cnt += 1 274 | # print(' ', i, ' ', end='') 275 | # if cnt % 10 == 0: 276 | # print('') 277 | # if cnt % 100 == 0: 278 | # print('') 279 | # print('') 280 | # print('') 281 | 282 | # self.last_observation = current_obs 283 | ####### 284 | -------------------------------------------------------------------------------- /better_snake/old_env_2/self_play.py: -------------------------------------------------------------------------------- 1 | # from environment.environment import SnakeEnvironment 2 | 3 | env = SnakeEnvironment(draw=True, speed=100000, rows=5) 4 | 5 | env.reset() 6 | terminal = False 7 | 8 | while not terminal: 9 | action = random.randint(0, 4) 10 | next_state, reward, is_done, _ = env.step(action) 11 | terminal = is_done 12 | -------------------------------------------------------------------------------- /better_snake/old_env_2/snake.py: -------------------------------------------------------------------------------- 1 | from environment.cube import Cube 2 | 3 | import pygame 4 | 5 | 6 | class Snake(object): 7 | 8 | body = [] 9 | turns = {} 10 | 11 | def __init__(self, color, pos, rows, w): 12 | self.head = Cube(pos, rows, w) 13 | self.body.append(self.head) 14 | 15 | self.rows = rows 16 | self.w = w 17 | 18 | self.color = color 19 | 20 | self.dirnx = 0 21 | self.dirny = 0 22 | 23 | self.add_cube() 24 | self.add_cube() 25 | 26 | def move_ai(self, action): 27 | x = self.head.pos[0] 28 | y = self.head.pos[1] 29 | 30 | if y == 0 and action == 0: 31 | action = -1 32 | elif x == self.rows -1 and action == 1: 33 | action = -1 34 | elif y == self.rows -1 and action == 2: 35 | action = -1 36 | elif x == 0 and action == 3: 37 | action = -1 38 | 39 | if action == -1: 40 | pass 41 | elif action == 0: 42 | 
self.dirnx = 0 43 | self.dirny = -1 44 | self.turns[self.head.pos[:]] = [self.dirnx, self.dirny] 45 | elif action == 1: 46 | self.dirnx = 1 47 | self.dirny = 0 48 | self.turns[self.head.pos[:]] = [self.dirnx, self.dirny] 49 | elif action == 2: 50 | self.dirnx = 0 51 | self.dirny = 1 52 | self.turns[self.head.pos[:]] = [self.dirnx, self.dirny] 53 | elif action == 3: 54 | self.dirnx = -1 55 | self.dirny = 0 56 | self.turns[self.head.pos[:]] = [self.dirnx, self.dirny] 57 | 58 | for i, c in enumerate(self.body): 59 | p = c.pos[:] 60 | if p in self.turns: 61 | turn = self.turns[p] 62 | c.move(turn[0],turn[1]) 63 | if i == len(self.body)-1: 64 | self.turns.pop(p) 65 | else: 66 | if c.dirnx == -1 and c.pos[0] <= 0: c.pos = (c.rows-1, c.pos[1]) 67 | elif c.dirnx == 1 and c.pos[0] >= c.rows-1: c.pos = (0,c.pos[1]) 68 | elif c.dirny == 1 and c.pos[1] >= c.rows-1: c.pos = (c.pos[0], 0) 69 | elif c.dirny == -1 and c.pos[1] <= 0: c.pos = (c.pos[0],c.rows-1) 70 | else: c.move(c.dirnx,c.dirny) 71 | 72 | def move_human(self): 73 | for event in pygame.event.get(): 74 | if event.type == pygame.QUIT: 75 | pygame.quit() 76 | 77 | keys = pygame.key.get_pressed() 78 | for key in keys: 79 | if keys[pygame.K_UP]: 80 | self.dirnx = 0 81 | self.dirny = -1 82 | self.turns[self.head.pos[:]] = [self.dirnx, self.dirny] 83 | elif keys[pygame.K_RIGHT]: 84 | self.dirnx = 1 85 | self.dirny = 0 86 | self.turns[self.head.pos[:]] = [self.dirnx, self.dirny] 87 | elif keys[pygame.K_DOWN]: 88 | self.dirnx = 0 89 | self.dirny = 1 90 | self.turns[self.head.pos[:]] = [self.dirnx, self.dirny] 91 | elif keys[pygame.K_LEFT]: 92 | self.dirnx = -1 93 | self.dirny = 0 94 | self.turns[self.head.pos[:]] = [self.dirnx, self.dirny] 95 | 96 | for i, c in enumerate(self.body): 97 | p = c.pos[:] 98 | if p in self.turns: 99 | turn = self.turns[p] 100 | c.move(turn[0],turn[1]) 101 | if i == len(self.body)-1: 102 | self.turns.pop(p) 103 | else: 104 | if c.dirnx == -1 and c.pos[0] <= 0: c.pos = (c.rows-1, c.pos[1]) 105 | elif c.dirnx == 1 and c.pos[0] >= c.rows-1: c.pos = (0,c.pos[1]) 106 | elif c.dirny == 1 and c.pos[1] >= c.rows-1: c.pos = (c.pos[0], 0) 107 | elif c.dirny == -1 and c.pos[1] <= 0: c.pos = (c.pos[0],c.rows-1) 108 | else: c.move(c.dirnx,c.dirny) 109 | 110 | def ate_itself(self): 111 | head = True 112 | for i, c in enumerate(self.body): 113 | if self.head.pos == c.pos and not head: 114 | return True 115 | head = False 116 | 117 | def reset(self, pos): 118 | self.head = Cube(pos, self.rows, self.w) 119 | self.body = [] 120 | self.body.append(self.head) 121 | self.turns = {} 122 | self.dirnx = 0 123 | self.dirny = 1 124 | self.add_cube() 125 | self.add_cube() 126 | 127 | def add_cube(self): 128 | tail = self.body[-1] 129 | dx, dy = tail.dirnx, tail.dirny 130 | 131 | if dx == 1 and dy == 0: 132 | self.body.append(Cube((tail.pos[0] -1, tail.pos[1]), self.rows, self.w)) 133 | elif dx == -1 and dy == 0: 134 | self.body.append(Cube((tail.pos[0] +1, tail.pos[1]), self.rows, self.w)) 135 | elif dx == 0 and dy == 1: 136 | self.body.append(Cube((tail.pos[0], tail.pos[1] -1), self.rows, self.w)) 137 | elif dx == 0 and dy == -1: 138 | self.body.append(Cube((tail.pos[0], tail.pos[1] +1), self.rows, self.w)) 139 | 140 | self.body[-1].dirnx = dx 141 | self.body[-1].dirny = dy 142 | 143 | def draw(self, surface): 144 | for i, c in enumerate(self.body): 145 | if i == 0: 146 | c.draw(surface, True) 147 | else: 148 | c.draw(surface) 149 | -------------------------------------------------------------------------------- /better_snake/ppo.py: 
-------------------------------------------------------------------------------- 1 | import gym 2 | import gym_snake 3 | 4 | from stable_baselines.common.vec_env import SubprocVecEnv, DummyVecEnv 5 | from stable_baselines.common.policies import MlpPolicy 6 | from stable_baselines import PPO1 7 | 8 | 9 | if __name__ == "__main__": 10 | # env = gym.make('snake-v0') 11 | env = gym.make('snake-v0') 12 | # env = SubprocVecEnv([lambda: env]) 13 | env = DummyVecEnv([lambda: env]) 14 | 15 | model = PPO1(MlpPolicy, env, verbose=1) 16 | 17 | model.learn(total_timesteps=500000) 18 | model.save('models/snake-bastard') 19 | 20 | ############################################################################### 21 | 22 | # env = gym.make('snake-v0') 23 | # # env = DummyVecEnv([lambda: env]) 24 | 25 | # # model = PPO2(MlpPolicy, env, verbose=1) 26 | # # model.load('models/snake-basterd') 27 | 28 | # obs = env.reset() 29 | # is_done = False 30 | 31 | # while not is_done: 32 | # action, _states = model.predict(obs) 33 | # obs, rewards, terminal, info = env.step(action) 34 | # is_done = terminal 35 | # env.render() 36 | -------------------------------------------------------------------------------- /flappyb/dqn_rainbow.py: -------------------------------------------------------------------------------- 1 | from environment.environment import Environment 2 | import ptan 3 | import argparse 4 | import numpy as np 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | import torch.optim as optim 11 | 12 | from tensorboardX import SummaryWriter 13 | 14 | from lib import dqn_model 15 | from lib import common 16 | 17 | 18 | MODEL_NAME = "flappyb-test-the-rainbow" 19 | NUMBER_NEURONS = 512 20 | WRITE = False 21 | 22 | # n-step 23 | REWARD_STEPS = 2 24 | 25 | # priority replay 26 | PRIO_REPLAY_ALPHA = 0.6 27 | BETA_START = 0.4 28 | BETA_FRAMES = 100000 29 | 30 | # C51 31 | Vmax = 10 32 | Vmin = -10 33 | N_ATOMS = 51 34 | DELTA_Z = (Vmax - Vmin) / (N_ATOMS - 1) 35 | 36 | 37 | class RainbowDQN(nn.Module): 38 | def __init__(self, input_shape, n_actions): 39 | super(RainbowDQN, self).__init__() 40 | 41 | self.fc_val = nn.Sequential( 42 | dqn_model.NoisyLinear(input_shape, NUMBER_NEURONS), 43 | nn.ReLU(), 44 | dqn_model.NoisyLinear(NUMBER_NEURONS, N_ATOMS) 45 | ) 46 | 47 | self.fc_adv = nn.Sequential( 48 | dqn_model.NoisyLinear(input_shape, NUMBER_NEURONS), 49 | nn.ReLU(), 50 | dqn_model.NoisyLinear(NUMBER_NEURONS, n_actions * N_ATOMS) 51 | ) 52 | 53 | self.register_buffer("supports", torch.arange(Vmin, Vmax+DELTA_Z, DELTA_Z)) 54 | self.softmax = nn.Softmax(dim=1) 55 | 56 | def forward(self, x): 57 | batch_size = x.size()[0] 58 | fx = x.float() / NUMBER_NEURONS 59 | val_out = self.fc_val(fx).view(batch_size, 1, N_ATOMS) 60 | adv_out = self.fc_adv(fx).view(batch_size, -1, N_ATOMS) 61 | adv_mean = adv_out.mean(dim=1, keepdim=True) 62 | return val_out + (adv_out - adv_mean) 63 | 64 | def both(self, x): 65 | cat_out = self(x) 66 | probs = self.apply_softmax(cat_out) 67 | weights = probs * self.supports 68 | res = weights.sum(dim=2) 69 | return cat_out, res 70 | 71 | def qvals(self, x): 72 | return self.both(x)[1] 73 | 74 | def apply_softmax(self, t): 75 | return self.softmax(t.view(-1, N_ATOMS)).view(t.size()) 76 | 77 | 78 | def calc_loss(batch, batch_weights, net, tgt_net, gamma, device="cpu"): 79 | states, actions, rewards, dones, next_states = common.unpack_batch(batch) 80 | batch_size = len(batch) 81 | 82 | states_v = torch.tensor(states).to(device) 83 | actions_v = 
torch.tensor(actions).to(device) 84 | next_states_v = torch.tensor(next_states).to(device) 85 | batch_weights_v = torch.tensor(batch_weights).to(device) 86 | 87 | # next state distribution 88 | # dueling arch -- actions from main net, distr from tgt_net 89 | 90 | # calc at once both next and cur states 91 | distr_v, qvals_v = net.both(torch.cat((states_v, next_states_v))) 92 | next_qvals_v = qvals_v[batch_size:] 93 | distr_v = distr_v[:batch_size] 94 | 95 | next_actions_v = next_qvals_v.max(1)[1] 96 | next_distr_v = tgt_net(next_states_v) 97 | next_best_distr_v = next_distr_v[range(batch_size), next_actions_v.data] 98 | next_best_distr_v = tgt_net.apply_softmax(next_best_distr_v) 99 | next_best_distr = next_best_distr_v.data.cpu().numpy() 100 | 101 | dones = dones.astype(np.bool) 102 | 103 | # project our distribution using Bellman update 104 | proj_distr = common.distr_projection(next_best_distr, rewards, dones, Vmin, Vmax, N_ATOMS, gamma) 105 | 106 | # calculate net output 107 | state_action_values = distr_v[range(batch_size), actions_v.data] 108 | state_log_sm_v = F.log_softmax(state_action_values, dim=1) 109 | proj_distr_v = torch.tensor(proj_distr).to(device) 110 | 111 | loss_v = -state_log_sm_v * proj_distr_v 112 | loss_v = batch_weights_v * loss_v.sum(dim=1) 113 | return loss_v.mean(), loss_v + 1e-5 114 | 115 | 116 | if __name__ == "__main__": 117 | params = common.HYPERPARAMS['flappyb'] 118 | params['epsilon_frames'] *= 2 119 | parser = argparse.ArgumentParser() 120 | parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda") 121 | args = parser.parse_args() 122 | device = torch.device("cuda" if args.cuda else "cpu") 123 | 124 | env = Environment(draw=False, fps=1, debug=False, 125 | dist_to_pipe=50, dist_between_pipes=180, obs_this_pipe=True) 126 | writer = None 127 | if WRITE: 128 | writer = SummaryWriter(comment="-" + params['run_name'] + "-rainbow") 129 | net = RainbowDQN(env.observation_space.n, env.action_space.n).to(device) 130 | tgt_net = ptan.agent.TargetNet(net) 131 | agent = ptan.agent.DQNAgent(lambda x: net.qvals(x), ptan.actions.ArgmaxActionSelector(), device=device) 132 | 133 | exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=REWARD_STEPS) 134 | buffer = ptan.experience.PrioritizedReplayBuffer(exp_source, params['replay_size'], PRIO_REPLAY_ALPHA) 135 | optimizer = optim.Adam(net.parameters(), lr=params['learning_rate']) 136 | 137 | frame_idx = 0 138 | beta = BETA_START 139 | 140 | with common.RewardTracker(MODEL_NAME, net, writer, params['stop_reward']) as reward_tracker: 141 | while True: 142 | frame_idx += 1 143 | buffer.populate(1) 144 | beta = min(1.0, BETA_START + frame_idx * (1.0 - BETA_START) / BETA_FRAMES) 145 | 146 | new_rewards = exp_source.pop_total_rewards() 147 | if new_rewards: 148 | if reward_tracker.reward(new_rewards[0], frame_idx): 149 | break 150 | 151 | if len(buffer) < params['replay_initial']: 152 | continue 153 | 154 | optimizer.zero_grad() 155 | batch, batch_indices, batch_weights = buffer.sample(params['batch_size'], beta) 156 | loss_v, sample_prios_v = calc_loss(batch, batch_weights, net, tgt_net.target_model, 157 | params['gamma'] ** REWARD_STEPS, device=device) 158 | loss_v.backward() 159 | optimizer.step() 160 | buffer.update_priorities(batch_indices, sample_prios_v.data.cpu().numpy()) 161 | 162 | if frame_idx % params['target_net_sync'] == 0: 163 | tgt_net.sync() 164 | -------------------------------------------------------------------------------- 
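
The loss in dqn_rainbow.py above hands the categorical Bellman projection off to common.distr_projection in lib/common.py, which is not shown in this listing. For reference, here is a minimal NumPy sketch of that standard C51 projection step; the argument order mirrors the call site, but this is an illustration, not the repository's actual implementation.

import numpy as np

def distr_projection_sketch(next_distr, rewards, dones, v_min, v_max, n_atoms, gamma):
    """Standard C51 projection: map r + gamma * z back onto the fixed support
    [v_min, v_max] with n_atoms bins (sketch of what lib/common.py is expected to do)."""
    batch_size = len(rewards)
    delta_z = (v_max - v_min) / (n_atoms - 1)
    proj_distr = np.zeros((batch_size, n_atoms), dtype=np.float32)
    not_done = 1.0 - dones.astype(np.float32)        # terminal targets collapse onto the reward
    for atom in range(n_atoms):
        z_j = v_min + atom * delta_z
        # Bellman-updated value of this atom, clipped back into the support
        tz_j = np.clip(rewards + not_done * gamma * z_j, v_min, v_max)
        b_j = np.clip((tz_j - v_min) / delta_z, 0.0, n_atoms - 1)   # fractional bin index
        lower = np.floor(b_j).astype(np.int64)
        upper = np.ceil(b_j).astype(np.int64)
        hit = lower == upper                         # lands exactly on one bin
        proj_distr[hit, lower[hit]] += next_distr[hit, atom]
        split = ~hit                                 # otherwise share mass between neighbours
        proj_distr[split, lower[split]] += next_distr[split, atom] * (upper - b_j)[split]
        proj_distr[split, upper[split]] += next_distr[split, atom] * (b_j - lower)[split]
    return proj_distr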
/flappyb/dqn_v2.py: -------------------------------------------------------------------------------- 1 | # Made with the help of 2 | # https://towardsdatascience.com/cartpole-introduction-to-reinforcement-learning-ed0eb5b58288 3 | 4 | import random 5 | import numpy as np 6 | from collections import deque 7 | 8 | import keras 9 | from keras.models import Sequential 10 | from keras.layers import Dense 11 | from keras.optimizers import Adam 12 | from keras.models import load_model 13 | 14 | from environment.environment import Environment 15 | 16 | # from tensorboardX import SummaryWriter 17 | 18 | GAMMA = 0.9 # try .99 19 | LEARNING_RATE = 0.001 # default is 0.001 20 | LEARNING_WITH_DECAY = 0.01 21 | 22 | MEMORY_SIZE = 1000000 23 | BATCH_SIZE = 32 24 | 25 | EXPLORATION_MAX = 0.5 26 | EXPLORATION_MIN = 0.01 27 | EXPLORATION_DECAY = 0.99995 28 | 29 | # PARAMETERS ################################################################## 30 | LEARN = True # False if using a trained model 31 | 32 | NAME = 'dqn-loadedHARDCORE=6300-grav=2.5-dist-pipes=220-expdecay=0.99995-gamma=.9-batchsize=20-nn=512-lr=0.001-normalization-nextPipe' 33 | WRITE = False # Only for training 34 | DRAW = False # Only for training 35 | SAVE_MODEL = True # Only for training 36 | 37 | OBS_THIS_PIPE_PLAY = False # False for HELL, changes observation 38 | OBS_THIS_PIPE_LEARN = False 39 | DIFFICULTY_PLAY = 45 # 160 is easy, 70 is hardcore, 45 is hell 40 | DIFFICULTY_LEARN = 45 41 | 42 | DIST_BETWEEN_PIPES = 220 # default is 220 43 | 44 | # Here you can load trained models: 45 | 46 | # LOAD_NAME = 'dqn-expdecay=0.99995-gamma=.9-batchsize=20-nn=512-lr=0.001-normalization-PART=6650' # 950 is oke # 1050 is oke # 6650 My baby is back <3 # 2600 is pretty good # 6300 is god 47 | # LOAD_NAME = 'dqn-expdecay=0.999995-gamma=.9-batchsize=20-nn=512-lr=0.001-normalization-HARDCORE-PART=6300' 48 | LOAD_NAME = 'dqn-expdecay=0.99995-gamma=.9-batchsize=20-nn=512-lr=0.001-normalization-LOADED=HARDCORE-6300-lrMax=0.4-nextPipe-HELL-PART=1000' # NEXT_PIPE # 1000 is really good! 49 | ##################################################################################################### 50 | 51 | 52 | class DQNSolver: 53 | 54 | def __init__(self, observation_space, action_space, model=None): 55 | self.exploration_rate = EXPLORATION_MAX 56 | 57 | self.action_space = action_space 58 | self.memory = deque(maxlen=MEMORY_SIZE) 59 | 60 | if model is None: 61 | print('new model') 62 | self.model = Sequential() 63 | # andere aktivierungs funktion 64 | self.model.add(Dense(512, input_shape=( 65 | observation_space,), activation="relu")) 66 | self.model.add(Dense(512, activation="relu")) 67 | # self.model.add(Dropout(0.85)) 68 | # self.model.add(Dense(512, activation="relu")) 69 | # Linear sucks? 
maybe try softmax 70 | self.model.add(Dense(self.action_space, activation="linear")) 71 | self.model.compile(loss="mse", optimizer=Adam( 72 | lr=LEARNING_RATE)) # Try learning rate deacy 73 | # self.model.compile(loss="mse", optimizer=Adam(lr=LEARNING_WITH_DECAY, decay=1e-6)) 74 | else: 75 | print('saved model loaded') 76 | self.model = model 77 | 78 | def remember(self, state, action, reward, next_state, done): 79 | self.memory.append((state, action, reward, next_state, done)) 80 | 81 | def act(self, state, env): 82 | if np.random.rand() < self.exploration_rate: 83 | return env.get_action_random() 84 | q_values = self.model.predict(state) 85 | return np.argmax(q_values[0]) 86 | 87 | def act_free(self, state): 88 | q_values = self.model.predict(state) 89 | return np.argmax(q_values[0]) 90 | 91 | def experience_replay(self): 92 | if len(self.memory) < BATCH_SIZE: 93 | return 94 | batch = random.sample(self.memory, BATCH_SIZE) 95 | for state, action, reward, state_next, terminal in batch: 96 | q_update = reward 97 | if not terminal: 98 | q_update = (reward + GAMMA * 99 | np.amax(self.model.predict(state_next)[0])) 100 | q_values = self.model.predict(state) 101 | q_values[0][action] = q_update 102 | self.model.fit(state, q_values, verbose=0) 103 | self.exploration_rate *= EXPLORATION_DECAY 104 | self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate) 105 | 106 | 107 | def learn_flappyb(): 108 | env = Environment(draw=DRAW, fps=1, debug=False, 109 | dist_to_pipe=DIFFICULTY_LEARN, 110 | dist_between_pipes=DIST_BETWEEN_PIPES, 111 | obs_this_pipe=OBS_THIS_PIPE_LEARN) 112 | writer = None 113 | if WRITE: 114 | writer = SummaryWriter(comment=NAME) 115 | observation_space = env.get_observation_size_buffer() 116 | action_space = env.get_action_size() 117 | 118 | model = load_model('models/dqn/{}.h5'.format(LOAD_NAME)) 119 | dqn_solver = DQNSolver(observation_space, action_space, model) 120 | run = 0 121 | 122 | if SAVE_MODEL: 123 | name = '{}-PART={}'.format(NAME, run) 124 | dqn_solver.model.save('models/dqn/{}.h5'.format(name)) 125 | while True: 126 | run += 1 127 | state = env.reset() 128 | state = np.reshape(state, [1, observation_space]) 129 | step = 0 130 | reward_score = 0 131 | 132 | while True: 133 | step += 1 134 | action = dqn_solver.act(state, env) 135 | state_next, reward, terminal, info = env.step_buffer(action) 136 | reward_score += reward 137 | state_next = np.reshape(state_next, [1, observation_space]) 138 | dqn_solver.remember(state, action, reward, state_next, terminal) 139 | state = state_next 140 | if terminal: 141 | print("Run: " + str(run) + ", exploration: " + 142 | str(dqn_solver.exploration_rate) + ", score: " + 143 | str(reward_score)) 144 | if WRITE: 145 | writer.add_scalar("reward", reward_score, run) 146 | break 147 | dqn_solver.experience_replay() 148 | if (run % 100 == 0) and SAVE_MODEL: 149 | name = '{}-PART={}'.format(NAME, run) 150 | dqn_solver.model.save('models/dqn/{}.h5'.format(name)) 151 | if WRITE: 152 | writer.close() 153 | 154 | 155 | def play_flappyb(): 156 | env = Environment(draw=True, fps=1, debug=True, 157 | dist_to_pipe=DIFFICULTY_PLAY, 158 | dist_between_pipes=DIST_BETWEEN_PIPES, 159 | obs_this_pipe=OBS_THIS_PIPE_PLAY) 160 | 161 | observation_space = env.get_observation_size_buffer() 162 | action_space = env.get_action_size() 163 | 164 | model = keras.models.load_model('models/dqn/{}.h5'.format(LOAD_NAME)) 165 | dqn_solver = DQNSolver(observation_space, action_space, model) 166 | 167 | for i in range(20): 168 | state = env.reset() 169 | state 
= np.reshape(state, [1, observation_space]) 170 | is_done = False 171 | while not is_done: 172 | action = dqn_solver.act_free(state) 173 | # action = env.get_action_random() 174 | state_next, reward, terminal, info = env.step_buffer(action) 175 | is_done = terminal 176 | state = np.reshape(state_next, [1, observation_space]) 177 | 178 | 179 | if __name__ == "__main__": 180 | if LEARN: 181 | learn_flappyb() 182 | else: 183 | play_flappyb() 184 | 185 | print('Jobe Done!') 186 | -------------------------------------------------------------------------------- /flappyb/dqn_v3.py: -------------------------------------------------------------------------------- 1 | from environment.environment import Environment 2 | import time 3 | import numpy as np 4 | import collections 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.optim as optim 9 | from tensorboardX import SummaryWriter 10 | 11 | NAME = 'flappyb-newdqn' 12 | GPU = False 13 | 14 | MEAN_REWARD_BOUND = 500000 15 | 16 | GAMMA = 0.99 17 | BATCH_SIZE = 32 18 | REPLAY_SIZE = 10000 19 | LEARNING_RATE = 1e-4 20 | SYNC_TARGET_FRAMES = 1000 21 | REPLAY_START_SIZE = 10000 22 | 23 | EPSILON_DECAY_LAST_FRAME = 10**5 24 | EPSILON_START = 1.0 25 | EPSILON_FINAL = 0.02 26 | 27 | 28 | class DQN(nn.Module): 29 | def __init__(self, input_shape, n_actions): 30 | super(DQN, self).__init__() 31 | 32 | self.fc = nn.Sequential( 33 | nn.Linear(input_shape, 512), 34 | nn.ReLU(), 35 | nn.Linear(512, n_actions) 36 | ) 37 | 38 | def forward(self, x): 39 | return self.fc(x) 40 | 41 | 42 | Experience = collections.namedtuple('Experience', field_names=[ 43 | 'state', 'action', 'reward', 'done', 'new_state']) 44 | 45 | 46 | class ExperienceBuffer: 47 | def __init__(self, capacity): 48 | self.buffer = collections.deque(maxlen=capacity) 49 | 50 | def __len__(self): 51 | return len(self.buffer) 52 | 53 | def append(self, experience): 54 | self.buffer.append(experience) 55 | 56 | def sample(self, batch_size): 57 | indices = np.random.choice(len(self.buffer), batch_size, replace=True) 58 | states, actions, rewards, dones, next_states = \ 59 | zip(*[self.buffer[idx] for idx in indices]) 60 | return np.array(states), np.array(actions), np.array( 61 | rewards, dtype=np.float32), np.array(dones, dtype=np.uint8), \ 62 | np.array(next_states) 63 | 64 | 65 | class Agent: 66 | def __init__(self, env, exp_buffer): 67 | self.env = env 68 | self.exp_buffer = exp_buffer 69 | self._reset() 70 | 71 | def _reset(self): 72 | self.state = env.reset() 73 | self.total_reward = 0.0 74 | 75 | def play_step(self, net, epsilon=0.0, device="cpu"): 76 | done_reward = None 77 | 78 | if np.random.random() < epsilon: 79 | action = env.get_action_random() 80 | else: 81 | state_a = np.array([self.state], copy=False) 82 | state_v = torch.tensor(state_a[0]).to(device) 83 | q_vals_v = net(state_v.float()) 84 | _, act_v = torch.max(q_vals_v, dim=1) 85 | action = int(act_v.item()) 86 | 87 | # do step in the environment 88 | new_state, reward, is_done, _ = self.env.step_buffer(action) 89 | self.total_reward += reward 90 | 91 | exp = Experience(self.state, action, reward, is_done, new_state) 92 | self.exp_buffer.append(exp) 93 | self.state = new_state 94 | if is_done: 95 | done_reward = self.total_reward 96 | self._reset() 97 | return done_reward 98 | 99 | 100 | def calc_loss(batch, net, tgt_net, device="cpu"): 101 | states, actions, rewards, dones, next_states = batch 102 | 103 | states_v = torch.tensor(states).to(device) 104 | next_states_v = torch.tensor(next_states).to(device) 105 | actions_v = 
torch.tensor(actions).to(device) 106 | rewards_v = torch.tensor(rewards).to(device) 107 | done_mask = torch.ByteTensor(dones).to(device) 108 | 109 | state_action_values = net(states_v.float()).gather( 110 | 1, actions_v.unsqueeze(-1)).squeeze(-1) 111 | next_state_values = tgt_net(next_states_v.float()).max(1)[0] 112 | next_state_values[done_mask] = 0.0 113 | next_state_values = next_state_values.detach() 114 | 115 | expected_state_action_values = next_state_values * GAMMA + rewards_v 116 | 117 | return nn.MSELoss()(state_action_values, expected_state_action_values) 118 | 119 | 120 | if __name__ == "__main__": 121 | device = torch.device("cuda" if GPU else "cpu") 122 | 123 | env = Environment(draw=False, fps=1, debug=False, 124 | dist_to_pipe=50, dist_between_pipes=180, 125 | obs_this_pipe=False) 126 | 127 | net = DQN(env.get_observation_size(), env.get_action_size()).to(device) 128 | tgt_net = DQN(env.get_observation_size(), env.get_action_size()).to(device) 129 | writer = SummaryWriter(comment="-" + NAME) 130 | print(net) 131 | 132 | buffer = ExperienceBuffer(REPLAY_SIZE) 133 | agent = Agent(env, buffer) 134 | epsilon = EPSILON_START 135 | 136 | optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE) 137 | total_rewards = [] 138 | frame_idx = 0 139 | ts_frame = 0 140 | ts = time.time() 141 | best_mean_reward = None 142 | 143 | while True: 144 | frame_idx += 1 145 | epsilon = max(EPSILON_FINAL, 146 | EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME) 147 | 148 | reward = agent.play_step(net, epsilon, device=device) 149 | if reward is not None: 150 | total_rewards.append(reward) 151 | speed = (frame_idx - ts_frame) / (time.time() - ts) 152 | ts_frame = frame_idx 153 | ts = time.time() 154 | mean_reward = np.mean(total_rewards[-100:]) 155 | print("%d: done %d games, mean reward %.3f, eps %.2f, speed %.2f f/s" % ( 156 | frame_idx, len(total_rewards), mean_reward, epsilon, 157 | speed 158 | )) 159 | writer.add_scalar("epsilon", epsilon, frame_idx) 160 | writer.add_scalar("speed", speed, frame_idx) 161 | writer.add_scalar("reward_100", mean_reward, frame_idx) 162 | writer.add_scalar("reward", reward, frame_idx) 163 | 164 | if best_mean_reward is None or best_mean_reward < mean_reward: 165 | torch.save(net.state_dict(), NAME + "-best.dat") 166 | if best_mean_reward is not None: 167 | print("Best mean reward updated %.3f -> %.3f, model saved" % (best_mean_reward, mean_reward)) 168 | best_mean_reward = mean_reward 169 | if mean_reward > MEAN_REWARD_BOUND: 170 | print("Solved in %d frames!" 
% frame_idx) 171 | break 172 | 173 | if len(buffer) < REPLAY_START_SIZE: 174 | continue 175 | 176 | if frame_idx % SYNC_TARGET_FRAMES == 0: 177 | tgt_net.load_state_dict(net.state_dict()) 178 | 179 | optimizer.zero_grad() 180 | batch = buffer.sample(BATCH_SIZE) 181 | loss_t = calc_loss(batch, net, tgt_net, device=device) 182 | loss_t.backward() 183 | optimizer.step() 184 | writer.close() 185 | -------------------------------------------------------------------------------- /flappyb/environment/assets/Pong-653x400.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/environment/assets/Pong-653x400.png -------------------------------------------------------------------------------- /flappyb/environment/assets/all_fonts_script.py: -------------------------------------------------------------------------------- 1 | import pygame 2 | import time 3 | 4 | pygame.init() 5 | pygame.display.set_caption('display all fonts') 6 | screen = pygame.display.set_mode((1800, 1200)) 7 | 8 | y = 150 9 | x = 0 10 | mod = 0 11 | 12 | for font in pygame.font.get_fonts(): 13 | if font == 'notocoloremoji': 14 | continue 15 | if font == 'kacstoffice': 16 | break 17 | 18 | if mod == 0: 19 | x = 200 20 | if mod == 1: 21 | x = 600 22 | if mod == 2: 23 | x = 1100 24 | if mod == 3: 25 | x = 1600 26 | y+=25 27 | mod = -1 28 | mod += 1 29 | 30 | text = pygame.font.SysFont(font, 17).render("{}: GAME 123 !".format(font), True, (255,255,255)) 31 | screen.blit(text, (x - text.get_width() // 2, y - text.get_height() // 2)) 32 | 33 | 34 | 35 | 36 | pygame.display.flip() 37 | time.sleep(200) 38 | -------------------------------------------------------------------------------- /flappyb/environment/assets/bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/environment/assets/bg.png -------------------------------------------------------------------------------- /flappyb/environment/assets/bird.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/environment/assets/bird.png -------------------------------------------------------------------------------- /flappyb/environment/assets/pipe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/environment/assets/pipe.png -------------------------------------------------------------------------------- /flappyb/environment/assets/pipe_long.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/environment/assets/pipe_long.png -------------------------------------------------------------------------------- /flappyb/environment/assets/sapcraft.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/environment/assets/sapcraft.jpg -------------------------------------------------------------------------------- 
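
The dqn_v3.py script above only trains and saves its best weights to NAME + "-best.dat". A hypothetical playback loop for those weights, reusing the names defined in dqn_v3.py and assuming that training script runs unchanged, could look like the following (nothing below is part of the repository):

# Hypothetical playback sketch for the dqn_v3.py weights -- not part of the repo.
import numpy as np
import torch

from dqn_v3 import DQN, NAME
from environment.environment import Environment

# Same environment settings as during training, so observation sizes match.
env = Environment(draw=True, fps=10, debug=False,
                  dist_to_pipe=50, dist_between_pipes=180, obs_this_pipe=False)
net = DQN(env.get_observation_size(), env.get_action_size())
net.load_state_dict(torch.load(NAME + "-best.dat", map_location="cpu"))
net.eval()

state = env.reset()
is_done = False
while not is_done:
    with torch.no_grad():
        state_v = torch.tensor(np.array(state, dtype=np.float32)).unsqueeze(0)  # batch of one
        q_vals = net(state_v)                              # shape (1, n_actions)
    action = int(torch.argmax(q_vals, dim=1).item())
    state, reward, is_done, _ = env.step_buffer(action)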
/flappyb/environment/bird.py: -------------------------------------------------------------------------------- 1 | import pygame 2 | 3 | 4 | class Bird(): 5 | 6 | def __init__(self, screen, s_width, s_height, color): 7 | 8 | self.bird_image = pygame.image.load("environment/assets/bird.png") 9 | self.rotate = 1 10 | 11 | self.screen = screen 12 | self.s_width = s_width 13 | self.s_height = s_height 14 | self.color = color 15 | 16 | self.radius = 20 17 | # self.radius = 10 18 | 19 | self.x = 50 20 | self.y = int(s_height / 2) 21 | 22 | self.vel = 0 23 | self.gravity = 2 # default is 1 24 | 25 | self.bottom = s_height - 20 26 | self.vel_cap = 20 27 | 28 | self.salto = False 29 | self.rotation = 0 30 | self.last_rotation = 0 31 | self.last_reward = 0 32 | 33 | def handle_events_human(self): 34 | keys = pygame.key.get_pressed() 35 | if keys[pygame.K_SPACE]: 36 | self._fly() 37 | 38 | def handle_events_ai(self, action): 39 | if action == 1: 40 | self._fly() 41 | else: 42 | pass 43 | 44 | def draw(self, reward): 45 | 46 | surf = None 47 | 48 | if reward % 10 == 0 and reward is not self.last_reward: 49 | self.last_reward = reward 50 | self.salto = True 51 | 52 | if self.salto: 53 | if self.last_rotation >= 0: 54 | self.rotation += 15 55 | surf = pygame.transform.rotate(self.bird_image, self.rotation) 56 | if self.rotation == 400: 57 | self.salto = False 58 | else: 59 | self.rotation -= 15 60 | surf = pygame.transform.rotate(self.bird_image, self.rotation) 61 | if self.rotation == -400: 62 | self.salto = False 63 | 64 | elif self.vel > 0: 65 | self.rotation = -40 66 | self.last_rotation = self.rotation 67 | surf = pygame.transform.rotate(self.bird_image, self.rotation) 68 | 69 | else: 70 | self.rotation = 40 71 | self.last_rotation = self.rotation 72 | surf = pygame.transform.rotate(self.bird_image, self.rotation) 73 | 74 | self.screen.blit(surf, (self.x - 25, self.y - 20)) 75 | 76 | def update(self): 77 | self.vel += self.gravity 78 | self.y += self.vel 79 | 80 | if self.y > self.s_height: 81 | return True 82 | 83 | if self.y < 0: 84 | self.y = 0 85 | 86 | if self.vel > 20: 87 | self.vel = 20 88 | 89 | return False 90 | 91 | def _fly(self): 92 | self.vel += -self.gravity * 2 93 | 94 | if self.vel > 20: 95 | self.vel = 20 96 | -------------------------------------------------------------------------------- /flappyb/environment/environment.py: -------------------------------------------------------------------------------- 1 | # Game was made with the help of https://www.youtube.com/watch?v=cXgA1d_E-jY 2 | import time 3 | import pygame 4 | import gym 5 | import gym.spaces 6 | import enum 7 | 8 | import numpy as np 9 | 10 | from environment.bird import Bird 11 | from environment.pipe import Pipe 12 | 13 | # AI PARAMETERS ########################################################## 14 | BUFFER_SIZE = 4 15 | OBSERVATION_SIZE = 5 16 | ACTIONS = [0, 1] 17 | ACTION_SIZE = 2 18 | ROUND_TO_DECIMALS = 2 19 | 20 | # GAME PARAMETERS ######################################################## 21 | SCREEN_SIZE = WIDTH, HEIGHT = (640, 880) 22 | BACKGROUND = (146, 183, 254) 23 | BIRD_COLOR = (241, 213, 19) 24 | PIPE_COLOR = (44, 176, 26) 25 | FONT = 'dyuthi' 26 | 27 | 28 | """ 29 | Interace: 30 | reset(): resets the whole environment 31 | step(action): performs one action onto the environment 32 | step_buffer(action): performs one action on the environment, 33 | returns 4 states 34 | get_action_random(): obtain an imporoved random action 35 | get_observation_size(): obtain size of observation 36 | get_action_size(): 
obtain size of action 37 | """ 38 | 39 | 40 | class Actions(enum.Enum): 41 | Skip = 0 42 | Fly = 1 43 | 44 | 45 | class Environment(gym.Env): 46 | 47 | def __init__(self, draw=True, fps=10, debug=False, 48 | dist_to_pipe=150, dist_between_pipes=220, obs_this_pipe=True): 49 | 50 | super(Environment, self).__init__() 51 | self.observation_space = gym.spaces.Discrete(n=OBSERVATION_SIZE * BUFFER_SIZE) 52 | self.action_space = gym.spaces.Discrete(n=len(Actions)) 53 | 54 | self.pipe_image_up = None 55 | self.pipe_image_down = None 56 | 57 | if draw: 58 | pygame.init() 59 | pygame.display.set_caption('NN FlappyB') 60 | 61 | self.font_game_over = pygame.font.SysFont("ani", 72) 62 | self.bg = pygame.image.load("environment/assets/bg.png") 63 | 64 | self.pipe_image_up = pygame.image.load( 65 | "environment/assets/pipe.png") # 52x808 66 | self.pipe_image_down = pygame.image.load( 67 | "environment/assets/pipe_long.png") # 52x808< 68 | 69 | self.dist_between_pipes = dist_between_pipes 70 | 71 | self.fps = fps 72 | self.debug = debug 73 | self.draw = draw 74 | self.dist_to_pipe = dist_to_pipe 75 | self.obs_this_pipe = obs_this_pipe 76 | 77 | self.clock = pygame.time.Clock() 78 | self.time_elapsed_since_last_action = 0 79 | self.global_time = 0 80 | 81 | self.screen = pygame.display.set_mode(SCREEN_SIZE) 82 | 83 | self.bird = Bird(self.screen, WIDTH, HEIGHT, BIRD_COLOR) 84 | self.pipes = [Pipe(self.screen, WIDTH, HEIGHT, 85 | PIPE_COLOR, self.dist_between_pipes, 86 | self.pipe_image_up, self.pipe_image_down)] 87 | 88 | self.reward = 0 89 | self.is_done = False 90 | self.printed_score = False 91 | 92 | # ML INTERFACE ########################################################### 93 | def reset(self): 94 | 95 | self.clock = pygame.time.Clock() 96 | self.time_elapsed_since_last_action = 0 97 | self.global_time = 0 98 | 99 | self.bird = Bird(self.screen, WIDTH, HEIGHT, BIRD_COLOR) 100 | self.pipes = [Pipe(self.screen, WIDTH, HEIGHT, 101 | PIPE_COLOR, self.dist_between_pipes, 102 | self.pipe_image_up, self.pipe_image_down)] 103 | 104 | self.reward = 0 105 | self.is_done = False 106 | self.printed_score = False 107 | 108 | obs, reward, is_done, _ = self.step(0) 109 | 110 | return obs 111 | 112 | # def step(self, action): 113 | 114 | # while not self.time_elapsed_since_last_action > self.fps: 115 | # dt = self.clock.tick() 116 | # self.time_elapsed_since_last_action += dt 117 | 118 | # self.global_time += 1 119 | 120 | # obs, rew, d, _ = self.run_ai_game_step(action) 121 | 122 | # if rew >= 1: 123 | # rew = 0.25 124 | # elif rew <= -1: 125 | # rew = -0.25 126 | # else: 127 | # rew = 0.025 128 | 129 | # return obs, rew, d, _ 130 | 131 | def step(self, action): 132 | 133 | if isinstance(action, np.ndarray): 134 | if action[0] > action[1]: 135 | action = 0 136 | else: 137 | action = 1 138 | 139 | obs = [] 140 | rew = 0 141 | 142 | for i in range(BUFFER_SIZE): 143 | while not self.time_elapsed_since_last_action > self.fps: 144 | dt = self.clock.tick() 145 | self.time_elapsed_since_last_action += dt 146 | 147 | self.global_time += 1 148 | o, r, d, _ = self.run_ai_game_step(action) 149 | rew += r 150 | 151 | for j in range(len(o)): 152 | obs.append(o[j]) 153 | 154 | if rew > 1: 155 | rew = 1 156 | elif rew < -1: 157 | rew = -1 158 | else: 159 | rew = 0.1 160 | 161 | obs = np.array(obs) 162 | 163 | return obs, rew, d, _ 164 | 165 | # The actual game step ################################################### 166 | def run_ai_game_step(self, action): 167 | 168 | current_reward = 0.1 169 | 170 | if self.global_time % 
self.dist_to_pipe == 0: 171 | self.pipes.append(Pipe(self.screen, WIDTH, HEIGHT, 172 | PIPE_COLOR, self.dist_between_pipes, 173 | self.pipe_image_up, self.pipe_image_down)) 174 | 175 | for pipe in self.pipes: 176 | pipe.update() 177 | 178 | if pipe.off_screen(): 179 | self.pipes.remove(pipe) 180 | 181 | if pipe.hits(self.bird): 182 | self.game_over() 183 | current_reward = -1 184 | # hit_pipe = True 185 | 186 | if pipe.behind(self.bird): 187 | self.reward += 1 188 | current_reward = 1 189 | 190 | self.bird.handle_events_ai(action) 191 | if self.bird.update(): 192 | self.game_over() 193 | current_reward = -1 194 | 195 | if self.draw: 196 | # self.screen.fill(BACKGROUND) 197 | self.screen.blit(self.bg, (0, 0)) 198 | for pipe in self.pipes: 199 | pipe.draw() 200 | self.bird.draw(self.reward) 201 | text = pygame.font.SysFont(FONT, 28).render( 202 | "SCORE {}".format(self.reward), True, (0, 0, 0)) 203 | self.screen.blit(text, (565 - text.get_width() // 204 | 2, 30 - text.get_height() // 2)) 205 | pygame.display.flip() 206 | 207 | obs = self.get_observation_space() 208 | 209 | if self.draw: 210 | pygame.display.update() 211 | 212 | self.time_elapsed_since_last_action = 0 213 | 214 | return obs, current_reward, self.is_done, None 215 | ########################################################################## 216 | 217 | def get_observation_space(self): 218 | 219 | my_pipe = Pipe(self.screen, WIDTH, HEIGHT, PIPE_COLOR, 220, None, None) 220 | my_pipe.x = 9999 221 | 222 | if self.obs_this_pipe: 223 | for pipe in self.pipes: 224 | if (pipe.x < my_pipe.x) and pipe.x >= (self.bird.x - pipe.width): 225 | my_pipe = pipe 226 | else: 227 | for pipe in self.pipes: 228 | if (pipe.x < my_pipe.x) and pipe.x >= (self.bird.x): # target next pipe immediately 229 | my_pipe = pipe 230 | 231 | e1 = self.bird.y # bird pos 232 | e2 = self.bird.vel # bird vel 233 | e3 = my_pipe.x - self.bird.x # dist to Pipe 234 | e4 = my_pipe.top # pipe top 235 | e5 = my_pipe.bot # pipe bot 236 | 237 | if self.draw and self.debug: 238 | e_d1 = pygame.rect.Rect(self.bird.x, e1, 2, HEIGHT - e1) 239 | pygame.draw.rect(self.screen, (255, 0, 0), e_d1) 240 | 241 | e_d2 = pygame.rect.Rect(self.bird.x - self.bird.radius, 242 | e2 * 2 + HEIGHT / 2, self.bird.x + self.bird.radius, 5) 243 | pygame.draw.rect(self.screen, (255, 0, 0), e_d2) 244 | 245 | e_d3 = pygame.rect.Rect(self.bird.x, self.bird.y, e3, 2) 246 | pygame.draw.rect(self.screen, (255, 0, 0), e_d3) 247 | 248 | e_d4 = pygame.rect.Rect(my_pipe.x - 5, e4, my_pipe.width + 10, 5) 249 | pygame.draw.rect(self.screen, (255, 0, 0), e_d4) 250 | 251 | e_d5 = pygame.rect.Rect(my_pipe.x - 5, e5, my_pipe.width + 10, 5) 252 | pygame.draw.rect(self.screen, (255, 0, 0), e_d5) 253 | 254 | # Normalization ### 255 | e1 = e1 / HEIGHT 256 | e2 = e2 / self.bird.vel_cap 257 | e3 = e3 / (WIDTH - 50) 258 | e4 = e4 / HEIGHT 259 | e5 = e5 / HEIGHT 260 | 261 | # Nomralizatoin with rounding 262 | # e1 = round(e1, ROUND_TO_DECIMALS) 263 | # e2 = round(e2, ROUND_TO_DECIMALS) 264 | # e3 = round(e3, ROUND_TO_DECIMALS) 265 | # e4 = round(e4, ROUND_TO_DECIMALS) 266 | # e5 = round(e5, ROUND_TO_DECIMALS) 267 | 268 | obs = (e1, e2, e3, e4, e5) 269 | # print(obs) 270 | 271 | return obs 272 | 273 | def get_action_random(self): 274 | action = np.random.choice((0, 1), 1, p=(0.45, 0.55)) 275 | return action.item(0) 276 | 277 | def get_observation_size(self): 278 | return OBSERVATION_SIZE 279 | 280 | def get_observation_size_buffer(self): 281 | return OBSERVATION_SIZE * BUFFER_SIZE 282 | 283 | def get_actions(self): 284 | 
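# Raw action encoding shared by all agents: 0 = skip (do nothing), 1 = fly,
# matching the Actions enum defined at the top of this file.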
return ACTIONS 285 | 286 | def get_action_size(self): 287 | return ACTION_SIZE 288 | 289 | def game_over(self): 290 | if not self.printed_score: 291 | # print('Score: {}'.format(self.reward)) 292 | self.printed_score = True 293 | 294 | if self.draw: 295 | text = pygame.font.SysFont(FONT, 28).render( 296 | "Game Over!".format(self.reward), True, (0, 0, 0)) 297 | self.screen.blit(text, (320 - text.get_width() // 298 | 2, 240 - text.get_height() // 2)) 299 | pygame.display.flip() 300 | time.sleep(0.4) 301 | self.is_done = True 302 | 303 | # HUMAN STUFF ################################################ 304 | 305 | def run_human_game(self): 306 | 307 | if self.draw: 308 | for _ in range(3,0,-1): 309 | self.screen.blit(self.bg, (0, 0)) 310 | self.bird.draw(self.reward) 311 | text_start = pygame.font.SysFont(FONT, 80).render( 312 | "Start in {}".format(_), True, (0, 0, 0)) 313 | self.screen.blit(text_start, (text_start.get_width() // 314 | 2, text_start.get_height() // 2)) 315 | pygame.display.flip() 316 | time.sleep(0.3) 317 | 318 | while not self.is_done: 319 | 320 | while not self.time_elapsed_since_last_action > self.fps: 321 | dt = self.clock.tick() 322 | self.time_elapsed_since_last_action += dt 323 | 324 | self.global_time += 1 325 | 326 | self.screen.fill(BACKGROUND) 327 | self.handle_events_human() 328 | 329 | current_reward = 0.1 330 | 331 | if self.global_time % self.dist_to_pipe == 0: 332 | self.pipes.append(Pipe( 333 | self.screen, WIDTH, HEIGHT, PIPE_COLOR, self.dist_between_pipes, self.pipe_image_up, self.pipe_image_down)) 334 | 335 | for pipe in self.pipes: 336 | pipe.update() 337 | 338 | if pipe.off_screen(): 339 | self.pipes.remove(pipe) 340 | 341 | if pipe.hits(self.bird): 342 | self.game_over() 343 | current_reward = -1 344 | 345 | if pipe.behind(self.bird): 346 | self.reward += 1 347 | current_reward = 1 348 | 349 | self.bird.handle_events_human() 350 | if self.bird.update(): 351 | self.game_over() 352 | current_reward = -1 353 | 354 | if self.draw: 355 | 356 | self.screen.blit(self.bg, (0, 0)) 357 | for pipe in self.pipes: 358 | pipe.draw() 359 | self.bird.draw(self.reward) 360 | text = pygame.font.SysFont(FONT, 28).render( 361 | "SCORE {}".format(self.reward), True, (0, 0, 0)) 362 | self.screen.blit(text, (565 - text.get_width() // 363 | 2, 30 - text.get_height() // 2)) 364 | pygame.display.flip() 365 | 366 | # if self.draw: 367 | # self.screen.fill(BACKGROUND) 368 | # for pipe in self.pipes: 369 | # pipe.draw() 370 | # self.bird.draw(self.reward) 371 | 372 | obs = self.get_observation_space() 373 | 374 | # if self.draw: 375 | # pygame.display.update() 376 | 377 | self.time_elapsed_since_last_action = 0 378 | # print(current_reward) 379 | 380 | def handle_events_human(self): 381 | for event in pygame.event.get(): 382 | if event.type == pygame.QUIT: 383 | self.is_done = False 384 | pygame.quit() 385 | -------------------------------------------------------------------------------- /flappyb/environment/pipe.py: -------------------------------------------------------------------------------- 1 | import pygame 2 | import numpy as np 3 | 4 | RANDOM_PIPES = [150, 250, 350, 450, 550] 5 | 6 | 7 | class Pipe: 8 | 9 | def __init__(self, screen, s_width, s_height, color, dist_between_pipes=220,pipe_image_up=None, pipe_image_down=None): 10 | 11 | self.pipe_image_up = pipe_image_up # 52x808 12 | self.pipe_image_down = pipe_image_down # 52x808 13 | 14 | self.screen = screen 15 | self.s_width = s_width 16 | self.s_height = s_height 17 | self.color = color 18 | 19 | self.top = 
np.random.choice(RANDOM_PIPES) 20 | self.bot = self.top + dist_between_pipes 21 | 22 | # self.top = random.randrange(120, s_height-370) 23 | # self.bot = self.top + 350 24 | 25 | self.width = 52 26 | self.speed = 3 27 | self.x = s_width 28 | self.within_pipe = False 29 | 30 | def draw(self): 31 | # rect_top = pygame.rect.Rect(self.x, 0, self.width, self.top) 32 | # rect_bot = pygame.rect.Rect(self.x, self.bot, self.width, self.s_height) 33 | # pygame.draw.rect(self.screen, self.color, rect_top) 34 | # pygame.draw.rect(self.screen, self.color, rect_bot) 35 | 36 | if self.top > 320: 37 | pipe_rotated = pygame.transform.rotate(self.pipe_image_up, 180) 38 | self.screen.blit(pipe_rotated, (self.x, self.top - 320)) 39 | pipe_rotated_long = pygame.transform.rotate( 40 | self.pipe_image_down, 180) 41 | self.screen.blit(pipe_rotated_long, (self.x, self.top - 320 - 280)) 42 | else: 43 | pipe_rotated = pygame.transform.rotate(self.pipe_image_up, 180) 44 | self.screen.blit(pipe_rotated, (self.x, self.top - 320)) 45 | 46 | if self.s_height - self.bot > 320: 47 | self.screen.blit(self.pipe_image_up, (self.x, self.bot)) 48 | self.screen.blit(self.pipe_image_down, (self.x, self.bot + 280)) 49 | else: 50 | self.screen.blit(self.pipe_image_up, (self.x, self.bot)) 51 | 52 | def update(self): 53 | self.x -= self.speed 54 | 55 | def hits(self, bird): 56 | if bird.y < self.top or bird.y > self.bot: 57 | if self.x < bird.x < self.x + self.width: 58 | return True 59 | 60 | def behind(self, bird): 61 | if bird.x > self.x + self.width and not self.within_pipe: 62 | self.within_pipe = True 63 | return True 64 | if bird.x < self.x + self.width: 65 | self.within_pipe = False 66 | 67 | def off_screen(self): 68 | return self.x + self.width + 5 < 0 69 | -------------------------------------------------------------------------------- /flappyb/lib/common.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | 7 | 8 | HYPERPARAMS = { 9 | 'flappyb': { 10 | 'stop_reward': 100.0, 11 | 'run_name': 'flappyb', 12 | 'replay_size': 100000, 13 | 'replay_initial': 10000, 14 | 'target_net_sync': 1000, 15 | 'epsilon_frames': 10**5, 16 | 'epsilon_start': 1.0, 17 | 'epsilon_final': 0.02, 18 | 'learning_rate': 0.001, 19 | 'gamma': 0.9, 20 | 'batch_size': 32 21 | } 22 | } 23 | 24 | 25 | def unpack_batch(batch): 26 | states, actions, rewards, dones, last_states = [], [], [], [], [] 27 | for exp in batch: 28 | state = np.array(exp.state, copy=False) 29 | states.append(state) 30 | actions.append(exp.action) 31 | rewards.append(exp.reward) 32 | dones.append(exp.last_state is None) 33 | if exp.last_state is None: 34 | last_states.append(state) # the result will be masked anyway 35 | else: 36 | last_states.append(np.array(exp.last_state, copy=False)) 37 | return np.array(states, copy=False), np.array(actions), np.array(rewards, dtype=np.float32), \ 38 | np.array(dones, dtype=np.uint8), np.array(last_states, copy=False) 39 | 40 | 41 | def calc_loss_dqn(batch, net, tgt_net, gamma, device="cpu"): 42 | states, actions, rewards, dones, next_states = unpack_batch(batch) 43 | 44 | states_v = torch.tensor(states).to(device) 45 | next_states_v = torch.tensor(next_states).to(device) 46 | actions_v = torch.tensor(actions).to(device) 47 | rewards_v = torch.tensor(rewards).to(device) 48 | done_mask = torch.ByteTensor(dones).to(device) 49 | 50 | state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1) 51 | 
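# The gather() call above picks Q(s, a) for the action actually taken in each
# sampled transition; the target network below supplies max_a' Q(s', a') for the
# one-step TD target, terminal next-states are masked to zero, and detach()
# keeps gradients from flowing through the target values.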
next_state_values = tgt_net(next_states_v).max(1)[0] 52 | next_state_values[done_mask] = 0.0 53 | 54 | expected_state_action_values = next_state_values.detach() * gamma + rewards_v 55 | return nn.MSELoss()(state_action_values, expected_state_action_values) 56 | 57 | 58 | class RewardTracker: 59 | def __init__(self, name, net, writer, stop_reward): 60 | self.writer = writer 61 | self.stop_reward = stop_reward 62 | self.net = net 63 | self.name = name 64 | self.best_reward = -1 65 | 66 | def __enter__(self): 67 | self.ts = time.time() 68 | self.ts_frame = 0 69 | self.total_rewards = [] 70 | return self 71 | 72 | def __exit__(self, *args): 73 | if self.writer != None: 74 | self.writer.close() 75 | 76 | def reward(self, reward, frame, epsilon=None): 77 | self.total_rewards.append(reward) 78 | speed = (frame - self.ts_frame) / (time.time() - self.ts) 79 | self.ts_frame = frame 80 | self.ts = time.time() 81 | mean_reward = np.mean(self.total_rewards[-100:]) 82 | epsilon_str = "" if epsilon is None else ", eps %.2f" % epsilon 83 | print("%d: done %d games, mean reward %.3f, speed %.2f f/s%s" % ( 84 | frame, len(self.total_rewards), mean_reward, speed, epsilon_str 85 | )) 86 | sys.stdout.flush() 87 | if self.writer != None: 88 | if epsilon is not None: 89 | self.writer.add_scalar("epsilon", epsilon, frame) 90 | self.writer.add_scalar("speed", speed, frame) 91 | # self.writer.add_scalar("reward_100", mean_reward, frame) 92 | self.writer.add_scalar("reward", reward, frame) 93 | if reward > self.best_reward: 94 | self.best_reward = reward 95 | torch.save(self.net.state_dict(), 'models/' + self.name + str(reward)) 96 | print("\tNew best reward = ", str(reward)) 97 | if mean_reward > self.stop_reward: 98 | print("Solved in %d frames!" % frame) 99 | return True 100 | return False 101 | 102 | 103 | class EpsilonTracker: 104 | def __init__(self, epsilon_greedy_selector, params): 105 | self.epsilon_greedy_selector = epsilon_greedy_selector 106 | self.epsilon_start = params['epsilon_start'] 107 | self.epsilon_final = params['epsilon_final'] 108 | self.epsilon_frames = params['epsilon_frames'] 109 | self.frame(0) 110 | 111 | def frame(self, frame): 112 | self.epsilon_greedy_selector.epsilon = \ 113 | max(self.epsilon_final, self.epsilon_start - frame / self.epsilon_frames) 114 | 115 | 116 | def distr_projection(next_distr, rewards, dones, Vmin, Vmax, n_atoms, gamma): 117 | """ 118 | Perform distribution projection aka Catergorical Algorithm from the 119 | "A Distributional Perspective on RL" paper 120 | """ 121 | batch_size = len(rewards) 122 | proj_distr = np.zeros((batch_size, n_atoms), dtype=np.float32) 123 | delta_z = (Vmax - Vmin) / (n_atoms - 1) 124 | for atom in range(n_atoms): 125 | tz_j = np.minimum(Vmax, np.maximum(Vmin, rewards + (Vmin + atom * delta_z) * gamma)) 126 | b_j = (tz_j - Vmin) / delta_z 127 | l = np.floor(b_j).astype(np.int64) 128 | u = np.ceil(b_j).astype(np.int64) 129 | eq_mask = u == l 130 | proj_distr[eq_mask, l[eq_mask]] += next_distr[eq_mask, atom] 131 | ne_mask = u != l 132 | proj_distr[ne_mask, l[ne_mask]] += next_distr[ne_mask, atom] * (u - b_j)[ne_mask] 133 | proj_distr[ne_mask, u[ne_mask]] += next_distr[ne_mask, atom] * (b_j - l)[ne_mask] 134 | if dones.any(): 135 | proj_distr[dones] = 0.0 136 | tz_j = np.minimum(Vmax, np.maximum(Vmin, rewards[dones])) 137 | b_j = (tz_j - Vmin) / delta_z 138 | l = np.floor(b_j).astype(np.int64) 139 | u = np.ceil(b_j).astype(np.int64) 140 | eq_mask = u == l 141 | eq_dones = dones.copy() 142 | eq_dones[dones] = eq_mask 143 | if eq_dones.any(): 
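# Terminal transitions: the projected target distribution collapses onto the
# atom(s) nearest the clipped reward: all mass goes to a single atom when b_j
# lands exactly on it (eq_mask), otherwise it is split between the lower and
# upper neighbouring atoms in the ne_mask branch below.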
144 | proj_distr[eq_dones, l[eq_mask]] = 1.0 145 | ne_mask = u != l 146 | ne_dones = dones.copy() 147 | ne_dones[dones] = ne_mask 148 | if ne_dones.any(): 149 | proj_distr[ne_dones, l[ne_mask]] = (u - b_j)[ne_mask] 150 | proj_distr[ne_dones, u[ne_mask]] = (b_j - l)[ne_mask] 151 | return proj_distr -------------------------------------------------------------------------------- /flappyb/lib/dqn_model.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | import numpy as np 7 | 8 | 9 | class NoisyLinear(nn.Linear): 10 | def __init__(self, in_features, out_features, sigma_init=0.017, bias=True): 11 | super(NoisyLinear, self).__init__(in_features, out_features, bias=bias) 12 | self.sigma_weight = nn.Parameter(torch.full((out_features, in_features), sigma_init)) 13 | self.register_buffer("epsilon_weight", torch.zeros(out_features, in_features)) 14 | if bias: 15 | self.sigma_bias = nn.Parameter(torch.full((out_features,), sigma_init)) 16 | self.register_buffer("epsilon_bias", torch.zeros(out_features)) 17 | self.reset_parameters() 18 | 19 | def reset_parameters(self): 20 | std = math.sqrt(3 / self.in_features) 21 | self.weight.data.uniform_(-std, std) 22 | self.bias.data.uniform_(-std, std) 23 | 24 | def forward(self, input): 25 | self.epsilon_weight.normal_() 26 | bias = self.bias 27 | if bias is not None: 28 | self.epsilon_bias.normal_() 29 | bias = bias + self.sigma_bias * self.epsilon_bias.data 30 | return F.linear(input, self.weight + self.sigma_weight * self.epsilon_weight.data, bias) 31 | 32 | 33 | class NoisyFactorizedLinear(nn.Linear): 34 | """ 35 | NoisyNet layer with factorized gaussian noise 36 | 37 | N.B. nn.Linear already initializes weight and bias to 38 | """ 39 | def __init__(self, in_features, out_features, sigma_zero=0.4, bias=True): 40 | super(NoisyFactorizedLinear, self).__init__(in_features, out_features, bias=bias) 41 | sigma_init = sigma_zero / math.sqrt(in_features) 42 | self.sigma_weight = nn.Parameter(torch.full((out_features, in_features), sigma_init)) 43 | self.register_buffer("epsilon_input", torch.zeros(1, in_features)) 44 | self.register_buffer("epsilon_output", torch.zeros(out_features, 1)) 45 | if bias: 46 | self.sigma_bias = nn.Parameter(torch.full((out_features,), sigma_init)) 47 | 48 | def forward(self, input): 49 | self.epsilon_input.normal_() 50 | self.epsilon_output.normal_() 51 | 52 | func = lambda x: torch.sign(x) * torch.sqrt(torch.abs(x)) 53 | eps_in = func(self.epsilon_input.data) 54 | eps_out = func(self.epsilon_output.data) 55 | 56 | bias = self.bias 57 | if bias is not None: 58 | bias = bias + self.sigma_bias * eps_out.t() 59 | noise_v = torch.mul(eps_in, eps_out) 60 | return F.linear(input, self.weight + self.sigma_weight * noise_v, bias) 61 | 62 | 63 | class DQN(nn.Module): 64 | def __init__(self, input_shape, n_actions): 65 | super(DQN, self).__init__() 66 | 67 | self.fc = nn.Sequential( 68 | nn.Linear(input_shape, 512), 69 | nn.ReLU(), 70 | nn.Linear(512, n_actions) 71 | ) 72 | 73 | def forward(self, x): 74 | fx = x.float() / 256 75 | return self.fc(fx) -------------------------------------------------------------------------------- /flappyb/lib/dqn_rainbow.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from environment.environment import Environment 3 | import ptan 4 | import argparse 5 | import numpy as np 6 | 7 | import torch 8 | import 
torch.nn as nn 9 | import torch.nn.functional as F 10 | 11 | import torch.optim as optim 12 | 13 | from tensorboardX import SummaryWriter 14 | 15 | import dqn as dqn_model 16 | import common 17 | 18 | # n-step 19 | REWARD_STEPS = 2 20 | 21 | # priority replay 22 | PRIO_REPLAY_ALPHA = 0.6 23 | BETA_START = 0.4 24 | BETA_FRAMES = 100000 25 | 26 | # C51 27 | Vmax = 10 28 | Vmin = -10 29 | N_ATOMS = 51 30 | DELTA_Z = (Vmax - Vmin) / (N_ATOMS - 1) 31 | 32 | 33 | class RainbowDQN(nn.Module): 34 | def __init__(self, input_shape, n_actions): 35 | super(RainbowDQN, self).__init__() 36 | 37 | self.fc_val = nn.Sequential( 38 | dqn_model.NoisyLinear(input_shape[0], 256), 39 | nn.ReLU(), 40 | dqn_model.NoisyLinear(256, N_ATOMS) 41 | ) 42 | 43 | self.fc_adv = nn.Sequential( 44 | dqn_model.NoisyLinear(input_shape[0], 256), 45 | nn.ReLU(), 46 | dqn_model.NoisyLinear(256, n_actions * N_ATOMS) 47 | ) 48 | 49 | self.register_buffer("supports", torch.arange(Vmin, Vmax+DELTA_Z, DELTA_Z)) 50 | self.softmax = nn.Softmax(dim=1) 51 | 52 | def forward(self, x): 53 | batch_size = x.size()[0] 54 | fx = x.float() / 256 55 | val_out = self.fc_val(fx).view(batch_size, 1, N_ATOMS) 56 | adv_out = self.fc_adv(fx).view(batch_size, -1, N_ATOMS) 57 | adv_mean = adv_out.mean(dim=1, keepdim=True) 58 | return val_out + (adv_out - adv_mean) 59 | 60 | def both(self, x): 61 | cat_out = self(x) 62 | probs = self.apply_softmax(cat_out) 63 | weights = probs * self.supports 64 | res = weights.sum(dim=2) 65 | return cat_out, res 66 | 67 | def qvals(self, x): 68 | return self.both(x)[1] 69 | 70 | def apply_softmax(self, t): 71 | return self.softmax(t.view(-1, N_ATOMS)).view(t.size()) 72 | 73 | 74 | def calc_loss(batch, batch_weights, net, tgt_net, gamma, device="cpu"): 75 | states, actions, rewards, dones, next_states = common.unpack_batch(batch) 76 | batch_size = len(batch) 77 | 78 | states_v = torch.tensor(states).to(device) 79 | actions_v = torch.tensor(actions).to(device) 80 | next_states_v = torch.tensor(next_states).to(device) 81 | batch_weights_v = torch.tensor(batch_weights).to(device) 82 | 83 | # next state distribution 84 | # dueling arch -- actions from main net, distr from tgt_net 85 | 86 | # calc at once both next and cur states 87 | distr_v, qvals_v = net.both(torch.cat((states_v, next_states_v))) 88 | next_qvals_v = qvals_v[batch_size:] 89 | distr_v = distr_v[:batch_size] 90 | 91 | next_actions_v = next_qvals_v.max(1)[1] 92 | next_distr_v = tgt_net(next_states_v) 93 | next_best_distr_v = next_distr_v[range(batch_size), next_actions_v.data] 94 | next_best_distr_v = tgt_net.apply_softmax(next_best_distr_v) 95 | next_best_distr = next_best_distr_v.data.cpu().numpy() 96 | 97 | dones = dones.astype(np.bool) 98 | 99 | # project our distribution using Bellman update 100 | proj_distr = common.distr_projection(next_best_distr, rewards, dones, Vmin, Vmax, N_ATOMS, gamma) 101 | 102 | # calculate net output 103 | state_action_values = distr_v[range(batch_size), actions_v.data] 104 | state_log_sm_v = F.log_softmax(state_action_values, dim=1) 105 | proj_distr_v = torch.tensor(proj_distr).to(device) 106 | 107 | loss_v = -state_log_sm_v * proj_distr_v 108 | loss_v = batch_weights_v * loss_v.sum(dim=1) 109 | return loss_v.mean(), loss_v + 1e-5 110 | 111 | 112 | if __name__ == "__main__": 113 | params = common.HYPERPARAMS['pong'] 114 | params['epsilon_frames'] *= 2 115 | parser = argparse.ArgumentParser() 116 | parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda") 117 | args = parser.parse_args() 118 | device 
= torch.device("cuda" if args.cuda else "cpu") 119 | 120 | env = gym.make(params['env_name']) 121 | env = ptan.common.wrappers.wrap_dqn(env) 122 | 123 | writer = SummaryWriter(comment="-" + params['run_name'] + "-rainbow") 124 | net = RainbowDQN(env.observation_space.shape, env.action_space.n).to(device) 125 | tgt_net = ptan.agent.TargetNet(net) 126 | agent = ptan.agent.DQNAgent(lambda x: net.qvals(x), ptan.actions.ArgmaxActionSelector(), device=device) 127 | 128 | exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=REWARD_STEPS) 129 | buffer = ptan.experience.PrioritizedReplayBuffer(exp_source, params['replay_size'], PRIO_REPLAY_ALPHA) 130 | optimizer = optim.Adam(net.parameters(), lr=params['learning_rate']) 131 | 132 | frame_idx = 0 133 | beta = BETA_START 134 | 135 | with common.RewardTracker(writer, params['stop_reward']) as reward_tracker: 136 | while True: 137 | frame_idx += 1 138 | buffer.populate(1) 139 | beta = min(1.0, BETA_START + frame_idx * (1.0 - BETA_START) / BETA_FRAMES) 140 | 141 | new_rewards = exp_source.pop_total_rewards() 142 | if new_rewards: 143 | if reward_tracker.reward(new_rewards[0], frame_idx): 144 | break 145 | 146 | if len(buffer) < params['replay_initial']: 147 | continue 148 | 149 | optimizer.zero_grad() 150 | batch, batch_indices, batch_weights = buffer.sample(params['batch_size'], beta) 151 | loss_v, sample_prios_v = calc_loss(batch, batch_weights, net, tgt_net.target_model, 152 | params['gamma'] ** REWARD_STEPS, device=device) 153 | loss_v.backward() 154 | optimizer.step() 155 | buffer.update_priorities(batch_indices, sample_prios_v.data.cpu().numpy()) 156 | 157 | if frame_idx % params['target_net_sync'] == 0: 158 | tgt_net.sync() 159 | -------------------------------------------------------------------------------- /flappyb/lib/ppo_model.py: -------------------------------------------------------------------------------- 1 | import ptan 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | 6 | HID_SIZE = 64 7 | 8 | 9 | class ModelActor(nn.Module): 10 | def __init__(self, obs_size, act_size): 11 | super(ModelActor, self).__init__() 12 | 13 | self.mu = nn.Sequential( 14 | nn.Linear(obs_size, HID_SIZE), 15 | nn.Tanh(), 16 | nn.Linear(HID_SIZE, HID_SIZE), 17 | nn.Tanh(), 18 | nn.Linear(HID_SIZE, act_size), 19 | nn.Tanh(), 20 | ) 21 | self.logstd = nn.Parameter(torch.zeros(act_size)) 22 | 23 | def forward(self, x): 24 | return self.mu(x) 25 | 26 | 27 | class ModelCritic(nn.Module): 28 | def __init__(self, obs_size): 29 | super(ModelCritic, self).__init__() 30 | 31 | self.value = nn.Sequential( 32 | nn.Linear(obs_size, HID_SIZE), 33 | nn.ReLU(), 34 | nn.Linear(HID_SIZE, HID_SIZE), 35 | nn.ReLU(), 36 | nn.Linear(HID_SIZE, 1), 37 | ) 38 | 39 | def forward(self, x): 40 | return self.value(x) 41 | 42 | 43 | class AgentA2C(ptan.agent.BaseAgent): 44 | def __init__(self, net, device="cpu"): 45 | self.net = net 46 | self.device = device 47 | 48 | def __call__(self, states, agent_states): 49 | states_v = ptan.agent.float32_preprocessor(states).to(self.device) 50 | 51 | mu_v = self.net(states_v) 52 | mu = mu_v.data.cpu().numpy() 53 | logstd = self.net.logstd.data.cpu().numpy() 54 | actions = mu + np.exp(logstd) * np.random.normal(size=logstd.shape) 55 | actions = np.clip(actions, -1, 1) 56 | return actions, agent_states 57 | -------------------------------------------------------------------------------- /flappyb/models/cross_entropy/batchsize=100-hiddensize=256-lr=0.01-gamma=.9-PART=240.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/models/cross_entropy/batchsize=100-hiddensize=256-lr=0.01-gamma=.9-PART=240.pt -------------------------------------------------------------------------------- /flappyb/models/dqn/dqn-expdecay=0.99995-gamma=.9-batchsize=20-nn=512-lr=0.001-normalization-LOADED=HARDCORE-6300-lrMax=0.4-nextPipe-HELL-PART=1000.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/models/dqn/dqn-expdecay=0.99995-gamma=.9-batchsize=20-nn=512-lr=0.001-normalization-LOADED=HARDCORE-6300-lrMax=0.4-nextPipe-HELL-PART=1000.h5 -------------------------------------------------------------------------------- /flappyb/models/dqn/dqn-expdecay=0.99995-gamma=.9-batchsize=20-nn=512-lr=0.001-normalization-PART=6650.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/models/dqn/dqn-expdecay=0.99995-gamma=.9-batchsize=20-nn=512-lr=0.001-normalization-PART=6650.h5 -------------------------------------------------------------------------------- /flappyb/models/dqn/dqn-expdecay=0.999995-gamma=.9-batchsize=20-nn=512-lr=0.001-normalization-HARDCORE-PART=6300.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/models/dqn/dqn-expdecay=0.999995-gamma=.9-batchsize=20-nn=512-lr=0.001-normalization-HARDCORE-PART=6300.h5 -------------------------------------------------------------------------------- /flappyb/models/flappyb-test-the-rainbow254: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/models/flappyb-test-the-rainbow254 -------------------------------------------------------------------------------- /flappyb/models/flappyb-test-the-rainbow350: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/models/flappyb-test-the-rainbow350 -------------------------------------------------------------------------------- /flappyb/models/flappyb-test-the-rainbow87: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/models/flappyb-test-the-rainbow87 -------------------------------------------------------------------------------- /flappyb/play_dqn_rainbow.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from environment.environment import Environment 3 | import time 4 | import numpy as np 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | import collections 10 | 11 | import ptan 12 | from lib import dqn_model 13 | 14 | 15 | MODEL_NAME = "flappyb-test-the-rainbow350" 16 | Vmax = 10 17 | Vmin = -10 18 | N_ATOMS = 51 19 | DELTA_Z = (Vmax - Vmin) / (N_ATOMS - 1) 20 | NUMBER_NEURONS = 512 21 | 22 | 23 | class 
RainbowDQN(nn.Module): 24 | def __init__(self, input_shape, n_actions): 25 | super(RainbowDQN, self).__init__() 26 | 27 | self.fc_val = nn.Sequential( 28 | dqn_model.NoisyLinear(input_shape, NUMBER_NEURONS), 29 | nn.ReLU(), 30 | dqn_model.NoisyLinear(NUMBER_NEURONS, N_ATOMS) 31 | ) 32 | 33 | self.fc_adv = nn.Sequential( 34 | dqn_model.NoisyLinear(input_shape, NUMBER_NEURONS), 35 | nn.ReLU(), 36 | dqn_model.NoisyLinear(NUMBER_NEURONS, n_actions * N_ATOMS) 37 | ) 38 | 39 | self.register_buffer("supports", torch.arange(Vmin, Vmax+DELTA_Z, DELTA_Z)) 40 | self.softmax = nn.Softmax(dim=1) 41 | 42 | def forward(self, x): 43 | batch_size = x.size()[0] 44 | fx = x.float() / NUMBER_NEURONS 45 | val_out = self.fc_val(fx).view(batch_size, 1, N_ATOMS) 46 | adv_out = self.fc_adv(fx).view(batch_size, -1, N_ATOMS) 47 | adv_mean = adv_out.mean(dim=1, keepdim=True) 48 | return val_out + (adv_out - adv_mean) 49 | 50 | def both(self, x): 51 | cat_out = self(x) 52 | probs = self.apply_softmax(cat_out) 53 | weights = probs * self.supports 54 | res = weights.sum(dim=2) 55 | return cat_out, res 56 | 57 | def qvals(self, x): 58 | return self.both(x)[1] 59 | 60 | def apply_softmax(self, t): 61 | return self.softmax(t.view(-1, N_ATOMS)).view(t.size()) 62 | 63 | 64 | if __name__ == "__main__": 65 | 66 | env = Environment(draw=True, fps=1, debug=True, 67 | dist_to_pipe=50, dist_between_pipes=180, obs_this_pipe=True) 68 | 69 | net = RainbowDQN(env.observation_space.n, env.action_space.n) 70 | net.load_state_dict(torch.load("models/" + MODEL_NAME, map_location=lambda storage, loc: storage)) 71 | 72 | agent = ptan.agent.DQNAgent(lambda x: net.qvals(x), ptan.actions.ArgmaxActionSelector()) 73 | 74 | for i in range(10): 75 | state = env.reset() 76 | total_reward = 0.0 77 | c = collections.Counter() 78 | 79 | while True: 80 | start_ts = time.time() 81 | state_v = torch.tensor(np.array([state], copy=False)) 82 | q_vals = agent(state_v) #.data.numpy()[0] 83 | action = q_vals[0][0] 84 | print(action) 85 | 86 | c[action] += 1 87 | state, reward, done, _ = env.step(action) 88 | total_reward += reward 89 | if done: 90 | break 91 | print("Total reward: %.2f" % total_reward) 92 | print("Action counts:", c) 93 | -------------------------------------------------------------------------------- /flappyb/play_ppo.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import ptan 4 | 5 | from lib import ppo_model as model 6 | from environment.environment import Environment 7 | 8 | # MODEL_NAME = "flappyb-test-the-rainbow350" 9 | MODEL_NAME = "best_+131.310_576000.dat" 10 | 11 | env = env = Environment(draw=True, fps=10, debug=False, 12 | dist_to_pipe=50, dist_between_pipes=180, obs_this_pipe=True) 13 | 14 | net_act = model.ModelActor(env.observation_space.n, 15 | env.action_space.n).to("cpu") 16 | net_act.load_state_dict(torch.load("saves/ppo-test-flappyb/" + MODEL_NAME, map_location=lambda storage, loc: storage)) 17 | 18 | rewards = 0.0 19 | steps = 0 20 | for _ in range(5): 21 | obs = env.reset() 22 | while True: 23 | obs_v = ptan.agent.float32_preprocessor([obs]).to("cpu") 24 | mu_v = net_act(obs_v)[0] 25 | action = mu_v.squeeze(dim=0).data.cpu().numpy() 26 | action = np.clip(action, -1, 1) 27 | obs, reward, done, _ = env.step(action) 28 | rewards += reward 29 | steps += 1 30 | if done: 31 | break 32 | -------------------------------------------------------------------------------- /flappyb/play_self.py: 
-------------------------------------------------------------------------------- 1 | from environment.environment import Environment 2 | 3 | 4 | class Agent: 5 | 6 | def __init__(self): 7 | self.total_reward = 0 8 | 9 | def step(self, env): 10 | env.get_observation_space() 11 | action = env.get_action_random() 12 | obs, reward, is_done, _ = env.step(action) 13 | self.total_reward += reward 14 | 15 | 16 | # HUMAN PLAYS 17 | env = Environment(draw=True, fps=20, debug=True, dist_to_pipe=40, 18 | dist_between_pipes=150, obs_this_pipe=True) 19 | env.run_human_game() 20 | 21 | 22 | # RANDOM AGENT 23 | # agent = Agent() 24 | # env = Environment(True, 10) 25 | 26 | # for i in range(10): 27 | # env.reset() 28 | # while not env.is_done: 29 | # agent.step(env) 30 | 31 | # print("Total reward = {}".format(agent.total_reward)) 32 | -------------------------------------------------------------------------------- /flappyb/ppo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import math 4 | import ptan 5 | import time 6 | import gym 7 | import argparse 8 | from tensorboardX import SummaryWriter 9 | 10 | from lib import ppo_model as model 11 | from environment.environment import Environment 12 | 13 | import numpy as np 14 | import torch 15 | import torch.optim as optim 16 | import torch.nn.functional as F 17 | 18 | 19 | GAMMA = 0.99 20 | GAE_LAMBDA = 0.95 21 | 22 | TRAJECTORY_SIZE = 2049 23 | LEARNING_RATE_ACTOR = 1e-4 24 | LEARNING_RATE_CRITIC = 1e-3 25 | 26 | PPO_EPS = 0.2 27 | PPO_EPOCHES = 10 28 | PPO_BATCH_SIZE = 64 29 | 30 | TEST_ITERS = 1000 31 | 32 | 33 | def test_net(net, env, count=10, device="cpu"): 34 | rewards = 0.0 35 | steps = 0 36 | for _ in range(count): 37 | obs = env.reset() 38 | while True: 39 | obs_v = ptan.agent.float32_preprocessor([obs]).to(device) 40 | mu_v = net(obs_v)[0] 41 | action = mu_v.squeeze(dim=0).data.cpu().numpy() 42 | action = np.clip(action, -1, 1) 43 | obs, reward, done, _ = env.step(action) 44 | rewards += reward 45 | steps += 1 46 | if done: 47 | break 48 | return rewards / count, steps / count 49 | 50 | 51 | def calc_logprob(mu_v, logstd_v, actions_v): 52 | p1 = - ((mu_v - actions_v) ** 2) / (2*torch.exp(logstd_v).clamp(min=1e-3)) 53 | p2 = - torch.log(torch.sqrt(2 * math.pi * torch.exp(logstd_v))) 54 | return p1 + p2 55 | 56 | 57 | def calc_adv_ref(trajectory, net_crt, states_v, device="cpu"): 58 | """ 59 | By trajectory calculate advantage and 1-step ref value 60 | :param trajectory: trajectory list 61 | :param net_crt: critic network 62 | :param states_v: states tensor 63 | :return: tuple with advantage numpy array and reference values 64 | """ 65 | values_v = net_crt(states_v) 66 | values = values_v.squeeze().data.cpu().numpy() 67 | # generalized advantage estimator: smoothed version of the advantage 68 | last_gae = 0.0 69 | result_adv = [] 70 | result_ref = [] 71 | for val, next_val, (exp,) in zip(reversed(values[:-1]), reversed(values[1:]), 72 | reversed(trajectory[:-1])): 73 | if exp.done: 74 | delta = exp.reward - val 75 | last_gae = delta 76 | else: 77 | delta = exp.reward + GAMMA * next_val - val 78 | last_gae = delta + GAMMA * GAE_LAMBDA * last_gae 79 | result_adv.append(last_gae) 80 | result_ref.append(last_gae + val) 81 | 82 | adv_v = torch.FloatTensor(list(reversed(result_adv))).to(device) 83 | ref_v = torch.FloatTensor(list(reversed(result_ref))).to(device) 84 | return adv_v, ref_v 85 | 86 | 87 | if __name__ == "__main__": 88 | # parser = argparse.ArgumentParser() 89 | # 
parser.add_argument("--cuda", default=False, action='store_true', help='Enable CUDA') 90 | # parser.add_argument("-n", "--name", required=True, help="Name of the run") 91 | # parser.add_argument("-e", "--env", default=ENV_ID, help="Environment id, default=" + ENV_ID) 92 | # args = parser.parse_args() 93 | 94 | name = "test-flappyb" 95 | 96 | # device = torch.device("cuda" if args.cuda else "cpu") 97 | device = torch.device("cpu") 98 | 99 | save_path = os.path.join("saves", "ppo-" + name) 100 | os.makedirs(save_path, exist_ok=True) 101 | 102 | env = env = Environment(draw=False, fps=1, debug=False, 103 | dist_to_pipe=50, dist_between_pipes=180, obs_this_pipe=True) 104 | test_env = env = Environment(draw=False, fps=1, debug=False, 105 | dist_to_pipe=50, dist_between_pipes=180, obs_this_pipe=True) 106 | 107 | net_act = model.ModelActor(env.observation_space.n, env.action_space.n).to(device) 108 | net_crt = model.ModelCritic(env.observation_space.n).to(device) 109 | print(net_act) 110 | print(net_crt) 111 | 112 | writer = SummaryWriter(comment="-ppo_" + name) 113 | agent = model.AgentA2C(net_act, device=device) 114 | exp_source = ptan.experience.ExperienceSource(env, agent, steps_count=1) 115 | 116 | opt_act = optim.Adam(net_act.parameters(), lr=LEARNING_RATE_ACTOR) 117 | opt_crt = optim.Adam(net_crt.parameters(), lr=LEARNING_RATE_CRITIC) 118 | 119 | trajectory = [] 120 | best_reward = None 121 | with ptan.common.utils.RewardTracker(writer) as tracker: 122 | for step_idx, exp in enumerate(exp_source): 123 | rewards_steps = exp_source.pop_rewards_steps() 124 | if rewards_steps: 125 | rewards, steps = zip(*rewards_steps) 126 | writer.add_scalar("episode_steps", np.mean(steps), step_idx) 127 | tracker.reward(np.mean(rewards), step_idx) 128 | 129 | if step_idx % TEST_ITERS == 0: 130 | ts = time.time() 131 | rewards, steps = test_net(net_act, test_env, device=device) 132 | print("Test done in %.2f sec, reward %.3f, steps %d" % ( 133 | time.time() - ts, rewards, steps)) 134 | writer.add_scalar("test_reward", rewards, step_idx) 135 | writer.add_scalar("test_steps", steps, step_idx) 136 | if best_reward is None or best_reward < rewards: 137 | if best_reward is not None: 138 | print("Best reward updated: %.3f -> %.3f" % (best_reward, rewards)) 139 | name = "best_%+.3f_%d.dat" % (rewards, step_idx) 140 | fname = os.path.join(save_path, name) 141 | torch.save(net_act.state_dict(), fname) 142 | best_reward = rewards 143 | 144 | trajectory.append(exp) 145 | if len(trajectory) < TRAJECTORY_SIZE: 146 | continue 147 | 148 | traj_states = [t[0].state for t in trajectory] 149 | traj_actions = [t[0].action for t in trajectory] 150 | traj_states_v = torch.FloatTensor(traj_states).to(device) 151 | traj_actions_v = torch.FloatTensor(traj_actions).to(device) 152 | traj_adv_v, traj_ref_v = calc_adv_ref(trajectory, net_crt, traj_states_v, device=device) 153 | mu_v = net_act(traj_states_v) 154 | old_logprob_v = calc_logprob(mu_v, net_act.logstd, traj_actions_v) 155 | 156 | # normalize advantages 157 | traj_adv_v = (traj_adv_v - torch.mean(traj_adv_v)) / torch.std(traj_adv_v) 158 | 159 | # drop last entry from the trajectory, an our adv and ref value calculated without it 160 | trajectory = trajectory[:-1] 161 | old_logprob_v = old_logprob_v[:-1].detach() 162 | 163 | sum_loss_value = 0.0 164 | sum_loss_policy = 0.0 165 | count_steps = 0 166 | 167 | for epoch in range(PPO_EPOCHES): 168 | for batch_ofs in range(0, len(trajectory), PPO_BATCH_SIZE): 169 | states_v = traj_states_v[batch_ofs:batch_ofs + PPO_BATCH_SIZE] 170 | 
actions_v = traj_actions_v[batch_ofs:batch_ofs + PPO_BATCH_SIZE] 171 | batch_adv_v = traj_adv_v[batch_ofs:batch_ofs + PPO_BATCH_SIZE].unsqueeze(-1) 172 | batch_ref_v = traj_ref_v[batch_ofs:batch_ofs + PPO_BATCH_SIZE] 173 | batch_old_logprob_v = old_logprob_v[batch_ofs:batch_ofs + PPO_BATCH_SIZE] 174 | 175 | # critic training 176 | opt_crt.zero_grad() 177 | value_v = net_crt(states_v) 178 | loss_value_v = F.mse_loss(value_v.squeeze(-1), batch_ref_v) 179 | loss_value_v.backward() 180 | opt_crt.step() 181 | 182 | # actor training 183 | opt_act.zero_grad() 184 | mu_v = net_act(states_v) 185 | logprob_pi_v = calc_logprob(mu_v, net_act.logstd, actions_v) 186 | ratio_v = torch.exp(logprob_pi_v - batch_old_logprob_v) 187 | surr_obj_v = batch_adv_v * ratio_v 188 | clipped_surr_v = batch_adv_v * torch.clamp(ratio_v, 1.0 - PPO_EPS, 1.0 + PPO_EPS) 189 | loss_policy_v = -torch.min(surr_obj_v, clipped_surr_v).mean() 190 | loss_policy_v.backward() 191 | opt_act.step() 192 | 193 | sum_loss_value += loss_value_v.item() 194 | sum_loss_policy += loss_policy_v.item() 195 | count_steps += 1 196 | 197 | trajectory.clear() 198 | writer.add_scalar("advantage", traj_adv_v.mean().item(), step_idx) 199 | writer.add_scalar("values", traj_ref_v.mean().item(), step_idx) 200 | writer.add_scalar("loss_policy", sum_loss_policy / count_steps, step_idx) 201 | writer.add_scalar("loss_value", sum_loss_value / count_steps, step_idx) 202 | -------------------------------------------------------------------------------- /flappyb/saves/ppo-test-flappyb/best_+10.400_555000.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/saves/ppo-test-flappyb/best_+10.400_555000.dat -------------------------------------------------------------------------------- /flappyb/saves/ppo-test-flappyb/best_+11.270_556000.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/saves/ppo-test-flappyb/best_+11.270_556000.dat -------------------------------------------------------------------------------- /flappyb/saves/ppo-test-flappyb/best_+131.310_576000.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/saves/ppo-test-flappyb/best_+131.310_576000.dat -------------------------------------------------------------------------------- /flappyb/saves/ppo-test-flappyb/best_+20.470_558000.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/saves/ppo-test-flappyb/best_+20.470_558000.dat -------------------------------------------------------------------------------- /flappyb/saves/ppo-test-flappyb/best_+4.650_165000.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/saves/ppo-test-flappyb/best_+4.650_165000.dat -------------------------------------------------------------------------------- /flappyb/saves/ppo-test-flappyb/best_+4.860_370000.dat: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/saves/ppo-test-flappyb/best_+4.860_370000.dat -------------------------------------------------------------------------------- /flappyb/saves/ppo-test-flappyb/best_+44.070_560000.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/saves/ppo-test-flappyb/best_+44.070_560000.dat -------------------------------------------------------------------------------- /flappyb/saves/ppo-test-flappyb/best_+44.560_561000.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/saves/ppo-test-flappyb/best_+44.560_561000.dat -------------------------------------------------------------------------------- /flappyb/saves/ppo-test-flappyb/best_+5.290_475000.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/saves/ppo-test-flappyb/best_+5.290_475000.dat -------------------------------------------------------------------------------- /flappyb/saves/ppo-test-flappyb/best_+5.530_495000.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/saves/ppo-test-flappyb/best_+5.530_495000.dat -------------------------------------------------------------------------------- /flappyb/saves/ppo-test-flappyb/best_+5.740_516000.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/saves/ppo-test-flappyb/best_+5.740_516000.dat -------------------------------------------------------------------------------- /flappyb/saves/ppo-test-flappyb/best_+5.820_538000.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/saves/ppo-test-flappyb/best_+5.820_538000.dat -------------------------------------------------------------------------------- /flappyb/saves/ppo-test-flappyb/best_+56.790_570000.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/saves/ppo-test-flappyb/best_+56.790_570000.dat -------------------------------------------------------------------------------- /flappyb/saves/ppo-test-flappyb/best_+6.250_539000.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/saves/ppo-test-flappyb/best_+6.250_539000.dat -------------------------------------------------------------------------------- /flappyb/saves/ppo-test-flappyb/best_+6.820_542000.dat: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/saves/ppo-test-flappyb/best_+6.820_542000.dat -------------------------------------------------------------------------------- /flappyb/saves/ppo-test-flappyb/best_+7.200_547000.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/saves/ppo-test-flappyb/best_+7.200_547000.dat -------------------------------------------------------------------------------- /flappyb/saves/ppo-test-flappyb/best_+8.690_550000.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VieVaWaldi/ReinforcementLearning/cb1a4ba8ef3bb516874f303ddc449bef2ad989c5/flappyb/saves/ppo-test-flappyb/best_+8.690_550000.dat -------------------------------------------------------------------------------- /old_agents/cross_entropy.py: -------------------------------------------------------------------------------- 1 | # Nice entry, but cross entropy advanced is a whole lot better 2 | 3 | import gym 4 | from collections import namedtuple 5 | import numpy as np 6 | from tensorboardX import SummaryWriter 7 | 8 | import torch 9 | import torch.nn as nn 10 | import torch.optim as optim 11 | 12 | from flappyb.environment import Environment 13 | 14 | NAME = 'cross-entropy-batchsize:16-hiddensize1:128' 15 | WRITE = False 16 | DRAW = True 17 | 18 | HIDDEN_SIZE = 128 # num of neurons in hidden layer 19 | BATCH_SIZE = 16 # number of episodes in a batch 20 | PERCENTILE_THROW_AWAY = 70 # percentage of episodes in batch to not learn from 21 | 22 | 23 | class Net(nn.Module): 24 | def __init__(self, obs_size, hidden_size, n_actions): 25 | super(Net, self).__init__() 26 | self.net = nn.Sequential( 27 | nn.Linear(obs_size, hidden_size), 28 | nn.ReLU(), 29 | nn.Linear(hidden_size, n_actions) 30 | ) 31 | # output is a probability distribution 32 | def forward(self, x): # ... over the actions 33 | return self.net(x) 34 | 35 | 36 | # helpers to represent single steps and episodes from the actor 37 | Episode = namedtuple('Episode', field_names=['reward', 'steps']) 38 | EpisodeStep = namedtuple('EpisodeStep', field_names=['observation', 'action']) 39 | 40 | 41 | def iterate_batches(env, net, batch_size): 42 | 43 | batch = [] 44 | episode_reward = 0.0 45 | episode_steps = [] 46 | obs = env.reset() 47 | sm = nn.Softmax(dim=1) # converts net output (raw action score) 48 | # ... 
to probability distribution 49 | while True: 50 | obs_v = torch.FloatTensor([obs]) # converts observation to tensor 51 | act_probs_v = sm(net(obs_v)) # then generate action probability policy 52 | act_probs = act_probs_v.data.numpy()[0] # convert tensor back to array 53 | 54 | # choose an action according to the available probability 55 | action = np.random.choice(len(act_probs), p=act_probs) 56 | next_obs, reward, is_done, _ = env.step(action) 57 | 58 | # use obs that we started whith in this episode 59 | episode_reward += reward 60 | episode_steps.append(EpisodeStep(observation=obs, action=action)) 61 | 62 | # when episode (one single game) ends 63 | if is_done: 64 | # remember episode steps and clear environment 65 | batch.append(Episode(reward=episode_reward, steps=episode_steps)) 66 | episode_reward = 0.0 67 | episode_steps = [] 68 | next_obs = env.reset() 69 | 70 | # when batch is complete (multiple episodes) pass it to the learning loop 71 | if len(batch) == batch_size: 72 | yield batch 73 | batch = [] 74 | 75 | obs = next_obs 76 | 77 | 78 | def filter_batch(batch, percentile): 79 | rewards = list(map(lambda s: s.reward, batch)) 80 | reward_bound = np.percentile(rewards, percentile) 81 | reward_mean = float(np.mean(rewards)) 82 | 83 | train_obs = [] 84 | train_act = [] 85 | 86 | for example in batch: 87 | if example.reward < reward_bound: 88 | continue # filters episodes 89 | train_obs.extend(map(lambda step: step.observation, example.steps)) 90 | train_act.extend(map(lambda step: step.action, example.steps)) 91 | 92 | train_obs_v = torch.FloatTensor(train_obs) 93 | train_act_v = torch.LongTensor(train_act) 94 | 95 | # return elite episodes as tensors 96 | return train_obs_v, train_act_v, reward_bound, reward_mean 97 | 98 | 99 | if __name__ == "__main__": 100 | 101 | env = Environment(DRAW) 102 | 103 | obs_size = env.get_observation_size() 104 | n_actions = env.get_action_size() 105 | 106 | net = Net(obs_size, HIDDEN_SIZE, n_actions) 107 | 108 | objective = nn.CrossEntropyLoss() # main function to teach net 109 | optimizer = optim.Adam(params=net.parameters(), lr=0.01) 110 | writer = None 111 | if WRITE: 112 | writer = SummaryWriter(comment=NAME) 113 | 114 | # actual training loop 115 | for iter_no, batch in enumerate(iterate_batches(env, net, BATCH_SIZE)): 116 | # filter bad episodes so only the best episodes of a batch remain 117 | obs_v, acts_v, reward_b, reward_m = filter_batch(batch, PERCENTILE_THROW_AWAY) 118 | optimizer.zero_grad() 119 | 120 | action_scores_v = net(obs_v) # pass obs to network again and retreive score 121 | # calculate cross entropy between net output and actions 122 | # ... 
the agent took inorder to learn the good actions 123 | loss_v = objective(action_scores_v, acts_v) # calculate loss function 124 | loss_v.backward() # apply gradient descent (not sure if this statement is correct) 125 | optimizer.step() # optimize network 126 | 127 | print("%d: loss=%.3f, reward_mean=%.1f, reward_bound=%.1f" % (iter_no, loss_v.item(), reward_m, reward_b)) 128 | if WRITE: 129 | writer.add_scalar("reward_mean", reward_m, iter_no) 130 | if iter_no > 1000: 131 | print("500 steps should be sufficient") 132 | break 133 | if WRITE: 134 | writer.close() 135 | -------------------------------------------------------------------------------- /old_agents/cross_entropy_advanced.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple 3 | import numpy as np 4 | from tensorboardX import SummaryWriter 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.optim as optim 9 | 10 | from flappyb.environment import Environment 11 | 12 | HIDDEN_SIZE_1 = 256 13 | BATCH_SIZE = 100 14 | PERCENTILE = 30 15 | LEARNING_RATE = 0.01 16 | GAMMA = .99 17 | 18 | NAME = 'batchsize=100-hiddensize=256-lr=0.01-gamma=.9' 19 | NAME = 'batchsize=100-hiddensize=256-lr=0.01-gamma=.99' 20 | WRITE = False 21 | DRAW = False 22 | SAVE_MODEL = False 23 | 24 | 25 | class Net(nn.Module): 26 | def __init__(self, obs_size, n_actions): 27 | super(Net, self).__init__() 28 | self.net = nn.Sequential( 29 | nn.Linear(obs_size, HIDDEN_SIZE_1), 30 | nn.ReLU(), 31 | nn.Linear(HIDDEN_SIZE_1, HIDDEN_SIZE_1), 32 | nn.ReLU(), 33 | nn.Linear(HIDDEN_SIZE_1, n_actions) 34 | ) 35 | 36 | def forward(self, x): 37 | return self.net(x) 38 | 39 | 40 | Episode = namedtuple('Episode', field_names=['reward', 'steps']) 41 | EpisodeStep = namedtuple('EpisodeStep', field_names=['observation', 'action']) 42 | 43 | 44 | def iterate_batches(env, net, batch_size): 45 | batch = [] 46 | episode_reward = 0.0 47 | episode_steps = [] 48 | obs = env.reset() 49 | sm = nn.Softmax(dim=1) 50 | while True: 51 | obs_v = torch.FloatTensor([obs]) 52 | act_probs_v = sm(net(obs_v)) 53 | act_probs = act_probs_v.data.numpy()[0] 54 | # action = np.random.choice(len(act_probs), p=act_probs) 55 | action = env.get_action_random() 56 | 57 | next_obs, reward, is_done, _ = env.step(action) 58 | episode_reward += reward 59 | episode_steps.append(EpisodeStep(observation=obs, action=action)) 60 | if is_done: 61 | batch.append(Episode(reward=episode_reward, steps=episode_steps)) 62 | episode_reward = 0.0 63 | episode_steps = [] 64 | next_obs = env.reset() 65 | if len(batch) == batch_size: 66 | yield batch 67 | batch = [] 68 | obs = next_obs 69 | 70 | 71 | def filter_batch(batch, percentile): 72 | disc_rewards = list(map(lambda s: s.reward * (GAMMA ** len(s.steps)), batch)) 73 | # disc_rewards = list(map(lambda s: s.reward * (len(s.steps)), batch)) 74 | reward_bound = np.percentile(disc_rewards, percentile) 75 | 76 | train_obs = [] 77 | train_act = [] 78 | elite_batch = [] 79 | for example, discounted_reward in zip(batch, disc_rewards): 80 | if discounted_reward > reward_bound: 81 | train_obs.extend(map(lambda step: step.observation, example.steps)) 82 | train_act.extend(map(lambda step: step.action, example.steps)) 83 | elite_batch.append(example) 84 | 85 | return elite_batch, train_obs, train_act, reward_bound 86 | 87 | 88 | if __name__ == "__main__": 89 | random.seed(12345) 90 | env = Environment(DRAW) # activate save 91 | 92 | obs_size = env.get_observation_size() 93 | n_actions = 
env.get_action_size() 94 | 95 | net = Net(obs_size, n_actions) 96 | net.load_state_dict(torch.load('models/cross_entropy/{}-PART=240.pt'.format(NAME))) 97 | net.eval() 98 | 99 | # torch.save(net.state_dict(), 'models/cross_entropy/{}-PART=0.pt'.format(NAME)) 100 | 101 | objective = nn.CrossEntropyLoss() 102 | optimizer = optim.Adam(params=net.parameters(), lr=LEARNING_RATE) 103 | 104 | writer = None 105 | if WRITE: 106 | writer = SummaryWriter(comment=NAME) 107 | 108 | full_batch = [] 109 | for iter_no, batch in enumerate(iterate_batches(env, net, BATCH_SIZE)): 110 | reward_mean = float(np.mean(list(map(lambda s: s.reward, batch)))) 111 | full_batch, obs, acts, reward_bound = filter_batch(full_batch + batch, PERCENTILE) 112 | if not full_batch: 113 | continue 114 | obs_v = torch.FloatTensor(obs) 115 | acts_v = torch.LongTensor(acts) 116 | full_batch = full_batch[-500:] 117 | 118 | optimizer.zero_grad() 119 | action_scores_v = net(obs_v) 120 | loss_v = objective(action_scores_v, acts_v) 121 | loss_v.backward() 122 | optimizer.step() 123 | print("%d: loss=%.3f, reward_mean=%.3f, reward_bound=%.3f, batch=%d" % ( 124 | iter_no, loss_v.item(), reward_mean, reward_bound, len(full_batch))) 125 | if WRITE: 126 | writer.add_scalar("reward", reward_mean, iter_no) 127 | if (iter_no % 30 == 0) and SAVE_MODEL : 128 | torch.save(net.state_dict(), 'models/cross_entropy/{}-PART={}.pt'.format(NAME, iter_no)) 129 | pass 130 | if iter_no > 10000: 131 | print("That should be enough!") 132 | break 133 | 134 | if WRITE: 135 | writer.close() 136 | -------------------------------------------------------------------------------- /old_agents/dqn_snake_v2.py: -------------------------------------------------------------------------------- 1 | # READ ME PEASE 2 | 3 | # https://towardsdatascience.com/cartpole-introduction-to-reinforcement-learning-ed0eb5b58288 4 | # LOSSS FUNCTIONS: https://medium.com/udacity-pytorch-challengers/a-brief-overview-of-loss-functions-in-pytorch-c0ddb78068f7 5 | # BEST OPTIMIZER: https://medium.com/octavian-ai/which-optimizer-and-learning-rate-should-i-use-for-deep-learning-5acb418f9b2 6 | 7 | import random 8 | import numpy as np 9 | from collections import deque 10 | 11 | import keras 12 | from keras.models import Sequential 13 | from keras.layers import Dense 14 | from keras.optimizers import Adam 15 | from keras.models import load_model 16 | from keras.layers import Dense, Dropout, Activation 17 | 18 | from snake.environment import Environment 19 | 20 | from tensorboardX import SummaryWriter 21 | 22 | GAMMA = 0.9 # try .99 23 | LEARNING_RATE = 0.001 # deafult was 0.001 24 | LEARNING_WITH_DECAY = 0.01 25 | 26 | MEMORY_SIZE = 1000000 27 | BATCH_SIZE = 20 28 | 29 | EXPLORATION_MAX = 1 30 | EXPLORATION_MIN = 0.01 31 | EXPLORATION_DECAY = 0.99995 32 | 33 | ##################################################################################################### 34 | NAME = 'snake_dqn/-expdecay=0.99995-gamma=.9-batchsize=20-nn=512-lr=0.001-normalization' 35 | WRITE = True 36 | DRAW = False 37 | SAVE_MODEL = True 38 | LOAD_NAME = 'snake_dqn/-expdecay=0.99995-gamma=.9-batchsize=20-nn=512-lr=0.001-normalization-PART=0' 39 | ##################################################################################################### 40 | 41 | 42 | 43 | class DQNSolver: 44 | 45 | def __init__(self, observation_space, action_space, model = None): 46 | self.exploration_rate = EXPLORATION_MAX 47 | 48 | self.action_space = action_space 49 | self.memory = deque(maxlen=MEMORY_SIZE) 50 | 51 | if model is None: 52 | 
print('new model') 53 | self.model = Sequential() 54 | self.model.add(Dense(512, input_shape=(observation_space,), activation="relu")) # andere aktivierungs funktion 55 | self.model.add(Dense(512, activation="relu")) 56 | # self.model.add(Dropout(0.85)) 57 | # self.model.add(Dense(512, activation="relu")) 58 | self.model.add(Dense(self.action_space, activation="linear")) # Linear sucks? maybe try softmax 59 | self.model.compile(loss="mse", optimizer=Adam(lr=LEARNING_RATE)) # Try learning rate deacy 60 | # self.model.compile(loss="mse", optimizer=Adam(lr=LEARNING_WITH_DECAY, decay=1e-6)) 61 | else: 62 | print('saved model loaded') 63 | self.model = model 64 | 65 | def remember(self, state, action, reward, next_state, done): 66 | self.memory.append((state, action, reward, next_state, done)) 67 | 68 | def act(self, state, env): 69 | if np.random.rand() < self.exploration_rate: 70 | return env.get_action_random() 71 | q_values = self.model.predict(state) 72 | return np.argmax(q_values[0]) 73 | 74 | def act_free(self, state): 75 | q_values = self.model.predict(state) 76 | return np.argmax(q_values[0]) 77 | 78 | def experience_replay(self): 79 | if len(self.memory) < BATCH_SIZE: 80 | return 81 | batch = random.sample(self.memory, BATCH_SIZE) 82 | for state, action, reward, state_next, terminal in batch: 83 | q_update = reward 84 | if not terminal: 85 | q_update = (reward + GAMMA * np.amax(self.model.predict(state_next)[0])) 86 | q_values = self.model.predict(state) 87 | q_values[0][action] = q_update 88 | self.model.fit(state, q_values, verbose=0) 89 | self.exploration_rate *= EXPLORATION_DECAY 90 | self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate) 91 | 92 | 93 | def learn_snake(): 94 | env = Environment(DRAW, 1, False) 95 | writer = None 96 | if WRITE: 97 | writer = SummaryWriter(comment=NAME) 98 | observation_space = env.get_observation_size_buffer() 99 | action_space = env.get_action_size() 100 | 101 | #model = load_model('models/dqn/newenv/{}.h5'.format(LOAD_NAME)) 102 | dqn_solver = DQNSolver(observation_space, action_space) #, model) 103 | run = 0 104 | if SAVE_MODEL: 105 | name = '{}-PART={}'.format(NAME, run) 106 | dqn_solver.model.save('models/dqn/{}.h5'.format(name)) 107 | while True: 108 | run += 1 109 | state = env.reset() 110 | state = np.reshape(state, [1, observation_space]) 111 | step = 0 112 | reward_score = 0 113 | while True: 114 | step += 1 115 | action = dqn_solver.act(state, env) 116 | state_next, reward, terminal, info = env.step_buffer(action) 117 | reward_score += reward 118 | state_next = np.reshape(state_next, [1, observation_space]) 119 | dqn_solver.remember(state, action, reward, state_next, terminal) 120 | state = state_next 121 | if terminal: 122 | print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(reward_score)) 123 | if WRITE: 124 | writer.add_scalar("reward", reward_score, run) 125 | break 126 | dqn_solver.experience_replay() 127 | if (run % 100 == 0) and SAVE_MODEL: 128 | name = '{}-PART={}'.format(NAME, run) 129 | dqn_solver.model.save('models/dqn/{}.h5'.format(name)) 130 | if WRITE: 131 | writer.close() 132 | 133 | 134 | 135 | def play_snake(): 136 | env = Environment(True, 1, False) 137 | 138 | observation_space = env.get_observation_size_buffer() 139 | action_space = env.get_action_size() 140 | 141 | model = keras.models.load_model('models/dqn/{}.h5'.format(LOAD_NAME)) 142 | dqn_solver = DQNSolver(observation_space, action_space, model) 143 | 144 | for i in range(20): 145 | state = 
env.reset() 146 | state = np.reshape(state, [1, observation_space]) 147 | is_done = False 148 | while not is_done: 149 | action = dqn_solver.act_free(state) 150 | # action = env.get_action_random() 151 | state_next, reward, terminal, info = env.step_buffer(action) 152 | is_done = terminal 153 | state = np.reshape(state_next, [1, observation_space]) 154 | 155 | 156 | 157 | if __name__ == "__main__": 158 | # learn_snake() 159 | play_snake() 160 | 161 | print('Jobe Done!') 162 | -------------------------------------------------------------------------------- /old_agents/q_iteration.py: -------------------------------------------------------------------------------- 1 | # value_iteration does the same but converges faster 2 | 3 | import gym 4 | from flappyb.environment import Environment 5 | import collections 6 | from tensorboardX import SummaryWriter 7 | import random 8 | import numpy as np 9 | 10 | GAMMA = 0.9 11 | TEST_EPISODES = 5 12 | TRAINING_STEPS = 3000 13 | 14 | WRITE = False 15 | DRAW_TRAINING = False 16 | DRAW = False 17 | NAME = 'q-iteration-gamma:0.2-trainingsteps:3000-newenv-roundto:1' 18 | 19 | 20 | class Agent: 21 | def __init__(self): 22 | # self.env = gym.make(ENV_NAME) 23 | self.env = Environment(DRAW_TRAINING) 24 | self.state = self.env.reset() 25 | self.rewards = collections.defaultdict(float) 26 | self.transits = collections.defaultdict(collections.Counter) 27 | self.values = collections.defaultdict(float) 28 | 29 | def play_n_random_steps(self, count): 30 | # rand = random.uniform(0.2, 0.8) # more or less and he does nothing 31 | for _ in range(count): 32 | # if _ % 1000 == 0: 33 | # rand = random.uniform(0.2, 0.8) 34 | # print(rand) 35 | # action = np.random.choice((0, 1), 1, p=(rand, 1 - rand)) 36 | # action = action.item(0) 37 | action = self.env.get_action_random() 38 | new_state, reward, is_done, _ = self.env.step(action) 39 | self.rewards[(self.state, action, new_state)] = reward 40 | self.transits[(self.state, action)][new_state] += 1 41 | self.state = self.env.reset() if is_done else new_state 42 | # print(len(self.transits)) 43 | 44 | def select_action(self, state): 45 | best_action, best_value = None, None 46 | for action in range(self.env.get_action_size()): 47 | action_value = self.values[(state, action)] 48 | if best_value is None or best_value < action_value: 49 | best_value = action_value 50 | best_action = action 51 | return best_action 52 | 53 | def play_episode(self, env): 54 | total_reward = 0.0 55 | state = env.reset() 56 | while True: 57 | action = self.select_action(state) 58 | new_state, reward, is_done, _ = env.step(action) 59 | self.rewards[(state, action, new_state)] = reward 60 | self.transits[(state, action)][new_state] += 1 61 | total_reward += reward 62 | if is_done: 63 | break 64 | state = new_state 65 | return total_reward 66 | 67 | def value_iteration(self): 68 | for state in range(self.env.get_observation_size()): 69 | for action in range(self.env.get_action_size()): 70 | action_value = 0.0 71 | target_counts = self.transits[(state, action)] 72 | total = sum(target_counts.values()) 73 | for tgt_state, count in target_counts.items(): 74 | reward = self.rewards[(state, action, tgt_state)] 75 | best_action = self.select_action(tgt_state) 76 | action_value += (count / total) * (reward + GAMMA * self.values[(tgt_state, best_action)]) 77 | self.values[(state, action)] = action_value 78 | 79 | 80 | if __name__ == "__main__": 81 | test_env = Environment(DRAW) 82 | agent = Agent() 83 | writer = None 84 | if WRITE: 85 | writer = 
SummaryWriter(comment=NAME) 86 | 87 | iter_no = 0 88 | best_reward = 0.0 89 | while True: 90 | iter_no += 1 91 | print('#', iter_no) 92 | agent.play_n_random_steps(TRAINING_STEPS) 93 | agent.value_iteration() 94 | 95 | reward = 0.0 96 | for _ in range(TEST_EPISODES): 97 | reward += agent.play_episode(test_env) 98 | reward /= TEST_EPISODES 99 | if WRITE: 100 | writer.add_scalar("reward", reward, iter_no) 101 | if reward > best_reward: 102 | print("Best reward updated %.3f -> %.3f" % (best_reward, reward)) 103 | best_reward = reward 104 | if reward > 0.80: 105 | print("Solved in %d iterations!" % iter_no) 106 | break 107 | if WRITE: 108 | writer.close() 109 | -------------------------------------------------------------------------------- /old_agents/q_learning.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import collections 3 | from tensorboardX import SummaryWriter 4 | from flappyb.environment import Environment 5 | 6 | GAMMA = 0.9 7 | ALPHA = 0.2 8 | TEST_EPISODES = 20 9 | 10 | NAME = 'q-learning' 11 | WRITE = False 12 | DRAW_TRAINING = False 13 | DRAW = False 14 | 15 | class Agent: 16 | def __init__(self): 17 | self.env = Environment(DRAW_TRAINING) 18 | self.state = self.env.reset() 19 | self.values = collections.defaultdict(float) # less memory wasted, only store q-values 20 | 21 | # get s, a, r ,ns 22 | def sample_env(self): 23 | action = self.env.get_action_random() 24 | old_state = self.state 25 | new_state, reward, is_done, _ = self.env.step(action) 26 | self.state = self.env.reset() if is_done else new_state 27 | return old_state, action, reward, new_state 28 | 29 | # iterate over all action values and return the best one 30 | def best_value_and_action(self, state): 31 | best_value, best_action = None, None 32 | for action in range(self.env.get_action_size()): 33 | action_value = self.values[(state, action)] 34 | if best_value is None or best_value < action_value: 35 | best_value = action_value 36 | best_action = action 37 | return best_value, best_action 38 | 39 | # q-value is calculated for s, a and stored in table 40 | def value_update(self, s, a, r, next_s): 41 | best_v, _ = self.best_value_and_action(next_s) 42 | new_val = r + GAMMA * best_v 43 | old_val = self.values[(s, a)] 44 | self.values[(s, a)] = old_val * (1 - ALPHA) + new_val * ALPHA 45 | 46 | # value table is not altered, only measures agent 47 | def play_episode(self, env): 48 | total_reward = 0.0 49 | state = env.reset() 50 | while True: 51 | _, action = self.best_value_and_action(state) 52 | new_state, reward, is_done, _ = env.step(action) 53 | total_reward += reward 54 | if is_done: 55 | break 56 | state = new_state 57 | return total_reward 58 | 59 | if __name__ == "__main__": 60 | test_env = Environment(DRAW) 61 | agent = Agent() 62 | writer = None 63 | if WRITE: 64 | writer = SummaryWriter(comment=NAME) 65 | iter_no = 0 66 | best_reward = 0.0 67 | while True: 68 | iter_no += 1 69 | print('#', iter_no) 70 | s, a, r, next_s = agent.sample_env() 71 | agent.value_update(s, a, r, next_s) 72 | 73 | reward = 0.0 74 | for _ in range(TEST_EPISODES): 75 | reward += agent.play_episode(test_env) 76 | reward /= TEST_EPISODES 77 | 78 | if WRITE: 79 | writer.add_scalar('reward', reward, iter_no) 80 | if reward > best_reward: 81 | print('Best reward updated %.3f => %.3f' %(best_reward, reward)) 82 | best_reward = reward 83 | if reward > 0.9: 84 | print('Solved in %d iterations' %iter_no) 85 | break 86 | if WRITE: 87 | writer.close() 
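The tabular agents in old_agents (q_iteration.py and q_learning.py above, value_iteration.py below) all revolve around the same Bellman backup over a defaultdict of values; q_learning.py applies it one sampled transition at a time in Agent.value_update. The following is a minimal, self-contained sketch of that update for reference: the names td_update, q_values and n_actions are illustrative and not part of the repo, while GAMMA and ALPHA mirror the constants used in q_learning.py.

import collections

GAMMA = 0.9   # discount factor, as in q_learning.py
ALPHA = 0.2   # blending factor for the running Q estimate, as in q_learning.py

# Q-table keyed by (state, action), same layout as Agent.values
q_values = collections.defaultdict(float)

def td_update(state, action, reward, next_state, n_actions):
    # One tabular Q-learning backup:
    # Q(s,a) <- (1 - ALPHA) * Q(s,a) + ALPHA * (r + GAMMA * max_a' Q(s', a'))
    best_next = max(q_values[(next_state, a)] for a in range(n_actions))
    target = reward + GAMMA * best_next
    q_values[(state, action)] = (1 - ALPHA) * q_values[(state, action)] + ALPHA * target
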
-------------------------------------------------------------------------------- /old_agents/value_iteration.py: -------------------------------------------------------------------------------- 1 | # This sucks for flappy B 2 | # CURRENTLY NO ROUNDING 3 | # saving not needed because it just sucks 4 | # Take this for presentation, not q-learning or q-iteration 5 | 6 | import collections 7 | from tensorboardX import SummaryWriter 8 | from flappyb.environment import Environment 9 | import random 10 | import numpy as np 11 | 12 | GAMMA = .9 13 | TEST_EPISODES = 5 14 | TRAINING_STEPS = 3000 15 | 16 | # NAME = 'gamma=0.9-trainingsteps:3000-rounding=None' 17 | NAME = 'gamma=0.9-trainingsteps:3000-rounding=2' 18 | WRITE = True 19 | DRAW_TRAINING = False 20 | DRAW = False 21 | 22 | 23 | class Agent: 24 | def __init__(self): 25 | self.env = Environment(DRAW_TRAINING) 26 | self.state = self.env.reset() 27 | self.rewards = collections.defaultdict(float) 28 | self.transits = collections.defaultdict(collections.Counter) 29 | self.values = collections.defaultdict(float) 30 | 31 | def play_n_random_steps(self, count): 32 | for _ in range(count): 33 | action = self.env.get_action_random() 34 | new_state, reward, is_done, _ = self.env.step(action) 35 | self.rewards[(self.state, action, new_state)] = reward 36 | self.transits[(self.state, action)][new_state] += 1 37 | self.state = self.env.reset() if is_done else new_state 38 | 39 | def calc_action_value(self, state, action): 40 | target_counts = self.transits[(state, action)] 41 | total = sum(target_counts.values()) 42 | action_value = 0.0 43 | for tgt_state, count in target_counts.items(): 44 | reward = self.rewards[(state, action, tgt_state)] 45 | action_value += (count / total) * (reward + GAMMA * self.values[tgt_state]) 46 | return action_value 47 | 48 | def select_action(self, state): 49 | best_action, best_value = None, None 50 | # for action in range(self.env.action_space.n): 51 | for action in range(self.env.get_action_size()): 52 | action_value = self.calc_action_value(state, action) 53 | if best_value is None or best_value < action_value: 54 | best_value = action_value 55 | best_action = action 56 | return best_action 57 | 58 | def play_episode(self, env): 59 | total_reward = 0.0 60 | state = env.reset() 61 | while True: 62 | action = self.select_action(state) 63 | new_state, reward, is_done, _ = env.step(action) 64 | self.rewards[(state, action, new_state)] = reward 65 | self.transits[(state, action)][new_state] += 1 66 | total_reward += reward 67 | if is_done: 68 | break 69 | state = new_state 70 | return total_reward 71 | 72 | def value_iteration(self): 73 | for state in range(self.env.get_observation_size()): 74 | state_values = [self.calc_action_value(state, action) for action in range(self.env.get_action_size())] 75 | self.values[state] = max(state_values) 76 | 77 | 78 | if __name__ == "__main__": 79 | test_env = Environment(DRAW) 80 | agent = Agent() 81 | writer = None 82 | if WRITE: 83 | writer = SummaryWriter(comment='v_iteration/{}'.format(NAME)) 84 | 85 | iter_no = 0 86 | best_reward = 0.0 87 | while True: 88 | iter_no += 1 89 | print('#', iter_no) 90 | agent.play_n_random_steps(TRAINING_STEPS) 91 | agent.value_iteration() 92 | 93 | reward = 0.0 94 | for _ in range(TEST_EPISODES): 95 | reward += agent.play_episode(test_env) 96 | reward /= TEST_EPISODES 97 | if WRITE: 98 | writer.add_scalar("reward", reward, iter_no) 99 | if reward > best_reward: 100 | print("Best reward updated %.3f -> %.3f" % (best_reward, reward)) 101 | best_reward = 
reward 102 | if reward > 500: 103 | print("Solved in %d iterations!" % iter_no) 104 | break 105 | if WRITE: 106 | writer.close() 107 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.15.4 2 | atari-py==0.1.6 3 | gym==0.10.9 4 | ptan==0.3 5 | opencv-python==3.4.3.18 6 | scipy==1.1.0 7 | torch==0.4.1 8 | torchvision==0.2.1 9 | tensorboard-pytorch==0.7.1 10 | tensorflow==1.12.0 11 | tensorboard==1.12.0 12 | pybullet==2.3.6 13 | matplotlib==3.0.2 14 | pygame 15 | 16 | Some things are missing, sorry 17 | -------------------------------------------------------------------------------- /runTensorBoard: -------------------------------------------------------------------------------- 1 | tensorboard --logdir $1 --host localhost 2 | -------------------------------------------------------------------------------- /snake/base_ppo.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import random 3 | 4 | from environment.environment import SnakeEnvironment 5 | 6 | from stable_baselines.common.vec_env import SubprocVecEnv, DummyVecEnv 7 | from stable_baselines.common.policies import MlpPolicy 8 | from stable_baselines import PPO1 9 | 10 | 11 | if __name__ == "__main__": 12 | 13 | env = SnakeEnvironment(draw=True, fps=100, debug=False, animation=False) 14 | # env = SubprocVecEnv([lambda: env]) 15 | env = DummyVecEnv([lambda: env]) 16 | 17 | # model = PPO1(MlpPolicy, env, verbose=1) 18 | 19 | # model.learn(total_timesteps=500000) 20 | # model.save('models/snake-bastard') 21 | 22 | ############################################################################### 23 | 24 | # env = gym.make('snake-v0') 25 | # # env = DummyVecEnv([lambda: env]) 26 | 27 | # # model = PPO2(MlpPolicy, env, verbose=1) 28 | # # model.load('models/snake-basterd') 29 | 30 | obs = env.reset() 31 | is_done = False 32 | 33 | while not is_done: 34 | # action, _states = model.predict(obs) 35 | action = random.randint(0, 4) 36 | obs, rewards, terminal, info = env.step(action) 37 | is_done = terminal 38 | -------------------------------------------------------------------------------- /snake/env_new/cube.py: -------------------------------------------------------------------------------- 1 | import pygame 2 | 3 | 4 | class Cube(object): 5 | 6 | def __init__(self, pos, rows, w, dirnx=1, dirny=0, color=(255, 0, 0)): 7 | 8 | self.pos = pos 9 | self.dirnx = dirnx 10 | self.dirny = dirny 11 | 12 | self.rows = rows 13 | self.w = w 14 | 15 | self.color = color 16 | 17 | def move(self, dirnx, dirny): 18 | self.dirnx = dirnx 19 | self.dirny = dirny 20 | self.pos = (self.pos[0] + self.dirnx, self.pos[1] + self.dirny) 21 | 22 | def draw(self, surface, eyes=False): 23 | dis = self.w // self.rows 24 | i = self.pos[0] 25 | j = self.pos[1] 26 | 27 | pygame.draw.rect(surface, self.color, (i*dis+1,j*dis+1, dis-2, dis-2)) 28 | if eyes: 29 | centre = dis//2 30 | radius = 3 31 | circleMiddle = (i*dis+centre-radius,j*dis+8) 32 | circleMiddle2 = (i*dis + dis -radius*2, j*dis+8) 33 | pygame.draw.circle(surface, (0,0,0), circleMiddle, radius) 34 | pygame.draw.circle(surface, (0,0,0), circleMiddle2, radius) -------------------------------------------------------------------------------- /snake/env_new/environment.py: -------------------------------------------------------------------------------- 1 | from environment.cube import Cube 2 | from environment.snake import Snake 
3 | 4 | import gym 5 | import pygame 6 | 7 | import numpy as np 8 | import random 9 | import enum 10 | import time 11 | 12 | #import tkinker as tk 13 | #from tkinter import messagebox 14 | 15 | # snake obs 16 | # body = head 0.9, b[0] = 0.8, b[1] = 0.79 ... 17 | 18 | 19 | W = 500 20 | H = 500 21 | BUFFER_SIZE = 1 22 | 23 | 24 | class Actions(enum.Enum): 25 | Up = 0 26 | Right = 1 27 | Down = 2 28 | Left = 3 29 | 30 | 31 | class SnakeEnvironment(gym.Env): 32 | 33 | def __init__(self, draw=True, speed=10000, rows=20, animation=True): 34 | super(SnakeEnvironment, self).__init__() 35 | 36 | self.observation_space = gym.spaces.Discrete(n=rows * rows) 37 | self.action_space = gym.spaces.Discrete(n=len(Actions)) 38 | 39 | self.draw = draw 40 | self.speed = speed 41 | self.rows = rows 42 | self.animation = animation 43 | 44 | self.snake = Snake((255, 0, 0), (2, 2), self.rows, W) 45 | self.snack = Cube(self.random_snack(), self.rows, W, color=(0, 255, 0)) 46 | 47 | self.is_done = False 48 | self.reward = 0 49 | self.step_without_apple = 0 50 | 51 | self.surf = pygame.display.set_mode((W, H)) 52 | self.clock = pygame.time.Clock() 53 | 54 | if draw: 55 | pygame.init() 56 | self.font_game_over = pygame.font.SysFont("ani", 72) 57 | 58 | """ Must alwasy be calles in the beginning. """ 59 | def reset(self): 60 | self.countdown() 61 | 62 | self.snake.reset((2, 2)) 63 | self.snack = Cube(self.random_snack(), self.rows, W, color=(0, 255, 0)) 64 | self.is_done = False 65 | self.reward = 0 66 | self.step_without_apple = 0 67 | 68 | self.surf = pygame.display.set_mode((W, H)) 69 | self.clock = pygame.time.Clock() 70 | 71 | obs, reward, is_done, _ = self.step(1) 72 | 73 | return obs 74 | 75 | def step(self, action): 76 | pygame.time.delay(50) # lower is faster 77 | self.clock.tick(self.speed) # lower is slower 78 | 79 | if isinstance(action, np.ndarray): 80 | idx = -1 81 | highest_idx = 0 82 | highest_val = -1 83 | for i in action: 84 | idx += 1 85 | if i > highest_val: 86 | highest_idx = idx 87 | highest_val = i 88 | action = highest_idx 89 | 90 | current_reward = 0 91 | 92 | self.snake.move_ai(action) 93 | # self.snake.move_human() 94 | 95 | if self.snake.ate_itself(): 96 | current_reward = -1 97 | self.game_over() 98 | 99 | self.step_without_apple += 1 100 | if self.step_without_apple == 250: 101 | self.game_over() 102 | 103 | if self.snake.body[0].pos == self.snack.pos: 104 | self.snake.add_cube() 105 | self.snack = Cube(self.random_snack(), self.rows, W, color=(0, 255, 0)) 106 | self.reward += 1 107 | current_reward = 1 108 | self.step_without_apple = 0 109 | 110 | self.redraw_window() 111 | 112 | obs = self.get_observation_space() 113 | 114 | return obs, current_reward, self.is_done, None 115 | 116 | def get_observation_space(self): 117 | 118 | new_obs = [] 119 | 120 | # create 2d matrix 121 | for i in range(self.rows): 122 | new_obs.append([]) 123 | for j in range(self.rows): 124 | new_obs[i].append(-1) 125 | 126 | # add apple 127 | x_apple = self.snack.pos[0] 128 | y_apple = self.snack.pos[1] 129 | new_obs[y_apple][x_apple] = 1 130 | 131 | # add snake 132 | x_snake = self.snake.head.pos[0] 133 | y_snake = self.snake.head.pos[1] 134 | if x_snake == -1 or x_snake == self.rows: 135 | print('Wtf, this error occured!') 136 | self.game_over() 137 | return 138 | if y_snake == -1 or y_snake == self.rows: 139 | print('Wtf, this error occured!') 140 | self.game_over() 141 | return 142 | new_obs[y_snake][x_snake] = 0.8 143 | 144 | # tail 145 | for i, c in enumerate(self.snake.body): 146 | x_snake = c.pos[0] 147 | 
y_snake = c.pos[1] 148 | 149 | if x_snake == -1 or x_snake == self.rows: 150 | print('Wtf, this error occured!') 151 | self.game_over() 152 | return 153 | if y_snake == -1 or y_snake == self.rows: 154 | print('Wtf, this error occured!') 155 | self.game_over() 156 | return 157 | 158 | new_obs[y_snake][x_snake] = 0.5 159 | 160 | current_obs = [] 161 | for i in new_obs: 162 | for j in i: 163 | current_obs.append(j) 164 | 165 | # cnt = 0 166 | # for i in current_obs: 167 | # cnt += 1 168 | # print(' ', i, ' ', end='') 169 | # if cnt % self.rows == 0: 170 | # print('') 171 | # print('') 172 | 173 | return_obs = np.array(current_obs) 174 | 175 | return return_obs 176 | 177 | def draw_grid(self): 178 | size_btwn = W // self.rows 179 | 180 | x = 0 181 | y = 0 182 | 183 | for i in range(self.rows): 184 | x = x + size_btwn 185 | y = y + size_btwn 186 | 187 | pygame.draw.line(self.surf, (255, 255, 255), (x, 0), (x, W)) 188 | pygame.draw.line(self.surf, (255, 255, 255), (0, y), (W, y)) 189 | 190 | def redraw_window(self): 191 | if not self.draw: 192 | return 193 | 194 | self.surf.fill((0, 0, 0)) 195 | self.draw_grid() 196 | self.snake.draw(self.surf) 197 | self.snack.draw(self.surf) 198 | 199 | pygame.display.update() 200 | 201 | def random_snack(self): 202 | positions = self.snake.body 203 | 204 | while True: 205 | x = random.randrange(self.rows) 206 | y = random.randrange(self.rows) 207 | if len(list(filter(lambda z:z.pos == (x,y), positions))) > 0: 208 | continue 209 | else: 210 | break 211 | return (x,y) 212 | 213 | def countdown(self): 214 | if not self.draw or not self.animation: 215 | return 216 | for _ in range(3, 0, -1): 217 | self.write_text("Start in {}".format(_)) 218 | time.sleep(0.3) 219 | 220 | def game_over(self): 221 | self.is_done = True 222 | if not self.draw or not self.animation: 223 | return 224 | self.write_text("Score {}".format(self.reward)) 225 | time.sleep(1.5) 226 | 227 | def write_text(self, text): 228 | self.redraw_window() 229 | text_start = pygame.font.SysFont('dyuthi', 80). 
\ 230 | render(text, True, (255, 255, 255)) 231 | self.surf.blit(text_start, 232 | (text_start.get_width() // 233 | 2, text_start.get_height() // 2)) 234 | pygame.display.flip() 235 | 236 | def play_human(self): 237 | self.countdown() 238 | 239 | while(not self.is_done): 240 | pygame.time.delay(50) # lower is faster 241 | self.clock.tick(self.speed) # lower is slower 242 | 243 | self.snake.move_human() 244 | 245 | if self.snake.ate_itself(): 246 | self.game_over() 247 | 248 | if self.snake.body[0].pos == self.snack.pos: 249 | self.snake.add_cube() 250 | self.snack = Cube(self.random_snack(), self.rows, W, color=(0, 255, 0)) 251 | self.reward += 1 252 | 253 | self.redraw_window() 254 | self.get_observation_space() 255 | 256 | 257 | if __name__ == "__main__": 258 | env = SnakeEnvironment(draw=True, speed=100, rows=5) 259 | env.play_human() 260 | 261 | 262 | 263 | 264 | ####### 265 | # if self.last_observation == None: 266 | # self.last_observation = current_obs 267 | 268 | # return_obs = [] 269 | 270 | # for i in self.last_observation: 271 | # return_obs.append(i) 272 | # for i in current_obs: 273 | # return_obs.append(i) 274 | 275 | # return_obs = np.array(return_obs) 276 | 277 | # cnt = 0 278 | # for i in return_obs: 279 | # cnt += 1 280 | # print(' ', i, ' ', end='') 281 | # if cnt % 10 == 0: 282 | # print('') 283 | # if cnt % 100 == 0: 284 | # print('') 285 | # print('') 286 | # print('') 287 | 288 | # self.last_observation = current_obs 289 | ####### 290 | -------------------------------------------------------------------------------- /snake/env_new/self_play.py: -------------------------------------------------------------------------------- 1 | # from environment.environment import SnakeEnvironment 2 | 3 | env = SnakeEnvironment(draw=True, speed=100000, rows=5) 4 | 5 | env.reset() 6 | terminal = False 7 | 8 | while not terminal: 9 | action = random.randint(0, 4) 10 | next_state, reward, is_done, _ = env.step(action) 11 | terminal = is_done 12 | -------------------------------------------------------------------------------- /snake/env_new/snake.py: -------------------------------------------------------------------------------- 1 | from environment.cube import Cube 2 | 3 | import pygame 4 | 5 | 6 | class Snake(object): 7 | 8 | body = [] 9 | turns = {} 10 | 11 | def __init__(self, color, pos, rows, w): 12 | self.head = Cube(pos, rows, w) 13 | self.body.append(self.head) 14 | 15 | self.rows = rows 16 | self.w = w 17 | 18 | self.color = color 19 | 20 | self.dirnx = 0 21 | self.dirny = 0 22 | 23 | self.add_cube() 24 | self.add_cube() 25 | 26 | def move_ai(self, action): 27 | x = self.head.pos[0] 28 | y = self.head.pos[1] 29 | 30 | if y == 0 and action == 0: 31 | action = -1 32 | elif x == self.rows -1 and action == 1: 33 | action = -1 34 | elif y == self.rows -1 and action == 2: 35 | action = -1 36 | elif x == 0 and action == 3: 37 | action = -1 38 | 39 | if action == -1: 40 | pass 41 | elif action == 0: 42 | self.dirnx = 0 43 | self.dirny = -1 44 | self.turns[self.head.pos[:]] = [self.dirnx, self.dirny] 45 | elif action == 1: 46 | self.dirnx = 1 47 | self.dirny = 0 48 | self.turns[self.head.pos[:]] = [self.dirnx, self.dirny] 49 | elif action == 2: 50 | self.dirnx = 0 51 | self.dirny = 1 52 | self.turns[self.head.pos[:]] = [self.dirnx, self.dirny] 53 | elif action == 3: 54 | self.dirnx = -1 55 | self.dirny = 0 56 | self.turns[self.head.pos[:]] = [self.dirnx, self.dirny] 57 | 58 | for i, c in enumerate(self.body): 59 | p = c.pos[:] 60 | if p in self.turns: 61 | turn = self.turns[p] 62 | 
c.move(turn[0],turn[1]) 63 | if i == len(self.body)-1: 64 | self.turns.pop(p) 65 | else: 66 | if c.dirnx == -1 and c.pos[0] <= 0: c.pos = (c.rows-1, c.pos[1]) 67 | elif c.dirnx == 1 and c.pos[0] >= c.rows-1: c.pos = (0,c.pos[1]) 68 | elif c.dirny == 1 and c.pos[1] >= c.rows-1: c.pos = (c.pos[0], 0) 69 | elif c.dirny == -1 and c.pos[1] <= 0: c.pos = (c.pos[0],c.rows-1) 70 | else: c.move(c.dirnx,c.dirny) 71 | 72 | def move_human(self): 73 | for event in pygame.event.get(): 74 | if event.type == pygame.QUIT: 75 | pygame.quit() 76 | 77 | keys = pygame.key.get_pressed() 78 | for key in keys: 79 | if keys[pygame.K_UP]: 80 | self.dirnx = 0 81 | self.dirny = -1 82 | self.turns[self.head.pos[:]] = [self.dirnx, self.dirny] 83 | elif keys[pygame.K_RIGHT]: 84 | self.dirnx = 1 85 | self.dirny = 0 86 | self.turns[self.head.pos[:]] = [self.dirnx, self.dirny] 87 | elif keys[pygame.K_DOWN]: 88 | self.dirnx = 0 89 | self.dirny = 1 90 | self.turns[self.head.pos[:]] = [self.dirnx, self.dirny] 91 | elif keys[pygame.K_LEFT]: 92 | self.dirnx = -1 93 | self.dirny = 0 94 | self.turns[self.head.pos[:]] = [self.dirnx, self.dirny] 95 | 96 | for i, c in enumerate(self.body): 97 | p = c.pos[:] 98 | if p in self.turns: 99 | turn = self.turns[p] 100 | c.move(turn[0],turn[1]) 101 | if i == len(self.body)-1: 102 | self.turns.pop(p) 103 | else: 104 | if c.dirnx == -1 and c.pos[0] <= 0: c.pos = (c.rows-1, c.pos[1]) 105 | elif c.dirnx == 1 and c.pos[0] >= c.rows-1: c.pos = (0,c.pos[1]) 106 | elif c.dirny == 1 and c.pos[1] >= c.rows-1: c.pos = (c.pos[0], 0) 107 | elif c.dirny == -1 and c.pos[1] <= 0: c.pos = (c.pos[0],c.rows-1) 108 | else: c.move(c.dirnx,c.dirny) 109 | 110 | def ate_itself(self): 111 | head = True 112 | for i, c in enumerate(self.body): 113 | if self.head.pos == c.pos and not head: 114 | return True 115 | head = False 116 | 117 | def reset(self, pos): 118 | self.head = Cube(pos, self.rows, self.w) 119 | self.body = [] 120 | self.body.append(self.head) 121 | self.turns = {} 122 | self.dirnx = 0 123 | self.dirny = 1 124 | self.add_cube() 125 | self.add_cube() 126 | 127 | def add_cube(self): 128 | tail = self.body[-1] 129 | dx, dy = tail.dirnx, tail.dirny 130 | 131 | if dx == 1 and dy == 0: 132 | self.body.append(Cube((tail.pos[0] -1, tail.pos[1]), self.rows, self.w)) 133 | elif dx == -1 and dy == 0: 134 | self.body.append(Cube((tail.pos[0] +1, tail.pos[1]), self.rows, self.w)) 135 | elif dx == 0 and dy == 1: 136 | self.body.append(Cube((tail.pos[0], tail.pos[1] -1), self.rows, self.w)) 137 | elif dx == 0 and dy == -1: 138 | self.body.append(Cube((tail.pos[0], tail.pos[1] +1), self.rows, self.w)) 139 | 140 | self.body[-1].dirnx = dx 141 | self.body[-1].dirny = dy 142 | 143 | def draw(self, surface): 144 | for i, c in enumerate(self.body): 145 | if i == 0: 146 | c.draw(surface, True) 147 | else: 148 | c.draw(surface) 149 | -------------------------------------------------------------------------------- /snake/environment/apple.py: -------------------------------------------------------------------------------- 1 | 2 | # https://www.youtube.com/watch?v=AaGK-fj-BAM&t=630s 3 | import pygame 4 | import random 5 | 6 | 7 | class Apple: 8 | 9 | def __init__(self, screen, s_width, s_height, color, scale): 10 | 11 | self.screen = screen 12 | self.s_width = s_width 13 | self.s_height = s_height 14 | self.color = color 15 | self.scale = scale 16 | 17 | self.place_apple(None) 18 | 19 | def draw(self): 20 | rect = pygame.rect.Rect(self.x, self.y, self.scale, self.scale) 21 | pygame.draw.rect(self.screen, self.color, rect) 22 | 
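# eat(): if the snake head is on the apple, respawn the apple on a cell not occupied by the tail and return True; otherwise return False.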
23 | def eat(self, snake_x, snake_y, tail): 24 | if self.x == snake_x and self.y == snake_y: 25 | self.place_apple(tail) 26 | return True 27 | return False 28 | 29 | def place_apple(self, tail): 30 | 31 | cols = (self.s_width - self.scale) / self.scale 32 | rows = (self.s_height - self.scale) / self.scale 33 | 34 | rand_x = 0 35 | rand_y = 0 36 | 37 | bad_position = True 38 | 39 | if tail is None: 40 | bad_position = False 41 | rand_x = random.randint(0, cols) 42 | rand_y = random.randint(0, rows) 43 | 44 | while bad_position: 45 | bad_position = False 46 | 47 | rand_x = random.randint(0, cols) 48 | rand_y = random.randint(0, rows) 49 | 50 | for i in tail: 51 | if rand_x == int(i.x / self.scale) and rand_y == int(i.y / self.scale): 52 | bad_position = True 53 | break 54 | 55 | self.x = rand_x * self.scale 56 | self.y = rand_y * self.scale 57 | -------------------------------------------------------------------------------- /snake/environment/environment.py: -------------------------------------------------------------------------------- 1 | # Game was made with the help of https://www.youtube.com/watch?v=cXgA1d_E-jY 2 | import gym 3 | import gym.spaces 4 | import time 5 | import pygame 6 | import random 7 | import enum 8 | 9 | import numpy as np 10 | 11 | from environment.snake import Snake 12 | from environment.apple import Apple 13 | 14 | # AI PARAMETERS ############################################################### 15 | BUFFER_SIZE = 1 16 | OBSERVATION_SIZE = 10 * 10 17 | ACTIONS = [0, 1, 2, 3] 18 | ACTION_SIZE = 4 19 | 20 | # GAME PARAMETERS ############################################################# 21 | SCALE = 60 22 | SCREEN_SIZE = WIDTH, HEIGHT = (600, 600) # for 5*5 go 300*300, 60 23 | # for 10*10 go 600*600, 60 24 | BACKGROUND = (72, 72, 72) 25 | SNAKE_COLOR = (57, 255, 20) 26 | APPPLE_COLOR = (255, 8, 0) 27 | FONT = 'dyuthi' 28 | 29 | """ Rewards 30 | 1. first apple +1 31 | 2. every next apple n+1 32 | 3. hit wall -1 33 | 4. ate self -2 34 | 5. does nothing 0.1 35 | """ 36 | """ Observations 37 | 1. apple +1 38 | 3. snake head = 0.5 39 | 4. every snake body -0.01 40 | 5. 
emtpy cell = -1 41 | """ 42 | """ 43 | Interace: 44 | reset(): resets the whole environment 45 | step(action): performs one action onto the environment 46 | step_buffer(action): performs one action onto the environment, 47 | returns 4 states for experience replay 48 | get_action_random(): obtain an imporoved random action 49 | get_observation_size(): obtain size of observation 50 | get_action_size(): obtain size of action 51 | """ 52 | 53 | 54 | class Actions(enum.Enum): 55 | Up = 0 56 | Right = 1 57 | Down = 2 58 | Left = 3 59 | 60 | 61 | class SnakeEnvironment(gym.Env): 62 | 63 | def __init__(self, draw=True, fps=100, debug=False, animation=False): 64 | 65 | super(SnakeEnvironment, self).__init__() 66 | self.observation_space = gym.spaces.Discrete(n=OBSERVATION_SIZE*BUFFER_SIZE) 67 | self.action_space = gym.spaces.Discrete(n=len(Actions)) 68 | 69 | if draw: 70 | pygame.init() 71 | pygame.display.set_caption('NN Snake') 72 | self.font_game_over = pygame.font.SysFont("ani", 72) 73 | 74 | self.draw = draw 75 | self.fps = fps 76 | self.debug = debug 77 | self.animation = animation 78 | self.screen = pygame.display.set_mode(SCREEN_SIZE) 79 | 80 | self.reward = 0 81 | self.score = 0 82 | self.is_done = False 83 | self.steps_without_apple = 0 84 | 85 | self.current_observation = None 86 | self.last_observation = None 87 | 88 | # ML INTERFACE ############################################################ 89 | def reset(self): 90 | """ Resets the whole environment. Must be called in the beginning. """ 91 | 92 | self.snake = Snake(self.screen, WIDTH, HEIGHT, SNAKE_COLOR, 93 | BACKGROUND, SCALE) 94 | self.apple = Apple(self.screen, WIDTH, HEIGHT, APPPLE_COLOR, SCALE) 95 | 96 | self.reward = 0 97 | self.score = 0 98 | self.is_done = False 99 | self.steps_without_apple = 0 100 | 101 | self.current_observation = None 102 | self.last_observation = None 103 | 104 | obs, reward, is_done, _ = self.step(1) 105 | 106 | if self.draw: 107 | self.countdown() 108 | 109 | return obs 110 | 111 | # The actual game step #################################################### 112 | def step(self, action): 113 | 114 | print(action) 115 | 116 | # if isinstance(action, np.ndarray): 117 | # idx = -1 118 | # highest_idx = 0 119 | # highest_val = -1 120 | # for i in action: 121 | # idx += 1 122 | # if i > highest_val: 123 | # highest_idx = idx 124 | # highest_val = i 125 | # action = highest_idx 126 | 127 | current_reward = 0 128 | 129 | self.snake.handle_events_ai(action) 130 | 131 | if self.apple.eat(self.snake.x, self.snake.y, self.snake.tail): 132 | self.snake.update(True) 133 | self.steps_without_apple = 0 134 | self.score += 1 135 | current_reward = 1 136 | # if self.score == 10: 137 | # current_reward = 1 138 | # else: 139 | # current_reward = self.score / 10 140 | else: 141 | self.snake.update(False) 142 | current_reward = 0.1 143 | self.steps_without_apple += 1 144 | # if self.steps_without_apple > 20: 145 | # current_reward = 0 146 | if self.steps_without_apple > 500: 147 | current_reward = -1 148 | self.game_over() 149 | 150 | if self.snake.check_if_hit_wall(): 151 | current_reward = -1 152 | self.game_over() 153 | 154 | if self.snake.check_if_ate_self(): 155 | current_reward = -1 156 | self.game_over() 157 | 158 | if self.draw: 159 | self.screen.fill(BACKGROUND) 160 | self.snake.draw() 161 | self.apple.draw() 162 | pygame.display.update() 163 | 164 | obs = self.get_observation_space() 165 | time.sleep(self.fps / 1000.0) 166 | 167 | return obs, current_reward, self.is_done, None 168 | 169 | def 
get_observation_space(self): 170 | 171 | new_obs = [] 172 | 173 | # create 2d matrix 174 | for i in range(int(WIDTH / SCALE)): 175 | new_obs.append([]) 176 | for j in range(int(WIDTH / SCALE)): 177 | new_obs[i].append(-1) 178 | 179 | # add apple 180 | x_apple = int(self.apple.x / SCALE) 181 | y_apple = int(self.apple.y / SCALE) 182 | new_obs[y_apple][x_apple] = 1 183 | 184 | # add snake 185 | x_snake = int(self.snake.x / SCALE) 186 | y_snake = int(self.snake.y / SCALE) 187 | new_obs[y_snake][x_snake] = 0.8 188 | 189 | # tail 190 | for i in self.snake.tail: 191 | x_snake = int(i.x / SCALE) 192 | y_snake = int(i.y / SCALE) 193 | new_obs[y_snake][x_snake] = 0.5 194 | 195 | current_obs = [] 196 | for i in new_obs: 197 | for j in i: 198 | current_obs.append(j) 199 | 200 | if self.draw and self.debug: 201 | for i in new_obs: 202 | print(i, '\n') 203 | print('\n') 204 | 205 | return_obs = np.array(current_obs) 206 | 207 | ####### 208 | # if self.last_observation == None: 209 | # self.last_observation = current_obs 210 | 211 | # return_obs = [] 212 | 213 | # for i in self.last_observation: 214 | # return_obs.append(i) 215 | # for i in current_obs: 216 | # return_obs.append(i) 217 | 218 | # return_obs = np.array(return_obs) 219 | 220 | # cnt = 0 221 | # for i in return_obs: 222 | # cnt += 1 223 | # print(' ', i, ' ', end='') 224 | # if cnt % 10 == 0: 225 | # print('') 226 | # if cnt % 100 == 0: 227 | # print('') 228 | # print('') 229 | # print('') 230 | 231 | # self.last_observation = current_obs 232 | ####### 233 | 234 | return return_obs 235 | 236 | def get_action_random(self): 237 | return random.randint(0, 3) 238 | 239 | # HUMAN STUFF ############################################################ 240 | 241 | def reset_human_game(self): 242 | """ Resets the whole environment. Must be called in the beginning. """ 243 | 244 | self.clock = pygame.time.Clock() 245 | self.time_elapsed_since_last_action = 0 246 | self.global_time = 0 247 | 248 | self.screen = pygame.display.set_mode(SCREEN_SIZE) 249 | self.snake = Snake(self.screen, WIDTH, HEIGHT, SNAKE_COLOR, 250 | BACKGROUND, SCALE) 251 | self.apple = Apple(self.screen, WIDTH, HEIGHT, APPPLE_COLOR, SCALE) 252 | 253 | self.reward = 0 254 | self.score = 0 255 | self.is_done = False 256 | self.steps_without_apple = 0 257 | 258 | self.current_observation = None 259 | self.last_observation = None 260 | 261 | if self.draw: 262 | self.countdown() 263 | 264 | def run_human_game(self): 265 | 266 | while not self.is_done: 267 | 268 | self.handle_events_human() 269 | self.snake.handle_events_human() 270 | 271 | if self.apple.eat(self.snake.x, self.snake.y, self.snake.tail): 272 | self.snake.update(True) 273 | else: 274 | self.snake.update(False) 275 | 276 | if self.snake.check_if_hit_wall(): 277 | self.game_over() 278 | 279 | if self.snake.check_if_ate_self(): 280 | self.game_over() 281 | 282 | if self.draw: 283 | self.screen.fill(BACKGROUND) 284 | self.snake.draw() 285 | self.apple.draw() 286 | pygame.display.update() 287 | 288 | time.sleep (self.fps / 1000.0); 289 | 290 | def handle_events_human(self): 291 | for event in pygame.event.get(): 292 | if event.type == pygame.QUIT: 293 | self.is_done = False 294 | pygame.quit() 295 | 296 | def countdown(self): 297 | if not self.animation: 298 | return 299 | for _ in range(3, 0, -1): 300 | self.screen.fill(BACKGROUND) 301 | self.snake.draw() 302 | self.apple.draw() 303 | text_start = pygame.font.SysFont(FONT, 80). 
\ 304 | render("Start in {}".format(_), True, (0, 0, 0)) 305 | self.screen.blit(text_start, 306 | (text_start.get_width() // 307 | 2, text_start.get_height() // 2)) 308 | pygame.display.flip() 309 | time.sleep(0.5) 310 | 311 | def game_over(self): 312 | self.is_done = True 313 | if not self.animation: 314 | return 315 | if self.draw: 316 | text = pygame.font.SysFont(FONT, 28).render( 317 | "Game Over!".format(self.reward), True, (0, 0, 0)) 318 | self.screen.blit(text, (320 - text.get_width() // 319 | 2, 240 - text.get_height() // 2)) 320 | pygame.display.flip() 321 | time.sleep(0.5) 322 | 323 | 324 | 325 | 326 | 327 | 328 | # if self.last_observation == None: 329 | # self.current_observation = current_obs 330 | 331 | # self.last_observation = self.current_observation 332 | # self.current_observation = current_obs 333 | 334 | # return_obs = [] 335 | 336 | # for i in self.last_observation: 337 | # return_obs.append(i) 338 | 339 | # for i in self.current_observation: 340 | # return_obs.append(i) 341 | 342 | # current_obs = np.array(current_obs) 343 | 344 | # for i in range(25): 345 | # if i%5==0: 346 | # print('') 347 | # print(' ' , self.last_observation[i] , ' ' , end='') 348 | 349 | # print('') 350 | # for i in range(25): 351 | # if i%5==0: 352 | # print('') 353 | # print(' ' ,self.current_observation[i], ' ' , end='') 354 | -------------------------------------------------------------------------------- /snake/environment/snake.py: -------------------------------------------------------------------------------- 1 | # https://www.youtube.com/watch?v=AaGK-fj-BAM&t=630s 2 | import pygame 3 | 4 | 5 | class Snake: 6 | 7 | def __init__(self, screen, s_width, s_height, color, body_color, scale): 8 | 9 | self.screen = screen 10 | self.s_width = s_width 11 | self.s_height = s_height 12 | self.color = color 13 | self.body_color = body_color 14 | self.scale = scale 15 | 16 | self.scale = scale 17 | 18 | self.x = 2 * scale 19 | self.y = 2 * scale 20 | 21 | self.x_speed = 1 22 | self.y_speed = 0 23 | 24 | self.tail = [Vector(self.x, self.y)] 25 | 26 | def handle_events_human(self): 27 | keys = pygame.key.get_pressed() 28 | if keys[pygame.K_UP]: 29 | self.move(0, -1) 30 | if keys[pygame.K_RIGHT]: 31 | self.move(1, 0) 32 | if keys[pygame.K_DOWN]: 33 | self.move(0, 1) 34 | if keys[pygame.K_LEFT]: 35 | self.move(-1, 0) 36 | 37 | def handle_events_ai(self, action): 38 | # print(action) 39 | if action == 0: 40 | self.move(0, -1) 41 | if action == 1: 42 | self.move(1, 0) 43 | if action == 2: 44 | self.move(0, 1) 45 | if action == 3: 46 | self.move(-1, 0) 47 | 48 | def draw(self): 49 | 50 | for i in self.tail: 51 | rect = pygame.rect.Rect( 52 | i.x + 1, i.y + 1, self.scale - 2, self.scale - 2) 53 | pygame.draw.rect(self.screen, self.color, rect) 54 | rect = pygame.rect.Rect( 55 | i.x + 16, i.y + 16, self.scale - 32, self.scale - 32) 56 | pygame.draw.rect(self.screen, self.body_color, rect) 57 | 58 | rect = pygame.rect.Rect( 59 | self.x, self.y, self.scale, self.scale) 60 | pygame.draw.rect(self.screen, self.color, rect) 61 | 62 | def update(self, ate_apple): 63 | 64 | length = len(self.tail) 65 | 66 | if ate_apple: 67 | self.tail.append(Vector(self.x, self.y)) 68 | else: 69 | for i in range(length - 1): 70 | self.tail[i] = self.tail[i + 1] 71 | self.tail[length - 1] = Vector(self.x, self.y) 72 | 73 | self.x = self.x + self.x_speed * self.scale 74 | self.y = self.y + self.y_speed * self.scale 75 | 76 | if self.x < 0: 77 | self.x = 0 78 | if self.x > self.s_width - self.scale: 79 | self.x = self.s_width - 
self.scale 80 | if self.y < 0: 81 | self.y = 0 82 | if self.y > self.s_height - self.scale: 83 | self.y = self.s_height - self.scale 84 | 85 | def move(self, x, y): 86 | self.x_speed = x 87 | self.y_speed = y 88 | 89 | def check_if_hit_wall(self): 90 | if self.x == -1: 91 | return True 92 | if self.x == self.s_width: 93 | return True 94 | if self.y == -1: 95 | return True 96 | if self.y == self.s_height: 97 | return True 98 | 99 | def check_if_ate_self(self): 100 | for i in self.tail: 101 | if (self.x == i.x) and (self.y == i.y): 102 | return True 103 | 104 | 105 | class Vector: 106 | 107 | def __init__(self, x, y): 108 | self.x = x 109 | self.y = y 110 | -------------------------------------------------------------------------------- /snake/lib/common.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | 7 | 8 | HYPERPARAMS = { 9 | 'snake': { 10 | 'stop_reward': 40.0, 11 | 'run_name': 'snake', 12 | 'replay_size': 1000000, 13 | 'replay_initial': 100000, 14 | 'target_net_sync': 1000, 15 | 'epsilon_frames': 10**6, 16 | 'epsilon_start': 1.0, 17 | 'epsilon_final': 0.02, 18 | 'learning_rate': 0.002, 19 | 'gamma': 0.9, 20 | 'batch_size': 32 21 | } 22 | } 23 | 24 | 25 | def unpack_batch(batch): 26 | states, actions, rewards, dones, last_states = [], [], [], [], [] 27 | for exp in batch: 28 | state = np.array(exp.state, copy=False) 29 | states.append(state) 30 | actions.append(exp.action) 31 | rewards.append(exp.reward) 32 | dones.append(exp.last_state is None) 33 | if exp.last_state is None: 34 | last_states.append(state) # the result will be masked anyway 35 | else: 36 | last_states.append(np.array(exp.last_state, copy=False)) 37 | return np.array(states, copy=False), np.array(actions), np.array(rewards, dtype=np.float32), \ 38 | np.array(dones, dtype=np.uint8), np.array(last_states, copy=False) 39 | 40 | 41 | def calc_loss_dqn(batch, net, tgt_net, gamma, device="cpu"): 42 | states, actions, rewards, dones, next_states = unpack_batch(batch) 43 | 44 | states_v = torch.tensor(states).to(device) 45 | next_states_v = torch.tensor(next_states).to(device) 46 | actions_v = torch.tensor(actions).to(device) 47 | rewards_v = torch.tensor(rewards).to(device) 48 | done_mask = torch.ByteTensor(dones).to(device) 49 | 50 | state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1) 51 | next_state_values = tgt_net(next_states_v).max(1)[0] 52 | next_state_values[done_mask] = 0.0 53 | 54 | expected_state_action_values = next_state_values.detach() * gamma + rewards_v 55 | return nn.MSELoss()(state_action_values, expected_state_action_values) 56 | 57 | 58 | class RewardTracker: 59 | def __init__(self, net, writer, stop_reward): 60 | self.writer = writer 61 | self.stop_reward = stop_reward 62 | self.net = net 63 | self.best_reward = -1 64 | 65 | def __enter__(self): 66 | self.ts = time.time() 67 | self.ts_frame = 0 68 | self.total_rewards = [] 69 | return self 70 | 71 | def __exit__(self, *args): 72 | if self.writer != None: 73 | self.writer.close() 74 | 75 | def reward(self, reward, frame, epsilon=None): 76 | self.total_rewards.append(reward) 77 | speed = (frame - self.ts_frame) / (time.time() - self.ts) 78 | self.ts_frame = frame 79 | self.ts = time.time() 80 | mean_reward = np.mean(self.total_rewards[-100:]) 81 | epsilon_str = "" if epsilon is None else ", eps %.2f" % epsilon 82 | print("%d: done %d games, mean reward %.3f, speed %.2f f/s%s" % ( 83 | frame, 
len(self.total_rewards), mean_reward, speed, epsilon_str 84 | )) 85 | sys.stdout.flush() 86 | if self.writer != None: 87 | if epsilon is not None: 88 | self.writer.add_scalar("epsilon", epsilon, frame) 89 | self.writer.add_scalar("speed", speed, frame) 90 | self.writer.add_scalar("reward_100", mean_reward, frame) 91 | self.writer.add_scalar("reward", reward, frame) 92 | if reward > self.best_reward: 93 | self.best_reward = reward 94 | torch.save(self.net.state_dict(), 'models/best-snake-model-' + str(int(reward))) 95 | print("\tNew best reward = ", str(reward)) 96 | if mean_reward > self.stop_reward: 97 | print("Solved in %d frames!" % frame) 98 | return True 99 | return False 100 | 101 | 102 | class EpsilonTracker: 103 | def __init__(self, epsilon_greedy_selector, params): 104 | self.epsilon_greedy_selector = epsilon_greedy_selector 105 | self.epsilon_start = params['epsilon_start'] 106 | self.epsilon_final = params['epsilon_final'] 107 | self.epsilon_frames = params['epsilon_frames'] 108 | self.frame(0) 109 | 110 | def frame(self, frame): 111 | self.epsilon_greedy_selector.epsilon = \ 112 | max(self.epsilon_final, self.epsilon_start - frame / self.epsilon_frames) 113 | 114 | 115 | def distr_projection(next_distr, rewards, dones, Vmin, Vmax, n_atoms, gamma): 116 | """ 117 | Perform distribution projection aka Catergorical Algorithm from the 118 | "A Distributional Perspective on RL" paper 119 | """ 120 | batch_size = len(rewards) 121 | proj_distr = np.zeros((batch_size, n_atoms), dtype=np.float32) 122 | delta_z = (Vmax - Vmin) / (n_atoms - 1) 123 | for atom in range(n_atoms): 124 | tz_j = np.minimum(Vmax, np.maximum(Vmin, rewards + (Vmin + atom * delta_z) * gamma)) 125 | b_j = (tz_j - Vmin) / delta_z 126 | l = np.floor(b_j).astype(np.int64) 127 | u = np.ceil(b_j).astype(np.int64) 128 | eq_mask = u == l 129 | proj_distr[eq_mask, l[eq_mask]] += next_distr[eq_mask, atom] 130 | ne_mask = u != l 131 | proj_distr[ne_mask, l[ne_mask]] += next_distr[ne_mask, atom] * (u - b_j)[ne_mask] 132 | proj_distr[ne_mask, u[ne_mask]] += next_distr[ne_mask, atom] * (b_j - l)[ne_mask] 133 | if dones.any(): 134 | proj_distr[dones] = 0.0 135 | tz_j = np.minimum(Vmax, np.maximum(Vmin, rewards[dones])) 136 | b_j = (tz_j - Vmin) / delta_z 137 | l = np.floor(b_j).astype(np.int64) 138 | u = np.ceil(b_j).astype(np.int64) 139 | eq_mask = u == l 140 | eq_dones = dones.copy() 141 | eq_dones[dones] = eq_mask 142 | if eq_dones.any(): 143 | proj_distr[eq_dones, l[eq_mask]] = 1.0 144 | ne_mask = u != l 145 | ne_dones = dones.copy() 146 | ne_dones[dones] = ne_mask 147 | if ne_dones.any(): 148 | proj_distr[ne_dones, l[ne_mask]] = (u - b_j)[ne_mask] 149 | proj_distr[ne_dones, u[ne_mask]] = (b_j - l)[ne_mask] 150 | return proj_distr -------------------------------------------------------------------------------- /snake/lib/dqn_model.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | import numpy as np 7 | 8 | 9 | class NoisyLinear(nn.Linear): 10 | def __init__(self, in_features, out_features, sigma_init=0.017, bias=True): 11 | super(NoisyLinear, self).__init__(in_features, out_features, bias=bias) 12 | self.sigma_weight = nn.Parameter(torch.full((out_features, in_features), sigma_init)) 13 | self.register_buffer("epsilon_weight", torch.zeros(out_features, in_features)) 14 | if bias: 15 | self.sigma_bias = nn.Parameter(torch.full((out_features,), sigma_init)) 16 | 
self.register_buffer("epsilon_bias", torch.zeros(out_features)) 17 | self.reset_parameters() 18 | 19 | def reset_parameters(self): 20 | std = math.sqrt(3 / self.in_features) 21 | self.weight.data.uniform_(-std, std) 22 | self.bias.data.uniform_(-std, std) 23 | 24 | def forward(self, input): 25 | self.epsilon_weight.normal_() 26 | bias = self.bias 27 | if bias is not None: 28 | self.epsilon_bias.normal_() 29 | bias = bias + self.sigma_bias * self.epsilon_bias.data 30 | return F.linear(input, self.weight + self.sigma_weight * self.epsilon_weight.data, bias) 31 | 32 | 33 | class NoisyFactorizedLinear(nn.Linear): 34 | """ 35 | NoisyNet layer with factorized gaussian noise 36 | 37 | N.B. nn.Linear already initializes weight and bias to 38 | """ 39 | def __init__(self, in_features, out_features, sigma_zero=0.4, bias=True): 40 | super(NoisyFactorizedLinear, self).__init__(in_features, out_features, bias=bias) 41 | sigma_init = sigma_zero / math.sqrt(in_features) 42 | self.sigma_weight = nn.Parameter(torch.full((out_features, in_features), sigma_init)) 43 | self.register_buffer("epsilon_input", torch.zeros(1, in_features)) 44 | self.register_buffer("epsilon_output", torch.zeros(out_features, 1)) 45 | if bias: 46 | self.sigma_bias = nn.Parameter(torch.full((out_features,), sigma_init)) 47 | 48 | def forward(self, input): 49 | self.epsilon_input.normal_() 50 | self.epsilon_output.normal_() 51 | 52 | func = lambda x: torch.sign(x) * torch.sqrt(torch.abs(x)) 53 | eps_in = func(self.epsilon_input.data) 54 | eps_out = func(self.epsilon_output.data) 55 | 56 | bias = self.bias 57 | if bias is not None: 58 | bias = bias + self.sigma_bias * eps_out.t() 59 | noise_v = torch.mul(eps_in, eps_out) 60 | return F.linear(input, self.weight + self.sigma_weight * noise_v, bias) 61 | 62 | 63 | class DQN(nn.Module): 64 | def __init__(self, input_shape, n_actions): 65 | super(DQN, self).__init__() 66 | 67 | self.fc = nn.Sequential( 68 | nn.Linear(input_shape, 512), 69 | nn.ReLU(), 70 | nn.Linear(512, n_actions) 71 | ) 72 | 73 | def forward(self, x): 74 | fx = x.float() / 256 75 | return self.fc(fx) -------------------------------------------------------------------------------- /snake/lib/dqn_rainbow.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from environment.environment import Environment 3 | import ptan 4 | import argparse 5 | import numpy as np 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | 11 | import torch.optim as optim 12 | 13 | from tensorboardX import SummaryWriter 14 | 15 | import dqn as dqn_model 16 | import common 17 | 18 | # n-step 19 | REWARD_STEPS = 2 20 | 21 | # priority replay 22 | PRIO_REPLAY_ALPHA = 0.6 23 | BETA_START = 0.4 24 | BETA_FRAMES = 100000 25 | 26 | # C51 27 | Vmax = 10 28 | Vmin = -10 29 | N_ATOMS = 51 30 | DELTA_Z = (Vmax - Vmin) / (N_ATOMS - 1) 31 | 32 | 33 | class RainbowDQN(nn.Module): 34 | def __init__(self, input_shape, n_actions): 35 | super(RainbowDQN, self).__init__() 36 | 37 | self.fc_val = nn.Sequential( 38 | dqn_model.NoisyLinear(input_shape[0], 256), 39 | nn.ReLU(), 40 | dqn_model.NoisyLinear(256, N_ATOMS) 41 | ) 42 | 43 | self.fc_adv = nn.Sequential( 44 | dqn_model.NoisyLinear(input_shape[0], 256), 45 | nn.ReLU(), 46 | dqn_model.NoisyLinear(256, n_actions * N_ATOMS) 47 | ) 48 | 49 | self.register_buffer("supports", torch.arange(Vmin, Vmax+DELTA_Z, DELTA_Z)) 50 | self.softmax = nn.Softmax(dim=1) 51 | 52 | def forward(self, x): 53 | batch_size = x.size()[0] 54 | fx 
= x.float() / 256 55 | val_out = self.fc_val(fx).view(batch_size, 1, N_ATOMS) 56 | adv_out = self.fc_adv(fx).view(batch_size, -1, N_ATOMS) 57 | adv_mean = adv_out.mean(dim=1, keepdim=True) 58 | return val_out + (adv_out - adv_mean) 59 | 60 | def both(self, x): 61 | cat_out = self(x) 62 | probs = self.apply_softmax(cat_out) 63 | weights = probs * self.supports 64 | res = weights.sum(dim=2) 65 | return cat_out, res 66 | 67 | def qvals(self, x): 68 | return self.both(x)[1] 69 | 70 | def apply_softmax(self, t): 71 | return self.softmax(t.view(-1, N_ATOMS)).view(t.size()) 72 | 73 | 74 | def calc_loss(batch, batch_weights, net, tgt_net, gamma, device="cpu"): 75 | states, actions, rewards, dones, next_states = common.unpack_batch(batch) 76 | batch_size = len(batch) 77 | 78 | states_v = torch.tensor(states).to(device) 79 | actions_v = torch.tensor(actions).to(device) 80 | next_states_v = torch.tensor(next_states).to(device) 81 | batch_weights_v = torch.tensor(batch_weights).to(device) 82 | 83 | # next state distribution 84 | # dueling arch -- actions from main net, distr from tgt_net 85 | 86 | # calc at once both next and cur states 87 | distr_v, qvals_v = net.both(torch.cat((states_v, next_states_v))) 88 | next_qvals_v = qvals_v[batch_size:] 89 | distr_v = distr_v[:batch_size] 90 | 91 | next_actions_v = next_qvals_v.max(1)[1] 92 | next_distr_v = tgt_net(next_states_v) 93 | next_best_distr_v = next_distr_v[range(batch_size), next_actions_v.data] 94 | next_best_distr_v = tgt_net.apply_softmax(next_best_distr_v) 95 | next_best_distr = next_best_distr_v.data.cpu().numpy() 96 | 97 | dones = dones.astype(np.bool) 98 | 99 | # project our distribution using Bellman update 100 | proj_distr = common.distr_projection(next_best_distr, rewards, dones, Vmin, Vmax, N_ATOMS, gamma) 101 | 102 | # calculate net output 103 | state_action_values = distr_v[range(batch_size), actions_v.data] 104 | state_log_sm_v = F.log_softmax(state_action_values, dim=1) 105 | proj_distr_v = torch.tensor(proj_distr).to(device) 106 | 107 | loss_v = -state_log_sm_v * proj_distr_v 108 | loss_v = batch_weights_v * loss_v.sum(dim=1) 109 | return loss_v.mean(), loss_v + 1e-5 110 | 111 | 112 | if __name__ == "__main__": 113 | params = common.HYPERPARAMS['pong'] 114 | params['epsilon_frames'] *= 2 115 | parser = argparse.ArgumentParser() 116 | parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda") 117 | args = parser.parse_args() 118 | device = torch.device("cuda" if args.cuda else "cpu") 119 | 120 | env = gym.make(params['env_name']) 121 | env = ptan.common.wrappers.wrap_dqn(env) 122 | 123 | writer = SummaryWriter(comment="-" + params['run_name'] + "-rainbow") 124 | net = RainbowDQN(env.observation_space.shape, env.action_space.n).to(device) 125 | tgt_net = ptan.agent.TargetNet(net) 126 | agent = ptan.agent.DQNAgent(lambda x: net.qvals(x), ptan.actions.ArgmaxActionSelector(), device=device) 127 | 128 | exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=REWARD_STEPS) 129 | buffer = ptan.experience.PrioritizedReplayBuffer(exp_source, params['replay_size'], PRIO_REPLAY_ALPHA) 130 | optimizer = optim.Adam(net.parameters(), lr=params['learning_rate']) 131 | 132 | frame_idx = 0 133 | beta = BETA_START 134 | 135 | with common.RewardTracker(writer, params['stop_reward']) as reward_tracker: 136 | while True: 137 | frame_idx += 1 138 | buffer.populate(1) 139 | beta = min(1.0, BETA_START + frame_idx * (1.0 - BETA_START) / BETA_FRAMES) 140 | 141 | new_rewards = 
exp_source.pop_total_rewards() 142 | if new_rewards: 143 | if reward_tracker.reward(new_rewards[0], frame_idx): 144 | break 145 | 146 | if len(buffer) < params['replay_initial']: 147 | continue 148 | 149 | optimizer.zero_grad() 150 | batch, batch_indices, batch_weights = buffer.sample(params['batch_size'], beta) 151 | loss_v, sample_prios_v = calc_loss(batch, batch_weights, net, tgt_net.target_model, 152 | params['gamma'] ** REWARD_STEPS, device=device) 153 | loss_v.backward() 154 | optimizer.step() 155 | buffer.update_priorities(batch_indices, sample_prios_v.data.cpu().numpy()) 156 | 157 | if frame_idx % params['target_net_sync'] == 0: 158 | tgt_net.sync() 159 | -------------------------------------------------------------------------------- /snake/lib/ppo_model.py: -------------------------------------------------------------------------------- 1 | import ptan 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | 6 | HID_SIZE = 1024 7 | 8 | 9 | class ModelActor(nn.Module): 10 | def __init__(self, obs_size, act_size): 11 | super(ModelActor, self).__init__() 12 | 13 | self.mu = nn.Sequential( 14 | nn.Linear(obs_size, HID_SIZE), 15 | nn.Tanh(), 16 | nn.Linear(HID_SIZE, HID_SIZE), 17 | nn.Tanh(), 18 | nn.Linear(HID_SIZE, act_size), 19 | nn.Tanh(), 20 | ) 21 | self.logstd = nn.Parameter(torch.zeros(act_size)) 22 | 23 | def forward(self, x): 24 | return self.mu(x) 25 | 26 | 27 | class ModelCritic(nn.Module): 28 | def __init__(self, obs_size): 29 | super(ModelCritic, self).__init__() 30 | 31 | self.value = nn.Sequential( 32 | nn.Linear(obs_size, HID_SIZE), 33 | nn.ReLU(), 34 | nn.Linear(HID_SIZE, HID_SIZE), 35 | nn.ReLU(), 36 | nn.Linear(HID_SIZE, 1), 37 | ) 38 | 39 | def forward(self, x): 40 | return self.value(x) 41 | 42 | 43 | class AgentA2C(ptan.agent.BaseAgent): 44 | def __init__(self, net, device="cpu"): 45 | self.net = net 46 | self.device = device 47 | 48 | def __call__(self, states, agent_states): 49 | states_v = ptan.agent.float32_preprocessor(states).to(self.device) 50 | 51 | mu_v = self.net(states_v) 52 | mu = mu_v.data.cpu().numpy() 53 | logstd = self.net.logstd.data.cpu().numpy() 54 | actions = mu + np.exp(logstd) * np.random.normal(size=logstd.shape) 55 | actions = np.clip(actions, -1, 1) 56 | return actions, agent_states 57 | -------------------------------------------------------------------------------- /snake/play_ppo.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import ptan 4 | 5 | from lib import ppo_model as model 6 | from environment.environment import SnakeEnvironment 7 | 8 | MODEL_NAME = "best_+1.000_153000.dat" 9 | 10 | env = SnakeEnvironment(draw=True, speed=15, rows=5, animation=True) 11 | 12 | net_act = model.ModelActor(env.observation_space.n, 13 | env.action_space.n).to("cpu") 14 | net_act.load_state_dict(torch.load("saves/ppo-test-snake/" + MODEL_NAME, map_location=lambda storage, loc: storage)) 15 | 16 | rewards = 0.0 17 | steps = 0 18 | for _ in range(5): 19 | obs = env.reset() 20 | while True: 21 | obs_v = ptan.agent.float32_preprocessor([obs]).to("cpu") 22 | mu_v = net_act(obs_v)[0] 23 | action = mu_v.squeeze(dim=0).data.cpu().numpy() 24 | action = np.clip(action, -1, 1) 25 | obs, reward, done, _ = env.step(action) 26 | rewards += reward 27 | steps += 1 28 | if done: 29 | break 30 | -------------------------------------------------------------------------------- /snake/ppo.py: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env python3 2 | import os 3 | import math 4 | import ptan 5 | import time 6 | import gym 7 | import argparse 8 | from tensorboardX import SummaryWriter 9 | 10 | from lib import ppo_model as model 11 | from environment.environment import SnakeEnvironment 12 | 13 | import numpy as np 14 | import torch 15 | import torch.optim as optim 16 | import torch.nn.functional as F 17 | 18 | ENV = SnakeEnvironment(draw=True, fps=100, debug=False, animation=False) 19 | TEST_ENV = SnakeEnvironment( draw=False, fps=100, debug=False, animation=False) 20 | 21 | GAMMA = 0.99 22 | GAE_LAMBDA = 0.95 23 | 24 | TRAJECTORY_SIZE = 2049 25 | LEARNING_RATE_ACTOR = 1e-3 26 | LEARNING_RATE_CRITIC = 1e-2 27 | 28 | PPO_EPS = 0.2 29 | PPO_EPOCHES = 10 30 | PPO_BATCH_SIZE = 64 31 | 32 | TEST_ITERS = 1000 33 | 34 | 35 | def test_net(net, env, count=10, device="cpu"): 36 | rewards = 0.0 37 | steps = 0 38 | for _ in range(count): 39 | obs = env.reset() 40 | while True: 41 | obs_v = ptan.agent.float32_preprocessor([obs]).to(device) 42 | mu_v = net(obs_v)[0] 43 | action = mu_v.squeeze(dim=0).data.cpu().numpy() 44 | action = np.clip(action, -1, 1) 45 | obs, reward, done, _ = env.step(action) 46 | rewards += reward 47 | steps += 1 48 | if done: 49 | break 50 | return rewards / count, steps / count 51 | 52 | 53 | def calc_logprob(mu_v, logstd_v, actions_v): 54 | p1 = - ((mu_v - actions_v) ** 2) / (2*torch.exp(logstd_v).clamp(min=1e-3)) 55 | p2 = - torch.log(torch.sqrt(2 * math.pi * torch.exp(logstd_v))) 56 | return p1 + p2 57 | 58 | 59 | def calc_adv_ref(trajectory, net_crt, states_v, device="cpu"): 60 | """ 61 | By trajectory calculate advantage and 1-step ref value 62 | :param trajectory: trajectory list 63 | :param net_crt: critic network 64 | :param states_v: states tensor 65 | :return: tuple with advantage numpy array and reference values 66 | """ 67 | values_v = net_crt(states_v) 68 | values = values_v.squeeze().data.cpu().numpy() 69 | # generalized advantage estimator: smoothed version of the advantage 70 | last_gae = 0.0 71 | result_adv = [] 72 | result_ref = [] 73 | for val, next_val, (exp,) in zip(reversed(values[:-1]), reversed(values[1:]), 74 | reversed(trajectory[:-1])): 75 | if exp.done: 76 | delta = exp.reward - val 77 | last_gae = delta 78 | else: 79 | delta = exp.reward + GAMMA * next_val - val 80 | last_gae = delta + GAMMA * GAE_LAMBDA * last_gae 81 | result_adv.append(last_gae) 82 | result_ref.append(last_gae + val) 83 | 84 | adv_v = torch.FloatTensor(list(reversed(result_adv))).to(device) 85 | ref_v = torch.FloatTensor(list(reversed(result_ref))).to(device) 86 | return adv_v, ref_v 87 | 88 | 89 | if __name__ == "__main__": 90 | # parser = argparse.ArgumentParser() 91 | # parser.add_argument("--cuda", default=False, action='store_true', help='Enable CUDA') 92 | # parser.add_argument("-n", "--name", required=True, help="Name of the run") 93 | # parser.add_argument("-e", "--env", default=ENV_ID, help="Environment id, default=" + ENV_ID) 94 | # args = parser.parse_args() 95 | 96 | name = "test-snake" 97 | 98 | # device = torch.device("cuda" if args.cuda else "cpu") 99 | device = torch.device("cpu") 100 | 101 | save_path = os.path.join("saves", "ppo-" + name) 102 | os.makedirs(save_path, exist_ok=True) 103 | 104 | env = ENV 105 | test_env = TEST_ENV 106 | 107 | net_act = model.ModelActor(env.observation_space.n, env.action_space.n).to(device) 108 | net_crt = model.ModelCritic(env.observation_space.n).to(device) 109 | print(net_act) 110 | print(net_crt) 111 | 112 | writer = SummaryWriter(comment="-ppo_" + 
name) 113 | agent = model.AgentA2C(net_act, device=device) 114 | exp_source = ptan.experience.ExperienceSource(env, agent, steps_count=1) 115 | 116 | opt_act = optim.Adam(net_act.parameters(), lr=LEARNING_RATE_ACTOR) 117 | opt_crt = optim.Adam(net_crt.parameters(), lr=LEARNING_RATE_CRITIC) 118 | 119 | trajectory = [] 120 | best_reward = None 121 | with ptan.common.utils.RewardTracker(writer) as tracker: 122 | for step_idx, exp in enumerate(exp_source): 123 | rewards_steps = exp_source.pop_rewards_steps() 124 | if rewards_steps: 125 | rewards, steps = zip(*rewards_steps) 126 | writer.add_scalar("episode_steps", np.mean(steps), step_idx) 127 | tracker.reward(np.mean(rewards), step_idx) 128 | 129 | if step_idx % TEST_ITERS == 0: 130 | ts = time.time() 131 | rewards, steps = test_net(net_act, test_env, device=device) 132 | print("Test done in %.2f sec, reward %.3f, steps %d" % ( 133 | time.time() - ts, rewards, steps)) 134 | writer.add_scalar("test_reward", rewards, step_idx) 135 | writer.add_scalar("test_steps", steps, step_idx) 136 | if best_reward is None or best_reward < rewards: 137 | if best_reward is not None: 138 | print("Best reward updated: %.3f -> %.3f" % (best_reward, rewards)) 139 | name = "best_%+.3f_%d.dat" % (rewards, step_idx) 140 | fname = os.path.join(save_path, name) 141 | torch.save(net_act.state_dict(), fname) 142 | best_reward = rewards 143 | 144 | trajectory.append(exp) 145 | if len(trajectory) < TRAJECTORY_SIZE: 146 | continue 147 | 148 | traj_states = [t[0].state for t in trajectory] 149 | traj_actions = [t[0].action for t in trajectory] 150 | traj_states_v = torch.FloatTensor(traj_states).to(device) 151 | traj_actions_v = torch.FloatTensor(traj_actions).to(device) 152 | traj_adv_v, traj_ref_v = calc_adv_ref(trajectory, net_crt, traj_states_v, device=device) 153 | mu_v = net_act(traj_states_v) 154 | old_logprob_v = calc_logprob(mu_v, net_act.logstd, traj_actions_v) 155 | 156 | # normalize advantages 157 | traj_adv_v = (traj_adv_v - torch.mean(traj_adv_v)) / torch.std(traj_adv_v) 158 | 159 | # drop last entry from the trajectory, an our adv and ref value calculated without it 160 | trajectory = trajectory[:-1] 161 | old_logprob_v = old_logprob_v[:-1].detach() 162 | 163 | sum_loss_value = 0.0 164 | sum_loss_policy = 0.0 165 | count_steps = 0 166 | 167 | for epoch in range(PPO_EPOCHES): 168 | for batch_ofs in range(0, len(trajectory), PPO_BATCH_SIZE): 169 | states_v = traj_states_v[batch_ofs:batch_ofs + PPO_BATCH_SIZE] 170 | actions_v = traj_actions_v[batch_ofs:batch_ofs + PPO_BATCH_SIZE] 171 | batch_adv_v = traj_adv_v[batch_ofs:batch_ofs + PPO_BATCH_SIZE].unsqueeze(-1) 172 | batch_ref_v = traj_ref_v[batch_ofs:batch_ofs + PPO_BATCH_SIZE] 173 | batch_old_logprob_v = old_logprob_v[batch_ofs:batch_ofs + PPO_BATCH_SIZE] 174 | 175 | # critic training 176 | opt_crt.zero_grad() 177 | value_v = net_crt(states_v) 178 | loss_value_v = F.mse_loss(value_v.squeeze(-1), batch_ref_v) 179 | loss_value_v.backward() 180 | opt_crt.step() 181 | 182 | # actor training 183 | opt_act.zero_grad() 184 | mu_v = net_act(states_v) 185 | logprob_pi_v = calc_logprob(mu_v, net_act.logstd, actions_v) 186 | ratio_v = torch.exp(logprob_pi_v - batch_old_logprob_v) 187 | surr_obj_v = batch_adv_v * ratio_v 188 | clipped_surr_v = batch_adv_v * torch.clamp(ratio_v, 1.0 - PPO_EPS, 1.0 + PPO_EPS) 189 | loss_policy_v = -torch.min(surr_obj_v, clipped_surr_v).mean() 190 | loss_policy_v.backward() 191 | opt_act.step() 192 | 193 | sum_loss_value += loss_value_v.item() 194 | sum_loss_policy += loss_policy_v.item() 
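                    # Running totals: loss_value_v is the critic's MSE against the
                    # GAE-based reference values from calc_adv_ref, and loss_policy_v
                    # is the negated clipped PPO surrogate (ratio of new to old action
                    # log-probabilities, clipped to [1 - PPO_EPS, 1 + PPO_EPS]). Both
                    # are averaged over all minibatches and epochs below before being
                    # written to TensorBoard.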
195 | count_steps += 1 196 | 197 | trajectory.clear() 198 | writer.add_scalar("advantage", traj_adv_v.mean().item(), step_idx) 199 | writer.add_scalar("values", traj_ref_v.mean().item(), step_idx) 200 | writer.add_scalar("loss_policy", sum_loss_policy / count_steps, step_idx) 201 | writer.add_scalar("loss_value", sum_loss_value / count_steps, step_idx) 202 | -------------------------------------------------------------------------------- /snake/self_play.py: -------------------------------------------------------------------------------- 1 | from environment.environment import SnakeEnvironment 2 | import random 3 | 4 | env = SnakeEnvironment(draw=True, speed=100, rows=5, animation=False) 5 | 6 | # env.play_human() 7 | 8 | while True: 9 | env.reset() 10 | terminal = False 11 | while not terminal: 12 | action = random.randint(0, 4) 13 | next_state, reward, is_done, _ = env.step(action) 14 | terminal = is_done 15 | --------------------------------------------------------------------------------
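
The __main__ block of snake/lib/dqn_rainbow.py still carries the Atari template it was adapted from: it looks up common.HYPERPARAMS['pong'] although common.py only defines a 'snake' entry, it calls gym.make() without importing gym or defining params['env_name'], it imports "dqn" as dqn_model although the module in lib/ is named dqn_model.py, and it constructs common.RewardTracker(writer, stop_reward) although that class takes (net, writer, stop_reward). A minimal sketch of the setup those lines appear to intend follows; the SnakeEnvironment import, its constructor arguments, and the use of observation_space.n / action_space.n are assumptions borrowed from snake/ppo.py and play_ppo.py, not something dqn_rainbow.py itself confirms.

# Sketch only: a possible replacement for the setup portion of the __main__ block in
# snake/lib/dqn_rainbow.py. RainbowDQN, REWARD_STEPS and PRIO_REPLAY_ALPHA are the
# class and constants already defined earlier in that same file.
import torch
import torch.optim as optim
import ptan
from tensorboardX import SummaryWriter

import common
import dqn_model                                       # the module in lib/ is dqn_model.py, not dqn.py
from environment.environment import SnakeEnvironment   # assumed import, mirroring snake/ppo.py

params = common.HYPERPARAMS['snake']                   # the only entry defined in common.py
device = torch.device("cpu")

# Constructor arguments assumed from snake/ppo.py; adjust to the actual environment signature.
env = SnakeEnvironment(draw=False, fps=100, debug=False, animation=False)

writer = SummaryWriter(comment="-" + params['run_name'] + "-rainbow")
# RainbowDQN indexes input_shape[0], so the observation size is passed as a 1-tuple.
net = RainbowDQN((env.observation_space.n,), env.action_space.n).to(device)
tgt_net = ptan.agent.TargetNet(net)
agent = ptan.agent.DQNAgent(lambda x: net.qvals(x), ptan.actions.ArgmaxActionSelector(), device=device)

exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'],
                                                       steps_count=REWARD_STEPS)
buffer = ptan.experience.PrioritizedReplayBuffer(exp_source, params['replay_size'], PRIO_REPLAY_ALPHA)
optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])

# RewardTracker in common.py takes the network first so it can checkpoint new best models.
with common.RewardTracker(net, writer, params['stop_reward']) as reward_tracker:
    pass  # the training loop from dqn_rainbow.py would continue here unchanged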
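
The categorical (C51) projection in snake/lib/common.py is the easiest piece to exercise in isolation. A small self-contained check — assuming only the distr_projection signature shown above and that it is run from the snake/ directory, matching that package's import style — projects uniform next-state distributions through one Bellman update and verifies that every projected row is still a probability distribution:

import numpy as np
from lib.common import distr_projection   # assumed to be run from the snake/ directory

Vmin, Vmax, N_ATOMS, GAMMA = -10, 10, 51, 0.9   # values used in dqn_rainbow.py / HYPERPARAMS

batch_size = 4
next_distr = np.full((batch_size, N_ATOMS), 1.0 / N_ATOMS, dtype=np.float32)  # uniform distributions
rewards = np.array([0.0, 1.0, -1.0, 5.0], dtype=np.float32)
dones = np.array([False, False, True, False])   # third transition is terminal

proj = distr_projection(next_distr, rewards, dones, Vmin, Vmax, N_ATOMS, GAMMA)

assert proj.shape == (batch_size, N_ATOMS)
# Each projected row must still sum to 1 (the projection only moves mass between atoms).
assert np.allclose(proj.sum(axis=1), 1.0, atol=1e-5)

# A terminal transition collapses all mass onto the atoms bracketing its clipped reward:
# -1.0 lies midway between the atoms -1.2 (index 22) and -0.8 (index 23), so each gets 0.5.
print(proj[2].argmax(), proj[2][proj[2] > 0])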
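
calc_logprob in snake/ppo.py supplies the Gaussian log-densities for both the old and the new policy in the PPO ratio. Note that its formula uses torch.exp(logstd_v) where the Gaussian density has the variance (both in the quadratic term and under the square root), so the parameter effectively acts as a log-variance there, whereas AgentA2C in lib/ppo_model.py samples with np.exp(logstd) as the standard deviation; at the initial logstd = 0 the two readings coincide. A self-contained check against torch.distributions.Normal under the log-variance reading (the function body is copied verbatim from ppo.py so the snippet stands alone):

import math
import torch

def calc_logprob(mu_v, logstd_v, actions_v):   # copied verbatim from snake/ppo.py
    p1 = - ((mu_v - actions_v) ** 2) / (2*torch.exp(logstd_v).clamp(min=1e-3))
    p2 = - torch.log(torch.sqrt(2 * math.pi * torch.exp(logstd_v)))
    return p1 + p2

mu = torch.tensor([[0.2, -0.4]])
logstd = torch.tensor([0.5, -0.3])             # read as log-variance by calc_logprob
actions = torch.tensor([[0.5, -0.1]])

ours = calc_logprob(mu, logstd, actions)
# torch.distributions.Normal takes a standard deviation, i.e. the square root of exp(logstd).
ref = torch.distributions.Normal(mu, torch.exp(0.5 * logstd)).log_prob(actions)
print(torch.allclose(ours, ref, atol=1e-5))    # True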