├── .DS_Store ├── .gitattributes ├── 9781484251263.jpg ├── Contributing.md ├── LICENSE.txt ├── README.md ├── __init__.py ├── _vizdoom.ini ├── algorithms ├── __init__.py ├── __init__.pyc ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── actor_critic_utilities.cpython-36.pyc │ ├── dql_utilities.cpython-36.pyc │ ├── order_book_data.cpython-36.pyc │ └── trading.cpython-36.pyc ├── actor_critic_utilities.py ├── distributions.py ├── distributions.pyc ├── dql_utilities.py ├── dql_utilities.pyc ├── order_book_data.py ├── order_book_data.pyc ├── policy_gradient_utilities.py ├── sarsa_algorithm.py ├── trading.py └── trading.pyc ├── chapter1 ├── __init__.py ├── __init__.pyc ├── __pycache__ │ ├── __init__.cpython-36.pyc │ └── open_ai_gym_example.cpython-36.pyc └── open_ai_gym_example.py ├── chapter2 ├── .DS_Store ├── __init__.py ├── __init__.pyc ├── __pycache__ │ ├── __init__.cpython-36.pyc │ └── super_mario_example.cpython-36.pyc ├── cart_pole_example.py ├── cart_pole_example.pyc └── super_mario_example.py ├── chapter3 ├── __init__.py ├── __init__.pyc ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── doom_example.cpython-36.pyc │ └── frozen_lake_example.cpython-36.pyc ├── basic.cfg ├── basic.wad ├── doom_example.py └── frozen_lake_example.py ├── chapter4 ├── .DS_Store ├── __init__.py ├── __init__.pyc ├── __pycache__ │ ├── __init__.cpython-36.pyc │ └── market_making_example.cpython-36.pyc └── market_making_example.py ├── chapter5 ├── .DS_Store ├── __init__.py ├── create_environment.py └── sonic_example.py ├── errata.md ├── neural_networks ├── Figure_1-1.png ├── __init__.py ├── __init__.pyc ├── __pycache__ │ ├── __init__.cpython-36.pyc │ └── models.cpython-36.pyc ├── gym_utilities.py ├── gym_utilities.pyc ├── market_making_models.py ├── market_making_models.pyc ├── models.py ├── models.pyc ├── policy_gradient_utilities.py ├── policy_gradient_utilities.pyc └── untitled4.py └── requirements.txt /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/.DS_Store -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /9781484251263.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/9781484251263.jpg -------------------------------------------------------------------------------- /Contributing.md: -------------------------------------------------------------------------------- 1 | # Contributing to Apress Source Code 2 | 3 | Copyright for Apress source code belongs to the author(s). However, under fair use you are encouraged to fork and contribute minor corrections and updates for the benefit of the author(s) and other readers. 4 | 5 | ## How to Contribute 6 | 7 | 1. Make sure you have a GitHub account. 8 | 2. Fork the repository for the relevant book. 9 | 3. Create a new branch on which to make your change, e.g. 10 | `git checkout -b my_code_contribution` 11 | 4. Commit your change. Include a commit message describing the correction. 
Please note that if your commit message is not clear, the correction will not be accepted. 12 | 5. Submit a pull request. 13 | 14 | Thank you for your contribution! -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Freeware License, some rights reserved 2 | 3 | Copyright (c) 2019 Taweh Beysolow 4 | 5 | Permission is hereby granted, free of charge, to anyone obtaining a copy 6 | of this software and associated documentation files (the "Software"), 7 | to work with the Software within the limits of freeware distribution and fair use. 8 | This includes the rights to use, copy, and modify the Software for personal use. 9 | Users are also allowed and encouraged to submit corrections and modifications 10 | to the Software for the benefit of other users. 11 | 12 | It is not allowed to reuse, modify, or redistribute the Software for 13 | commercial use in any way, or for a user’s educational materials such as books 14 | or blog articles without prior permission from the copyright holder. 15 | 16 | The above copyright notice and this permission notice need to be included 17 | in all copies or substantial portions of the software. 18 | 19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | AUTHORS OR COPYRIGHT HOLDERS OR APRESS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | SOFTWARE. 26 | 27 | 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Apress Source Code 2 | 3 | This repository accompanies [*Applied Reinforcement Learning with Python*](https://www.apress.com/9781484251263) by Taweh Beysolow (Apress, 2019). 4 | 5 | [comment]: #cover 6 | ![Cover image](9781484251263.jpg) 7 | 8 | Download the files as a zip using the green button, or clone the repository to your machine using Git. 9 | 10 | ## Releases 11 | 12 | Release v1.0 corresponds to the code in the published book, without corrections or updates. 13 | 14 | ## Contributions 15 | 16 | See the file Contributing.md for more information on how you can contribute to this repository. -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | -------------------------------------------------------------------------------- /_vizdoom.ini: -------------------------------------------------------------------------------- 1 | # This file was generated by ViZDoom 1.1.7 (ZDOOM 2.8.1) on Fri Mar 22 14:26:42 2019 2 | 3 | # These are the directories to automatically search for IWADs. 4 | # Each directory should be on a separate line, preceded by Path= 5 | [IWADSearch.Directories] 6 | Path=. 
7 | Path=$DOOMWADDIR 8 | Path=/Users/tawehbeysolow/Documents/_vizdoom 9 | Path=/Users/tawehbeysolow/Library/Application Support/_vizdoom 10 | Path=$PROGDIR 11 | Path=/Library/Application Support/_vizdoom 12 | 13 | # These are the directories to search for wads added with the -file 14 | # command line parameter, if they cannot be found with the path 15 | # as-is. Layout is the same as for IWADSearch.Directories 16 | [FileSearch.Directories] 17 | Path=$PROGDIR 18 | Path=/Library/Application Support/_vizdoom 19 | Path=$DOOMWADDIR 20 | 21 | # Files to automatically execute when running the corresponding game. 22 | # Each file should be on its own line, preceded by Path= 23 | 24 | [Doom.AutoExec] 25 | Path=/Users/tawehbeysolow/Documents/_vizdoom/autoexec.cfg 26 | 27 | [Heretic.AutoExec] 28 | Path=/Users/tawehbeysolow/Documents/_vizdoom/autoexec.cfg 29 | 30 | [Hexen.AutoExec] 31 | Path=/Users/tawehbeysolow/Documents/_vizdoom/autoexec.cfg 32 | 33 | [Strife.AutoExec] 34 | Path=/Users/tawehbeysolow/Documents/_vizdoom/autoexec.cfg 35 | 36 | [Chex.AutoExec] 37 | Path=/Users/tawehbeysolow/Documents/_vizdoom/autoexec.cfg 38 | 39 | # WAD files to always load. These are loaded after the IWAD but before 40 | # any files added with -file. Place each file on its own line, preceded 41 | # by Path= 42 | [Global.Autoload] 43 | 44 | # Wad files to automatically load depending on the game and IWAD you are 45 | # playing. You may have have files that are loaded for all similar IWADs 46 | # (the game) and files that are only loaded for particular IWADs. For example, 47 | # any files listed under 'doom.Autoload' will be loaded for any version of Doom, 48 | # but files listed under 'doom.doom2.Autoload' will only load when you are 49 | # playing a Doom 2 based game (doom2.wad, tnt.wad or plutonia.wad), and files listed under 50 | # 'doom.doom2.commercial.Autoload' only when playing doom2.wad. 
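# Hypothetical example (not an entry from the original config): a line such as
# Path=mymod.wad placed under [doom.doom2.Autoload] below would load for any
# Doom 2 based game (doom2.wad, tnt.wad or plutonia.wad), while the same line
# under [doom.Autoload] would load for every version of Doom.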
51 | 52 | [doom.Autoload] 53 | 54 | [doom.doom2.Autoload] 55 | 56 | [doom.doom2.commercial.Autoload] 57 | 58 | [doom.doom2.bfg.Autoload] 59 | 60 | [doom.doom2.plutonia.Autoload] 61 | 62 | [doom.doom2.tnt.Autoload] 63 | 64 | [doom.doom1.Autoload] 65 | 66 | [doom.doom1.registered.Autoload] 67 | 68 | [doom.doom1.ultimate.Autoload] 69 | 70 | [doom.doom1.bfg.Autoload] 71 | 72 | [doom.freedoom.Autoload] 73 | 74 | [doom.freedoom.demo.Autoload] 75 | 76 | [doom.freedoom.phase1.Autoload] 77 | 78 | [doom.freedoom.phase2.Autoload] 79 | 80 | [doom.freedoom.freedm.Autoload] 81 | 82 | [heretic.Autoload] 83 | 84 | [heretic.heretic.Autoload] 85 | 86 | [heretic.shadow.Autoload] 87 | 88 | [blasphemer.Autoload] 89 | 90 | [hexen.Autoload] 91 | 92 | [hexen.deathkings.Autoload] 93 | 94 | [hexen.hexen.Autoload] 95 | 96 | [strife.Autoload] 97 | 98 | [chex.Autoload] 99 | 100 | [chex.chex1.Autoload] 101 | 102 | [chex.chex3.Autoload] 103 | 104 | [urbanbrawl.Autoload] 105 | 106 | [hacx.Autoload] 107 | 108 | [hacx.hacx1.Autoload] 109 | 110 | [hacx.hacx2.Autoload] 111 | 112 | [harmony.Autoload] 113 | 114 | [square.Autoload] 115 | 116 | [square.squareware.Autoload] 117 | 118 | [square.square.Autoload] 119 | 120 | [LastRun] 121 | Version=211 122 | 123 | [GlobalSettings] 124 | gus_memsize=0 125 | midi_dmxgus=true 126 | gus_patchdir= 127 | midi_voices=32 128 | midi_config=timidity.cfg 129 | snd_efx=true 130 | snd_aldevice=Default 131 | wildmidi_enhanced_resampling=true 132 | wildmidi_reverb=false 133 | wildmidi_frequency=0 134 | wildmidi_config= 135 | fluid_chorus_type=0 136 | fluid_chorus_depth=8 137 | fluid_chorus_speed=0.3 138 | fluid_chorus_level=1 139 | fluid_chorus_voices=3 140 | fluid_reverb_level=0.57 141 | fluid_reverb_width=0.76 142 | fluid_reverb_damping=0.23 143 | fluid_reverb_roomsize=0.61 144 | fluid_threads=1 145 | fluid_samplerate=0 146 | fluid_interp=1 147 | fluid_voices=128 148 | fluid_chorus=true 149 | fluid_reverb=true 150 | fluid_gain=0.5 151 | fluid_patchset= 152 | opl_core=0 153 | opl_numchips=2 154 | timidity_frequency=44100 155 | timidity_pipe=90 156 | timidity_mastervolume=1 157 | timidity_byteswap=false 158 | timidity_8bit=false 159 | timidity_stereo=true 160 | timidity_reverb=0 161 | timidity_chorus=0 162 | timidity_extargs= 163 | timidity_exe=timidity 164 | snd_mididevice=-1 165 | spc_amp=1.875 166 | mod_dumb_mastervolume=1 167 | mod_autochip_scan_threshold=12 168 | mod_autochip_size_scan=500 169 | mod_autochip_size_force=100 170 | mod_autochip=false 171 | mod_interp=2 172 | mod_volramp=2 173 | mod_samplerate=0 174 | mod_dumb=true 175 | snd_sfxvolume=1 176 | snd_backend=openal 177 | snd_output=default 178 | snd_buffersize=0 179 | snd_samplerate=0 180 | snd_musicvolume=0.5 181 | snd_waterlp=250 182 | snd_midipatchset= 183 | snd_output_format=PCM-16 184 | snd_speakermode=Auto 185 | snd_resampler=Linear 186 | snd_waterreverb=true 187 | snd_hrtf=false 188 | snd_buffercount=0 189 | snd_driver=0 190 | opl_fullpan=true 191 | vid_tft=true 192 | m_showinputgrid=false 193 | m_show_backbutton=0 194 | m_use_mouse=1 195 | show_messages=true 196 | mouse_sensitivity=1 197 | map_point_coordinates=true 198 | vid_aspect=3 199 | vid_nowidescreen=false 200 | vid_refreshrate=0 201 | vid_vsync=false 202 | vid_defbits=8 203 | vid_defheight=480 204 | vid_defwidth=640 205 | Gamma=1 206 | statfile=zdoomstat.txt 207 | savestatistics=0 208 | snd_flipstereo=false 209 | snd_channels=32 210 | r_columnmethod=1 211 | r_quakeintensity=1 212 | cl_predict_lerpthreshold=2 213 | cl_predict_lerpscale=0.05 214 | 
cl_predict_specials=true 215 | cl_noprediction=false 216 | telezoom=true 217 | r_fakecontrast=1 218 | chase_dist=90 219 | chase_height=-8 220 | gl_cachetime=0.6 221 | gl_cachenodes=true 222 | nomonsterinterpolation=false 223 | png_gamma=0 224 | png_level=5 225 | screenshot_dir= 226 | screenshot_type=png 227 | screenshot_quiet=false 228 | use_joystick=false 229 | autosavecount=4 230 | disableautosave=0 231 | autosavenum=0 232 | smooth_mouse=false 233 | m_side=2 234 | m_forward=1 235 | m_yaw=1 236 | m_pitch=1 237 | lookstrafe=false 238 | freelook=false 239 | invertmouse=false 240 | cl_run=false 241 | demo_compress=true 242 | cl_waitforsave=true 243 | save_dir= 244 | longsavemessages=true 245 | storesavepic=true 246 | nofilecompression=false 247 | cl_capfps=true 248 | defaultiwad= 249 | queryiwad=true 250 | con_ctrl_d= 251 | con_buffersize=-1 252 | osx_additional_parameters= 253 | showendoom=0 254 | bgamma=1 255 | ggamma=1 256 | rgamma=1 257 | vid_forcesurface=false 258 | vid_displaybits=32 259 | vid_adapter=0 260 | mouse_capturemode=1 261 | m_filter=false 262 | m_noprescale=false 263 | use_mouse=false 264 | vid_winscale=1 265 | fullscreen=false 266 | vid_maxfps=200 267 | 268 | [GlobalSettings.Unknown] 269 | 270 | [Doom.Player] 271 | wi_noautostartmap=false 272 | playerclass=Fighter 273 | stillbob=0 274 | movebob=0.25 275 | neverswitchonpickup=false 276 | gender=male 277 | team=255 278 | skin=base 279 | colorset=0 280 | color=40 cf 00 281 | name=Player 282 | autoaim=35 283 | 284 | [Doom.ConsoleVariables] 285 | r_drawfuzz=1 286 | vid_nopalsubstitutions=false 287 | snd_pitched=false 288 | menu_screenratios=-1 289 | snd_menuvolume=0.6 290 | show_obituaries=true 291 | am_showmaplabel=2 292 | crosshairgrow=false 293 | crosshairscale=false 294 | crosshairhealth=true 295 | crosshaircolor=ff 00 00 296 | crosshairforce=false 297 | crosshair=0 298 | st_scale=true 299 | paletteflash=0 300 | hudcolor_stats=3 301 | hudcolor_statnames=6 302 | hudcolor_xyco=3 303 | hudcolor_ttim=5 304 | hudcolor_ltim=8 305 | hudcolor_time=6 306 | hudcolor_titl=10 307 | hud_berserk_health=true 308 | hud_armor_green=100 309 | hud_armor_yellow=50 310 | hud_armor_red=25 311 | hud_health_green=100 312 | hud_health_yellow=50 313 | hud_health_red=25 314 | hud_ammo_yellow=50 315 | hud_ammo_red=25 316 | hud_showlag=0 317 | hud_timecolor=5 318 | hud_showtime=0 319 | hud_showammo=2 320 | hud_showweapons=true 321 | hud_showscore=false 322 | hud_showstats=false 323 | hud_showitems=false 324 | hud_showmonsters=true 325 | hud_showsecrets=true 326 | hud_althud=false 327 | hud_althudscale=2 328 | st_oldouch=false 329 | cl_maxdecals=1024 330 | cl_spreaddecals=true 331 | transsouls=0.75 332 | wi_showtotaltime=true 333 | wi_percents=true 334 | dimcolor=ff d7 00 335 | dimamount=-1 336 | hud_scale=true 337 | allcheats=false 338 | r_stretchsky=true 339 | r_shadercolormaps=true 340 | screenblocks=10 341 | r_deathcamera=false 342 | cl_showsecretmessage=true 343 | cl_bloodtype=1 344 | cl_pufftype=0 345 | addrocketexplosion=false 346 | cl_missiledecals=true 347 | cl_doautoaim=false 348 | cl_bloodsplats=true 349 | cl_showmultikills=false 350 | cl_showsprees=false 351 | r_maxparticles=4092 352 | r_rail_trailsparsity=1 353 | r_rail_spiralsparsity=1 354 | r_rail_smartspiral=false 355 | cl_rockettrails=3 356 | dlg_musicvolume=1 357 | sb_teamdeathmatch_headingcolor=6 358 | sb_teamdeathmatch_enable=true 359 | sb_deathmatch_otherplayercolor=2 360 | sb_deathmatch_yourplayercolor=3 361 | sb_deathmatch_headingcolor=6 362 | sb_deathmatch_enable=true 363 | 
sb_cooperative_otherplayercolor=2 364 | sb_cooperative_yourplayercolor=3 365 | sb_cooperative_headingcolor=6 366 | sb_cooperative_enable=true 367 | nametagcolor=5 368 | displaynametags=0 369 | language=auto 370 | compatmode=0 371 | vid_cursor=None 372 | wipetype=0 373 | dehload=0 374 | chat_substitution=false 375 | chatmacro0=No 376 | chatmacro9=Yes 377 | chatmacro8=I'll take care of it. 378 | chatmacro7=Come here! 379 | chatmacro6=Next time, scumbag... 380 | chatmacro5=You suck! 381 | chatmacro4=Help! 382 | chatmacro3=I'm not looking too good! 383 | chatmacro2=I'm OK. 384 | chatmacro1=I'm ready to kick butt! 385 | lookspring=true 386 | con_midtime=0 387 | msgmidcolor2=4 388 | msgmidcolor=5 389 | msg4color=3 390 | msg3color=3 391 | msg2color=2 392 | msg1color=5 393 | msg0color=6 394 | msg=0 395 | con_alpha=0.75 396 | con_scaletext=0 397 | con_centernotify=false 398 | con_notifytime=0 399 | con_notablist=false 400 | cl_bbannounce=false 401 | am_followplayer=true 402 | am_textured=true 403 | am_ovthingcolor_citem=e8 88 00 404 | am_ovthingcolor_item=e8 88 00 405 | am_ovthingcolor_ncmonster=e8 88 00 406 | am_ovthingcolor_monster=e8 88 00 407 | am_ovthingcolor_friend=e8 88 00 408 | am_ovthingcolor=e8 88 00 409 | am_ovsecretsectorcolor=00 ff ff 410 | am_ovinterlevelcolor=ff ff 00 411 | am_ovtelecolor=ff ff 00 412 | am_ovunseencolor=00 22 6e 413 | am_ovcdwallcolor=00 88 44 414 | am_ovfdwallcolor=00 88 44 415 | am_ovefwallcolor=00 88 44 416 | am_ovlockedcolor=00 88 44 417 | am_ovotherwallscolor=00 88 44 418 | am_ovspecialwallcolor=ff ff ff 419 | am_ovsecretwallcolor=00 88 44 420 | am_ovwallcolor=00 ff 00 421 | am_ovyourcolor=fc e8 d8 422 | am_thingcolor_citem=fc fc fc 423 | am_thingcolor_item=fc fc fc 424 | am_thingcolor_ncmonster=fc fc fc 425 | am_thingcolor_monster=fc fc fc 426 | am_thingcolor_friend=fc fc fc 427 | am_secretsectorcolor=ff 00 ff 428 | am_interlevelcolor=ff 00 00 429 | am_intralevelcolor=00 00 ff 430 | am_lockedcolor=00 78 00 431 | am_notseencolor=6c 6c 6c 432 | am_xhaircolor=80 80 80 433 | am_gridcolor=8b 5a 2b 434 | am_thingcolor=fc fc fc 435 | am_efwallcolor=66 55 55 436 | am_cdwallcolor=4c 38 20 437 | am_fdwallcolor=88 70 58 438 | am_tswallcolor=88 88 88 439 | am_specialwallcolor=ff ff ff 440 | am_secretwallcolor=00 00 00 441 | am_wallcolor=2c 18 08 442 | am_yourcolor=fc e8 d8 443 | am_backcolor=6c 54 40 444 | am_showthingsprites=0 445 | am_showtriggerlines=true 446 | am_showkeys=true 447 | am_drawmapback=0 448 | am_map_secrets=1 449 | am_customcolors=true 450 | am_colorset=0 451 | am_showtotaltime=false 452 | am_showtime=false 453 | am_showitems=false 454 | am_showmonsters=false 455 | am_showsecrets=false 456 | am_overlay=0 457 | am_rotate=0 458 | 459 | [Doom.LocalServerInfo] 460 | sv_corpsequeuesize=64 461 | forcewater=false 462 | sv_smartaim=0 463 | sv_disableautohealth=false 464 | sv_dropstyle=0 465 | compatflags2=0 466 | compatflags=0 467 | 468 | [Doom.UnknownConsoleVariables] 469 | 470 | [Doom.ConsoleAliases] 471 | 472 | [Doom.Bindings] 473 | 1=slot 1 474 | 2=slot 2 475 | 3=slot 3 476 | 4=slot 4 477 | 5=slot 5 478 | 6=slot 6 479 | 7=slot 7 480 | 8=slot 8 481 | 9=slot 9 482 | 0=slot 0 483 | -=sizedown 484 | Equals=sizeup 485 | tab=togglemap 486 | t=messagemode 487 | LeftBracket=invprev 488 | RightBracket=invnext 489 | enter=invuse 490 | ctrl=+attack 491 | `=toggleconsole 492 | shift=+speed 493 | \=+showscores 494 | ,=+moveleft 495 | .=+moveright 496 | alt=+strafe 497 | space=+use 498 | capslock=toggle cl_run 499 | f1=menu_help 500 | f2=menu_save 501 | f3=menu_load 502 | 
f4=menu_options 503 | f5=menu_display 504 | f6=quicksave 505 | f7=menu_endgame 506 | f8=togglemessages 507 | f9=quickload 508 | f10=menu_quit 509 | f11=bumpgamma 510 | f12=spynext 511 | sysrq=screenshot 512 | pause=pause 513 | home=land 514 | uparrow=+forward 515 | pgup=+moveup 516 | leftarrow=+left 517 | rightarrow=+right 518 | end=centerview 519 | downarrow=+back 520 | pgdn=+lookup 521 | ins=+movedown 522 | del=+lookdown 523 | mouse1=+attack 524 | mouse2=+strafe 525 | mouse3=+forward 526 | mouse4=+speed 527 | joy1=+attack 528 | joy2=+strafe 529 | joy3=+speed 530 | joy4=+use 531 | mwheelup=weapprev 532 | mwheeldown=weapnext 533 | mwheelright=invnext 534 | mwheelleft=invprev 535 | dpadup=togglemap 536 | dpaddown=invuse 537 | dpadleft=invprev 538 | dpadright=invnext 539 | pad_start=pause 540 | pad_back=menu_main 541 | lthumb=crouch 542 | lshoulder=weapprev 543 | rshoulder=weapnext 544 | ltrigger=+altattack 545 | rtrigger=+attack 546 | pad_a=+use 547 | pad_y=+jump 548 | 549 | [Doom.DoubleBindings] 550 | 551 | [Doom.AutomapBindings] 552 | 0=am_gobig 553 | -=+am_zoomout 554 | Equals=+am_zoomin 555 | p=am_toggletexture 556 | f=am_togglefollow 557 | g=am_togglegrid 558 | c=am_clearmarks 559 | m=am_setmark 560 | kp-=+am_zoomout 561 | kp+=+am_zoomin 562 | uparrow=+am_panup 563 | leftarrow=+am_panleft 564 | rightarrow=+am_panright 565 | downarrow=+am_pandown 566 | mwheelup=am_zoom 1.2 567 | mwheeldown=am_zoom -1.2 568 | 569 | -------------------------------------------------------------------------------- /algorithms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/algorithms/__init__.py -------------------------------------------------------------------------------- /algorithms/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/algorithms/__init__.pyc -------------------------------------------------------------------------------- /algorithms/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/algorithms/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /algorithms/__pycache__/actor_critic_utilities.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/algorithms/__pycache__/actor_critic_utilities.cpython-36.pyc -------------------------------------------------------------------------------- /algorithms/__pycache__/dql_utilities.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/algorithms/__pycache__/dql_utilities.cpython-36.pyc -------------------------------------------------------------------------------- /algorithms/__pycache__/order_book_data.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/algorithms/__pycache__/order_book_data.cpython-36.pyc -------------------------------------------------------------------------------- /algorithms/__pycache__/trading.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/algorithms/__pycache__/trading.cpython-36.pyc -------------------------------------------------------------------------------- /algorithms/actor_critic_utilities.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Mar 16 06:54:29 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | import time, tensorflow as tf, numpy as np 10 | from baselines.common.runners import AbstractEnvRunner 11 | from baselines.common import explained_variance 12 | 13 | def mse(pred, target): 14 | return tf.square(pred-target)/2. 15 | 16 | def find_trainable_variables(key): 17 | with tf.variable_scope(key): 18 | return tf.trainable_variables() 19 | 20 | def swap_flatten_axes(array): 21 | return arrary.swapaxes(0, 1).reshape(array.shape[0] * array.shape[1], * array.shape[2:]) 22 | 23 | class Model(object): 24 | 25 | def __init__(self, session, policy_model, observation_space, action_space, n_environments, 26 | n_steps, entropy_coefficient, value_coefficient, max_grad_norm): 27 | 28 | session.run(tf.global_variables_initializer()) 29 | actions_ = tf.placeholder(tf.int32, [None], name='actions') 30 | advantages_ = tf.placeholder(tf.float32, [None], name='advantages') 31 | rewards_ = tf.placeholder(tf.float32, [None], name='rewards') 32 | learning_rate = tf.placeholder(tf.float32, name='learning_rate') 33 | step_model = policy_model(session, observation_space, action_space, n_environments, 1, reuse=False) 34 | train_model = policy_model(session, observation_space, action_space, n_environments*n_steps, n_steps, reuse=tf.AUTO_REUSE) 35 | 36 | error_rate = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.logits, labels=actions_) 37 | mean_squared_error = tf.reduce_mean(advantages_ * error_rate) 38 | 39 | value_loss = tf.reduce_mean(mse(tf.squeeze(train_model.value_function),rewards_)) 40 | entropy = tf.reduce_mean(train_model.distribution.entropy()) 41 | loss = mean_squared_error - entropy * entropy_coefficient + value_loss * value_coefficient 42 | 43 | params = find_trainable_variables('model') 44 | gradients = tf.gradients(loss, params) 45 | if max_grad_norm is not None: 46 | gradients, grad_norm = tf.clip_by_global_norm(gradients, max_grad_norm) 47 | 48 | gradients = list(zip(gradients, params)) 49 | trainer = tf.train.RMSPropOptimizer(learning_rate=learning_rate, decay=0.99, epsilon=1e-5) 50 | _train = trainer.apply_gradients(gradients) 51 | 52 | def train(states_in, actions, returns, values, learning_rate): 53 | advantages = returns - values 54 | 55 | dictionary = {train_model.inputs_: states_in, 56 | actions_: actions, 57 | advantages_: advantages, 58 | rewards_: returns, 59 | learning_rate: learning_rate} 60 | 61 | with tf.Session() as session: 62 | _policy_loss, _value_loss, _policy_entropy, _= session.run([mean_squared_error, 63 | value_loss, 64 | entropy, 65 | _train], dictionary) 66 | return _policy_loss, _value_loss, _policy_entropy 67 | 68 | def save(save_path): 69 | saver = 
tf.train.Saver() 70 | saver.save(session, save_path) 71 | 72 | def load(load_path): 73 | saver = tf.train.Saver() 74 | print('Loading ' + load_path) 75 | saver.restore(session, load_path) 76 | 77 | self.train = train 78 | self.train_model = train_model 79 | self.step_model = step_model 80 | self.step = step_model.step 81 | self.value = step_model.value 82 | self.initial_state = step_model.initial_state 83 | self.save = save 84 | self.load = load 85 | tf.global_variables_initializer().run(session=tf.Session()) 86 | 87 | class ModelTrainer(AbstractEnvRunner): 88 | 89 | def __init__(self, environment, model, n_steps, n_timesteps, gamma, _lambda): 90 | self.environment = environment 91 | self.model = model 92 | self.n_steps = n_steps 93 | self.gamma = gamma 94 | self._lambda = _lambda 95 | self.n_timesteps = n_timesteps 96 | self.observations = environment.reset() 97 | self.dones = False 98 | 99 | def step(self): 100 | 101 | _observations, _actions, _rewards, _values, _dones = [],[],[],[],[] 102 | 103 | for _ in range(self.n_steps): 104 | actions, values = self.model.step(self.observations, self.dones) 105 | _observations.append(np.copy(self.observations)) 106 | _actions.append(actions) 107 | _values.append(values) 108 | _dones.append(self.dones) 109 | if self.dones: self.environment.reset() 110 | 111 | for action in actions: 112 | self.environment.render() 113 | self.observations[:], rewards, self.dones, _ = self.environment.step(action) 114 | _rewards.append(rewards) 115 | 116 | #batch of steps to batch of rollouts 117 | _observations = np.asarray(_observations, dtype=np.uint8) 118 | _rewards = np.asarray(_rewards, dtype=np.float32) 119 | _actions = np.asarray(_actions, dtype=np.int32) 120 | _values = np.asarray(_values, dtype=np.float32) 121 | _dones = np.asarray(_dones, dtype=np.bool) 122 | last_values = self.model.value(self.observations) 123 | _returns = np.zeros_like(_rewards) 124 | _advantages = np.zeros_like(_rewards) 125 | last_lambda = 0 126 | 127 | for t in reversed(range(self.n_steps)): 128 | if t == self.nsteps - 1: 129 | next_nonterminal = 1.0 - self.dones 130 | next_values = last_values 131 | else: 132 | next_nonterminal = 1.0 - _dones[t+1] 133 | next_values = _values[t+1] 134 | 135 | delta = _rewards[t] + self.gamma * nextvalues * nextnonterminal - _values[t] 136 | _advantages[t] = last_lambda = delta + self.gamma * self._lambda * nextnonterminal * last_lambda 137 | 138 | _returns = _advantages + _values 139 | return map(swap_flatten_axes, (_observations, _actions, _returns, _values)) 140 | 141 | 142 | def train_model(policy_model, environment, n_steps, max_steps, gamma, _lambda, 143 | value_coefficient, entropy_coefficient, learning_rate, max_grad_norm, log_interval): 144 | 145 | n_epochs = 4 146 | n_batches = 8 147 | n_environments = 1 #environment.num_envs 148 | observation_space = environment.observation_space 149 | action_space = environment.action_space 150 | batch_size = n_environments * n_steps 151 | batch_train_size = batch_size // n_batches 152 | assert batch_size % n_batches == 0 153 | session = tf.Session() 154 | 155 | model = Model(session=session, 156 | policy_model=policy_model, 157 | observation_space=observation_space, 158 | action_space=action_space, 159 | n_environments=1, 160 | n_steps=1, 161 | entropy_coefficient=0, 162 | value_coefficient=0, 163 | max_grad_norm=0) 164 | 165 | model_trainer = ModelTrainer(environment=environment, 166 | model=model, 167 | n_steps=n_steps, 168 | n_timesteps=max_steps, 169 | gamma=gamma, 170 | _lambda=_lambda) 171 | 172 | 
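    # A minimal clarifying sketch (restated with consistent, illustrative
    # variable names; not part of the original source) of the generalized
    # advantage estimation (GAE) recursion that ModelTrainer.step() above is
    # intended to compute over each rollout:
    #
    #   last_lambda = 0.0
    #   for t in reversed(range(n_steps)):
    #       if t == n_steps - 1:
    #           next_nonterminal = 1.0 - done_after_rollout
    #           next_values = value_of_last_observation
    #       else:
    #           next_nonterminal = 1.0 - dones[t + 1]
    #           next_values = values[t + 1]
    #       delta = rewards[t] + gamma * next_values * next_nonterminal - values[t]
    #       advantages[t] = last_lambda = delta + gamma * _lambda * next_nonterminal * last_lambda
    #   returns = advantages + values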
initial_start_time = time.time() 173 | 174 | 175 | for update in range(1, max_steps//batch_size+1): 176 | 177 | timer_start = time.time() 178 | observations, actions, returns, values = model_trainer.step() 179 | mb_losses = [] 180 | total_batches_train = 0 181 | indices = np.arange(batch_size) 182 | 183 | for _ in range(n_epochs): 184 | np.random.shuffle(indices) 185 | for start in range(0, batch_size, batch_train_size): 186 | end = start + batch_train_size 187 | mbinds = indices[start:end] 188 | slices = (arr[mbinds] for arr in (obs, actions, returns, values)) 189 | mb_losses.append(model.train(*slices, lr)) 190 | 191 | loss = np.mean(mb_losses, axis=0) 192 | frames_per_second = int(batch_size / (time.time() - initial_start_time)) 193 | 194 | if update % log_interval == 0 or update == 1: 195 | 196 | """ 197 | Computes fraction of variance that ypred explains about y. 198 | Returns 1 - Var[y-ypred] / Var[y] 199 | interpretation: 200 | explained_variance = 0 => might as well have predicted zero 201 | explained_variance = 1 => perfect prediction 202 | explained_variance < 0 => worse than just predicting zero 203 | """ 204 | _explained_variance = explained_variance(values, returns) 205 | logger.record_tabular("nupdates", update) 206 | logger.record_tabular("total_timesteps", update*batch_size) 207 | logger.record_tabular("fps", frames_per_second) 208 | logger.record_tabular("policy_loss", float(loss[0])) 209 | logger.record_tabular("policy_entropy", float(loss[2])) 210 | logger.record_tabular("value_loss", float(loss[1])) 211 | logger.record_tabular("explained_variance", float(_explained_variance)) 212 | logger.record_tabular("time elapsed", float(time.time() - initial_start_time)) 213 | logger.dump_tabular() 214 | 215 | savepath = "./models/" + str(update) + "/model.ckpt" 216 | model.save(savepath) 217 | print('Saving to', savepath) 218 | 219 | environment.close() 220 | return model 221 | -------------------------------------------------------------------------------- /algorithms/distributions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Jun 18 15:41:31 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | import tensorflow as tf 10 | import numpy as np 11 | from baselines.a2c.utils import fc 12 | from tensorflow.python.ops import math_ops 13 | #import baselines.common.tf_util as U 14 | 15 | class Pd(object): 16 | """ 17 | A particular probability distribution 18 | """ 19 | def flatparam(self): 20 | raise NotImplementedError 21 | def mode(self): 22 | raise NotImplementedError 23 | def neglogp(self, x): 24 | # Usually it's easier to define the negative logprob 25 | raise NotImplementedError 26 | def kl(self, other): 27 | raise NotImplementedError 28 | def entropy(self): 29 | raise NotImplementedError 30 | def sample(self): 31 | raise NotImplementedError 32 | def logp(self, x): 33 | return - self.neglogp(x) 34 | def get_shape(self): 35 | return self.flatparam().shape 36 | @property 37 | def shape(self): 38 | return self.get_shape() 39 | def __getitem__(self, idx): 40 | return self.__class__(self.flatparam()[idx]) 41 | 42 | class PdType(object): 43 | """ 44 | Parametrized family of probability distributions 45 | """ 46 | def pdclass(self): 47 | raise NotImplementedError 48 | def pdfromflat(self, flat): 49 | return self.pdclass()(flat) 50 | def pdfromlatent(self, latent_vector, init_scale, init_bias): 51 | raise NotImplementedError 52 | def param_shape(self): 53 | raise 
NotImplementedError 54 | def sample_shape(self): 55 | raise NotImplementedError 56 | def sample_dtype(self): 57 | raise NotImplementedError 58 | 59 | def param_placeholder(self, prepend_shape, name=None): 60 | return tf.placeholder(dtype=tf.float32, shape=prepend_shape+self.param_shape(), name=name) 61 | def sample_placeholder(self, prepend_shape, name=None): 62 | return tf.placeholder(dtype=self.sample_dtype(), shape=prepend_shape+self.sample_shape(), name=name) 63 | 64 | def __eq__(self, other): 65 | return (type(self) == type(other)) and (self.__dict__ == other.__dict__) 66 | 67 | class CategoricalPdType(PdType): 68 | def __init__(self, ncat): 69 | self.ncat = ncat 70 | def pdclass(self): 71 | return CategoricalPd 72 | def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0): 73 | pdparam = _matching_fc(latent_vector, 'pi', self.ncat, init_scale=init_scale, init_bias=init_bias) 74 | return self.pdfromflat(pdparam), pdparam 75 | 76 | def param_shape(self): 77 | return [self.ncat] 78 | def sample_shape(self): 79 | return [] 80 | def sample_dtype(self): 81 | return tf.int32 82 | 83 | 84 | class MultiCategoricalPdType(PdType): 85 | def __init__(self, nvec): 86 | self.ncats = nvec.astype('int32') 87 | assert (self.ncats > 0).all() 88 | def pdclass(self): 89 | return MultiCategoricalPd 90 | def pdfromflat(self, flat): 91 | return MultiCategoricalPd(self.ncats, flat) 92 | 93 | def pdfromlatent(self, latent, init_scale=1.0, init_bias=0.0): 94 | pdparam = _matching_fc(latent, 'pi', self.ncats.sum(), init_scale=init_scale, init_bias=init_bias) 95 | return self.pdfromflat(pdparam), pdparam 96 | 97 | def param_shape(self): 98 | return [sum(self.ncats)] 99 | def sample_shape(self): 100 | return [len(self.ncats)] 101 | def sample_dtype(self): 102 | return tf.int32 103 | 104 | class DiagGaussianPdType(PdType): 105 | def __init__(self, size): 106 | self.size = size 107 | def pdclass(self): 108 | return DiagGaussianPd 109 | 110 | def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0): 111 | mean = _matching_fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias) 112 | logstd = tf.get_variable(name='pi/logstd', shape=[1, self.size], initializer=tf.zeros_initializer()) 113 | pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) 114 | return self.pdfromflat(pdparam), mean 115 | 116 | def param_shape(self): 117 | return [2*self.size] 118 | def sample_shape(self): 119 | return [self.size] 120 | def sample_dtype(self): 121 | return tf.float32 122 | 123 | class BernoulliPdType(PdType): 124 | def __init__(self, size): 125 | self.size = size 126 | def pdclass(self): 127 | return BernoulliPd 128 | def param_shape(self): 129 | return [self.size] 130 | def sample_shape(self): 131 | return [self.size] 132 | def sample_dtype(self): 133 | return tf.int32 134 | def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0): 135 | pdparam = _matching_fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias) 136 | return self.pdfromflat(pdparam), pdparam 137 | 138 | # WRONG SECOND DERIVATIVES 139 | # class CategoricalPd(Pd): 140 | # def __init__(self, logits): 141 | # self.logits = logits 142 | # self.ps = tf.nn.softmax(logits) 143 | # @classmethod 144 | # def fromflat(cls, flat): 145 | # return cls(flat) 146 | # def flatparam(self): 147 | # return self.logits 148 | # def mode(self): 149 | # return U.argmax(self.logits, axis=-1) 150 | # def logp(self, x): 151 | # return -tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, x) 152 | # 
def kl(self, other): 153 | # return tf.nn.softmax_cross_entropy_with_logits(other.logits, self.ps) \ 154 | # - tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 155 | # def entropy(self): 156 | # return tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 157 | # def sample(self): 158 | # u = tf.random_uniform(tf.shape(self.logits)) 159 | # return U.argmax(self.logits - tf.log(-tf.log(u)), axis=-1) 160 | 161 | class CategoricalPd(Pd): 162 | def __init__(self, logits): 163 | self.logits = logits 164 | def flatparam(self): 165 | return self.logits 166 | def mode(self): 167 | return tf.argmax(self.logits, axis=-1) 168 | 169 | @property 170 | def mean(self): 171 | return tf.nn.softmax(self.logits) 172 | def neglogp(self, x): 173 | # return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x) 174 | # Note: we can't use sparse_softmax_cross_entropy_with_logits because 175 | # the implementation does not allow second-order derivatives... 176 | if x.dtype in {tf.uint8, tf.int32, tf.int64}: 177 | # one-hot encoding 178 | x_shape_list = x.shape.as_list() 179 | logits_shape_list = self.logits.get_shape().as_list()[:-1] 180 | for xs, ls in zip(x_shape_list, logits_shape_list): 181 | if xs is not None and ls is not None: 182 | assert xs == ls, 'shape mismatch: {} in x vs {} in logits'.format(xs, ls) 183 | 184 | x = tf.one_hot(x, self.logits.get_shape().as_list()[-1]) 185 | else: 186 | # already encoded 187 | assert x.shape.as_list() == self.logits.shape.as_list() 188 | 189 | return tf.nn.softmax_cross_entropy_with_logits_v2( 190 | logits=self.logits, 191 | labels=x) 192 | def kl(self, other): 193 | a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keepdims=True) 194 | a1 = other.logits - tf.reduce_max(other.logits, axis=-1, keepdims=True) 195 | ea0 = tf.exp(a0) 196 | ea1 = tf.exp(a1) 197 | z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True) 198 | z1 = tf.reduce_sum(ea1, axis=-1, keepdims=True) 199 | p0 = ea0 / z0 200 | return tf.reduce_sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=-1) 201 | def entropy(self): 202 | a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keepdims=True) 203 | ea0 = tf.exp(a0) 204 | z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True) 205 | p0 = ea0 / z0 206 | return tf.reduce_sum(p0 * (tf.log(z0) - a0), axis=-1) 207 | def sample(self): 208 | u = tf.random_uniform(tf.shape(self.logits), dtype=self.logits.dtype) 209 | return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=-1) 210 | @classmethod 211 | def fromflat(cls, flat): 212 | return cls(flat) 213 | 214 | class MultiCategoricalPd(Pd): 215 | def __init__(self, nvec, flat): 216 | self.flat = flat 217 | self.categoricals = list(map(CategoricalPd, 218 | tf.split(flat, np.array(nvec, dtype=np.int32), axis=-1))) 219 | def flatparam(self): 220 | return self.flat 221 | def mode(self): 222 | return tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1), tf.int32) 223 | def neglogp(self, x): 224 | return tf.add_n([p.neglogp(px) for p, px in zip(self.categoricals, tf.unstack(x, axis=-1))]) 225 | def kl(self, other): 226 | return tf.add_n([p.kl(q) for p, q in zip(self.categoricals, other.categoricals)]) 227 | def entropy(self): 228 | return tf.add_n([p.entropy() for p in self.categoricals]) 229 | def sample(self): 230 | return tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1), tf.int32) 231 | @classmethod 232 | def fromflat(cls, flat): 233 | raise NotImplementedError 234 | 235 | class DiagGaussianPd(Pd): 236 | def __init__(self, flat): 237 | self.flat = flat 
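        # Descriptive note: `flat` holds [mean, logstd] concatenated along the
        # last axis (see DiagGaussianPdType.pdfromlatent above); the tf.split
        # below recovers the two halves, and std = tf.exp(logstd) gives the
        # per-dimension standard deviation of the diagonal Gaussian.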
238 | mean, logstd = tf.split(axis=len(flat.shape)-1, num_or_size_splits=2, value=flat) 239 | self.mean = mean 240 | self.logstd = logstd 241 | self.std = tf.exp(logstd) 242 | def flatparam(self): 243 | return self.flat 244 | def mode(self): 245 | return self.mean 246 | def neglogp(self, x): 247 | return 0.5 * tf.reduce_sum(tf.square((x - self.mean) / self.std), axis=-1) \ 248 | + 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[-1]) \ 249 | + tf.reduce_sum(self.logstd, axis=-1) 250 | def kl(self, other): 251 | assert isinstance(other, DiagGaussianPd) 252 | return tf.reduce_sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=-1) 253 | def entropy(self): 254 | return tf.reduce_sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), axis=-1) 255 | def sample(self): 256 | return self.mean + self.std * tf.random_normal(tf.shape(self.mean)) 257 | @classmethod 258 | def fromflat(cls, flat): 259 | return cls(flat) 260 | 261 | 262 | class BernoulliPd(Pd): 263 | def __init__(self, logits): 264 | self.logits = logits 265 | self.ps = tf.sigmoid(logits) 266 | def flatparam(self): 267 | return self.logits 268 | @property 269 | def mean(self): 270 | return self.ps 271 | def mode(self): 272 | return tf.round(self.ps) 273 | def neglogp(self, x): 274 | return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=-1) 275 | def kl(self, other): 276 | return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=-1) - tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1) 277 | def entropy(self): 278 | return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1) 279 | def sample(self): 280 | u = tf.random_uniform(tf.shape(self.ps)) 281 | return tf.to_float(math_ops.less(u, self.ps)) 282 | @classmethod 283 | def fromflat(cls, flat): 284 | return cls(flat) 285 | 286 | def make_pdtype(ac_space): 287 | from gym import spaces 288 | if isinstance(ac_space, spaces.Box): 289 | assert len(ac_space.shape) == 1 290 | return DiagGaussianPdType(ac_space.shape[0]) 291 | elif isinstance(ac_space, spaces.Discrete): 292 | return CategoricalPdType(ac_space.n) 293 | elif isinstance(ac_space, spaces.MultiDiscrete): 294 | return MultiCategoricalPdType(ac_space.nvec) 295 | elif isinstance(ac_space, spaces.MultiBinary): 296 | return BernoulliPdType(ac_space.n) 297 | else: 298 | raise NotImplementedError 299 | 300 | def shape_el(v, i): 301 | maybe = v.get_shape()[i] 302 | if maybe is not None: 303 | return maybe 304 | else: 305 | return tf.shape(v)[i] 306 | 307 | ''' 308 | @U.in_session 309 | def test_probtypes(): 310 | np.random.seed(0) 311 | 312 | pdparam_diag_gauss = np.array([-.2, .3, .4, -.5, .1, -.5, .1, 0.8]) 313 | diag_gauss = DiagGaussianPdType(pdparam_diag_gauss.size // 2) #pylint: disable=E1101 314 | validate_probtype(diag_gauss, pdparam_diag_gauss) 315 | 316 | pdparam_categorical = np.array([-.2, .3, .5]) 317 | categorical = CategoricalPdType(pdparam_categorical.size) #pylint: disable=E1101 318 | validate_probtype(categorical, pdparam_categorical) 319 | 320 | nvec = [1,2,3] 321 | pdparam_multicategorical = np.array([-.2, .3, .5, .1, 1, -.1]) 322 | multicategorical = MultiCategoricalPdType(nvec) #pylint: disable=E1101 323 | validate_probtype(multicategorical, pdparam_multicategorical) 324 | 325 | pdparam_bernoulli = np.array([-.2, .3, .5]) 326 | bernoulli = 
BernoulliPdType(pdparam_bernoulli.size) #pylint: disable=E1101 327 | validate_probtype(bernoulli, pdparam_bernoulli) 328 | 329 | 330 | def validate_probtype(probtype, pdparam): 331 | N = 100000 332 | # Check to see if mean negative log likelihood == differential entropy 333 | Mval = np.repeat(pdparam[None, :], N, axis=0) 334 | M = probtype.param_placeholder([N]) 335 | X = probtype.sample_placeholder([N]) 336 | pd = probtype.pdfromflat(M) 337 | calcloglik = U.function([X, M], pd.logp(X)) 338 | calcent = U.function([M], pd.entropy()) 339 | Xval = tf.get_default_session().run(pd.sample(), feed_dict={M:Mval}) 340 | logliks = calcloglik(Xval, Mval) 341 | entval_ll = - logliks.mean() #pylint: disable=E1101 342 | entval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101 343 | entval = calcent(Mval).mean() #pylint: disable=E1101 344 | assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr # within 3 sigmas 345 | 346 | # Check to see if kldiv[p,q] = - ent[p] - E_p[log q] 347 | M2 = probtype.param_placeholder([N]) 348 | pd2 = probtype.pdfromflat(M2) 349 | q = pdparam + np.random.randn(pdparam.size) * 0.1 350 | Mval2 = np.repeat(q[None, :], N, axis=0) 351 | calckl = U.function([M, M2], pd.kl(pd2)) 352 | klval = calckl(Mval, Mval2).mean() #pylint: disable=E1101 353 | logliks = calcloglik(Xval, Mval2) 354 | klval_ll = - entval - logliks.mean() #pylint: disable=E1101 355 | klval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101 356 | assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr # within 3 sigmas 357 | print('ok on', probtype, pdparam) 358 | ''' 359 | 360 | def _matching_fc(tensor, name, size, init_scale, init_bias): 361 | if tensor.shape[-1] == size: 362 | return tensor 363 | else: 364 | return fc(tensor, name, size, init_scale=init_scale, init_bias=init_bias) 365 | -------------------------------------------------------------------------------- /algorithms/distributions.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/algorithms/distributions.pyc -------------------------------------------------------------------------------- /algorithms/dql_utilities.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Mar 18 10:59:09 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | import numpy as np 10 | from skimage import transform 11 | from collections import deque 12 | from vizdoom import * 13 | 14 | class Memory(): 15 | 16 | def __init__(self, max_size): 17 | self.buffer = deque(maxlen = max_size) 18 | 19 | def add(self, experience): 20 | self.buffer.append(experience) 21 | 22 | def sample(self, batch_size): 23 | buffer_size = len(self.buffer) 24 | index = np.random.choice(np.arange(buffer_size), 25 | size=batch_size, 26 | replace=True) 27 | 28 | return [self.buffer[i] for i in index] 29 | 30 | def create_environment(filepath='/Users/tawehbeysolow/Desktop/applied_rl_python/chapter3/'): 31 | game = DoomGame() 32 | game.load_config(filepath+'basic.cfg') 33 | game.set_doom_scenario_path(filepath+'basic.wad') 34 | game.init() 35 | 36 | left = [1, 0, 0] 37 | right = [0, 1, 0] 38 | shoot = [0, 0, 1] 39 | possible_actions = [left, right, shoot] 40 | return game, possible_actions 41 | 42 | 43 | def preprocess_frame(frame): 44 | cropped_frame = frame[30:-10,30:-30] 45 | normalized_frame = cropped_frame/float(255) 46 
| preprocessed_frame = transform.resize(normalized_frame, [84,84]) 47 | return preprocessed_frame 48 | 49 | def stack_frames(stacked_frames, state, new_episode, stack_size=4): 50 | 51 | frame = preprocess_frame(state) 52 | 53 | if new_episode == True: 54 | 55 | stacked_frames = deque([np.zeros((84,84), dtype=np.int) for i in range(stack_size)], maxlen=4) 56 | for i in range(4): 57 | stacked_frames.append(frame) 58 | 59 | stacked_state = np.stack(stacked_frames, axis=2) 60 | 61 | else: 62 | 63 | stacked_frames.append(frame) 64 | stacked_state = np.stack(stacked_frames, axis=2) 65 | 66 | return stacked_state, stacked_frames -------------------------------------------------------------------------------- /algorithms/dql_utilities.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/algorithms/dql_utilities.pyc -------------------------------------------------------------------------------- /algorithms/order_book_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Mar 25 15:56:09 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | from tgym.core import DataGenerator 10 | import numpy as np, csv 11 | 12 | def remove_non_ascii(obj): 13 | return ''.join([character for character in obj if ord(character) < 128]) 14 | 15 | class bid_ask_data(DataGenerator): 16 | 17 | def __init__(self, **gen_kwargs): 18 | """Initialisation function. The API (gen_kwargs) should be defined in 19 | the function _generator. 20 | """ 21 | self._trainable = False 22 | self.gen_kwargs = gen_kwargs 23 | DataGenerator.rewind(self) 24 | self.n_products = 1 25 | DataGenerator.rewind(self) 26 | 27 | @staticmethod 28 | def _generator(): 29 | 30 | with open('/Users/tawehbeysolow/Downloads/amazon_order_book_data.csv', 'rU') as csvfile: 31 | reader = csv.reader(csvfile) 32 | for row in reader: 33 | row = [float(remove_non_ascii(_row))/ for _row in row] 34 | yield np.array(row, dtype=np.float) 35 | 36 | def _iterator_end(self): 37 | """Rewinds if end of data reached. 38 | """ 39 | print "End of data reached, rewinding." 40 | super(self.__class__, self).rewind() 41 | 42 | 43 | def next(self): 44 | """Return the next element in the generator. 45 | Args: 46 | numpy.array: next row of the generator 47 | """ 48 | try: 49 | return next(self.generator) 50 | except StopIteration as e: 51 | self._iterator_end() 52 | raise(e) 53 | 54 | def rewind(self): 55 | """Rewind the generator. 
56 | """ 57 | self.generator = self._generator() 58 | 59 | 60 | if __name__ == '__main__': 61 | 62 | 63 | generator = bid_ask_data(filename='amazon_order_book_data.csv', filepath='/Users/tawehbeysolow/Downloads/') 64 | prices_time_series = [next(generator.preprocess()) for _ in range(100)] 65 | import pdb; pdb.set_trace() 66 | -------------------------------------------------------------------------------- /algorithms/order_book_data.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/algorithms/order_book_data.pyc -------------------------------------------------------------------------------- /algorithms/policy_gradient_utilities.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Mar 16 06:54:29 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | import tensorflow as tf, numpy as np 10 | from baselines.a2c.utils import cat_entropy, mse 11 | 12 | class Model(object): 13 | """ 14 | We use this object to : 15 | __init__: 16 | - Creates the step_model 17 | - Creates the train_model 18 | train(): 19 | - Make the training part (feedforward and retropropagation of gradients) 20 | save/load(): 21 | - Save load the model 22 | """ 23 | def __init__(self, 24 | policy, 25 | ob_space, 26 | action_space, 27 | nenvs, 28 | nsteps, 29 | ent_coef, 30 | vf_coef, 31 | max_grad_norm): 32 | 33 | sess = tf.get_default_session() 34 | 35 | # Here we create the placeholders 36 | actions_ = tf.placeholder(tf.int32, [None], name="actions_") 37 | advantages_ = tf.placeholder(tf.float32, [None], name="advantages_") 38 | rewards_ = tf.placeholder(tf.float32, [None], name="rewards_") 39 | lr_ = tf.placeholder(tf.float32, name="learning_rate_") 40 | 41 | # Here we create our two models: 42 | # Step_model that is used for sampling 43 | step_model = policy(sess, ob_space, action_space, nenvs, 1, reuse=False) 44 | 45 | # Train model for training 46 | train_model = policy(sess, ob_space, action_space, nenvs*nsteps, nsteps, reuse=True) 47 | 48 | """ 49 | Calculate the loss 50 | Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss 51 | """ 52 | # Policy loss 53 | # Output -log(pi) 54 | neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=actions_) 55 | 56 | # 1/n * sum A(si,ai) * -logpi(ai|si) 57 | pg_loss = tf.reduce_mean(advantages_ * neglogpac) 58 | 59 | # Value loss 1/2 SUM [R - V(s)]^2 60 | vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf),rewards_)) 61 | 62 | # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. 63 | entropy = tf.reduce_mean(train_model.pd.entropy()) 64 | 65 | 66 | loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef 67 | 68 | # Update parameters using loss 69 | # 1. Get the model parameters 70 | params = find_trainable_variables("model") 71 | 72 | # 2. Calculate the gradients 73 | gradients = tf.gradients(loss, params) 74 | if max_grad_norm is not None: 75 | # Clip the gradients (normalize) 76 | gradients, grad_norm = tf.clip_by_global_norm(gradients, max_grad_norm) 77 | 78 | gradients = list(zip(gradients, params)) 79 | # zip aggregate each gradient with parameters associated 80 | # For instance zip(ABCD, xyza) => Ax, By, Cz, Da 81 | 82 | # 3. 
Build our trainer 83 | trainer = tf.train.RMSPropOptimizer(learning_rate=lr_, decay=0.99, epsilon=1e-5) 84 | 85 | # 4. Backpropagation 86 | _train = trainer.apply_gradients(gradients) 87 | 88 | def train(states_in, actions, returns, values, lr): 89 | advantages = returns - values 90 | 91 | # We create the feed dictionary 92 | td_map = {train_model.inputs_: states_in, 93 | actions_: actions, 94 | advantages_: advantages, # Use to calculate our policy loss 95 | rewards_: returns, # Use as a bootstrap for real value 96 | lr_: lr} 97 | 98 | policy_loss, value_loss, policy_entropy, _= sess.run([pg_loss, vf_loss, entropy, _train], td_map) 99 | 100 | return policy_loss, value_loss, policy_entropy 101 | 102 | 103 | def save(save_path): 104 | """ 105 | Save the model 106 | """ 107 | saver = tf.train.Saver() 108 | saver.save(sess, save_path) 109 | 110 | def load(load_path): 111 | """ 112 | Load the model 113 | """ 114 | saver = tf.train.Saver() 115 | print('Loading ' + load_path) 116 | saver.restore(sess, load_path) 117 | 118 | self.train = train 119 | self.train_model = train_model 120 | self.step_model = step_model 121 | self.step = step_model.step 122 | self.value = step_model.value 123 | self.initial_state = step_model.initial_state 124 | self.save = save 125 | self.load = load 126 | tf.global_variables_initializer().run(session=sess) 127 | 128 | class Runner(AbstractEnvRunner): 129 | """ 130 | We use this object to make a mini batch of experiences 131 | 132 | __init__: 133 | - Initialize the runner 134 | run(): 135 | 136 | - Make a mini batch 137 | """ 138 | def __init__(self, env, model, nsteps, total_timesteps, gamma, lam): 139 | super().__init__(env = env, model = model, nsteps = nsteps) 140 | 141 | # Discount rate 142 | self.gamma = gamma 143 | 144 | # Lambda used in GAE (General Advantage Estimation) 145 | self.lam = lam 146 | 147 | # Total timesteps taken 148 | self.total_timesteps = total_timesteps 149 | 150 | def run(self): 151 | # Here, we init the lists that will contain the mb of experiences 152 | mb_obs, mb_actions, mb_rewards, mb_values, mb_dones = [],[],[],[],[] 153 | 154 | # For n in range number of steps 155 | for n in range(self.nsteps): 156 | # Given observations, take action and value (V(s)) 157 | # We already have self.obs because AbstractEnvRunner run self.obs[:] = env.reset() 158 | actions, values = self.model.step(self.obs, self.dones) 159 | 160 | #print("actions runner runner", actions) 161 | 162 | # Append the observations into the mb 163 | mb_obs.append(np.copy(self.obs)) #obs len nenvs (1 step per env) 164 | 165 | # Append the actions taken into the mb 166 | mb_actions.append(actions) 167 | 168 | # Append the values calculated into the mb 169 | mb_values.append(values) 170 | 171 | # Append the dones situations into the mb 172 | mb_dones.append(self.dones) 173 | 174 | # Take actions in env and look the results 175 | self.obs[:], rewards, self.dones, _ = self.env.step(actions) 176 | 177 | mb_rewards.append(rewards) 178 | 179 | #batch of steps to batch of rollouts 180 | mb_obs = np.asarray(mb_obs, dtype=np.uint8) 181 | mb_rewards = np.asarray(mb_rewards, dtype=np.float32) 182 | mb_actions = np.asarray(mb_actions, dtype=np.int32) 183 | mb_values = np.asarray(mb_values, dtype=np.float32) 184 | mb_dones = np.asarray(mb_dones, dtype=np.bool) 185 | last_values = self.model.value(self.obs) 186 | 187 | 188 | ### GENERALIZED ADVANTAGE ESTIMATION 189 | # discount/bootstrap off value fn 190 | # We create mb_returns and mb_advantages 191 | # mb_returns will contain Advantage + 
value 192 | mb_returns = np.zeros_like(mb_rewards) 193 | mb_advantages = np.zeros_like(mb_rewards) 194 | 195 | lastgaelam = 0 196 | 197 | # From last step to first step 198 | for t in reversed(range(self.nsteps)): 199 | # If t == before last step 200 | if t == self.nsteps - 1: 201 | # If a state is done, nextnonterminal = 0 202 | # In fact nextnonterminal allows us to do that logic 203 | 204 | #if done (so nextnonterminal = 0): 205 | # delta = R - V(s) (because self.gamma * nextvalues * nextnonterminal = 0) 206 | # else (not done) 207 | #delta = R + gamma * V(st+1) 208 | nextnonterminal = 1.0 - self.dones 209 | 210 | # V(t+1) 211 | nextvalues = last_values 212 | else: 213 | nextnonterminal = 1.0 - mb_dones[t+1] 214 | 215 | nextvalues = mb_values[t+1] 216 | 217 | # Delta = R(st) + gamma * V(t+1) * nextnonterminal - V(st) 218 | delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t] 219 | 220 | # Advantage = delta + gamma * λ (lambda) * nextnonterminal * lastgaelam 221 | mb_advantages[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam 222 | 223 | # Returns 224 | mb_returns = mb_advantages + mb_values 225 | 226 | return map(swap_flatten_axes, (mb_obs, mb_actions, mb_returns, mb_values)) 227 | 228 | 229 | def swap_flatten_axes(arr): 230 | """ 231 | swap and then flatten axes 0 and 1 232 | """ 233 | s = arr.shape 234 | return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:]) 235 | -------------------------------------------------------------------------------- /algorithms/sarsa_algorithm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Mar 8 13:16:38 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | from collections import defaultdict 10 | import numpy as np 11 | 12 | class EligibilityTrace(object): 13 | """class containing logic for SARSA-lambda eligibility traces 14 | this is basically a wrapper for a dict that 15 | 1) clips its values to lie in the interval [0, 1] 16 | 2) updates all values by a decay constant and throws out those 17 | that fall below some threshold 18 | """ 19 | def __init__(self, decay, threshold): 20 | self.decay = decay 21 | self.threshold = threshold 22 | self.data = defaultdict(float) 23 | 24 | def __getitem__(self, key): 25 | return self.data[key] 26 | 27 | def __setitem__(self, key, val): 28 | self.data[key] = np.clip(val, 0, 1) 29 | 30 | def iteritems(self): 31 | return self.data.iteritems() 32 | 33 | def update(self): 34 | for key in self.data.keys(): 35 | if self.data[key] < self.threshold: 36 | del self.data[key] 37 | else: 38 | self.data[key] = self.data[key] * self.decay 39 | 40 | 41 | class SARSA(Agent): 42 | """impementation of SARSA lambda algorithm. 43 | class SARSA is equivilant to this with lambda = 0, but 44 | we seperate the two out because 45 | 1) it's nice to juxtapose the two algorithms side-by-side 46 | 2) SARSA lambda incurrs the overhead of maintaining 47 | eligibility traces 48 | note that the algorithm isn't explicitly parameterized with lambda. 49 | instead, we provide a decay rate and threshold. On each iteration, 50 | the decay is applied all rewards in the eligibility trace. 
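    For illustration, with the defaults used below (decay=0.98, threshold=0.1):
    a trace entry shrinks by a factor of 0.98 per update, so it falls below 0.1
    after roughly log(0.1)/log(0.98), about 114, updates; in effect the trace
    length is around 114 steps.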
Those 51 | past rewards who have decayed below the threshold are dropped 52 | """ 53 | def __init__(self, featureExtractor, max_gradient, epsilon=0.5, gamma=0.993, stepSize=None, threshold=0.1, decay=0.98): 54 | super(SARSA, self).__init__(featureExtractor, epsilon, gamma, stepSize, max_gradient) 55 | self.eligibility_trace = EligibilityTrace(decay, threshold) 56 | 57 | def update_q_matrix(self, state, action, reward, newState): 58 | """performs a SARSA update. Leverages the eligibility trace to update 59 | parameters towards sum of discounted rewards 60 | """ 61 | self.eligibility_trace.update() 62 | prediction = self.getQ(state, action) 63 | newAction = None 64 | target = reward 65 | for f, v in self.featureExtractor.get_features(state, action).iteritems(): 66 | self.eligibility_trace[f] += v 67 | 68 | if newState != None: 69 | newAction = self.takeAction(newState) 70 | target += self.discount * self.getQ(newState, newAction) 71 | 72 | update = self.getStepSize(self.numIters) * (prediction - target) 73 | # clip gradient - TODO EXPORT TO UTILS? 74 | update = max(-self.max_gradient, update) if update < 0 else min(self.max_gradient, update) 75 | 76 | for key, eligibility in self.eligibility_trace.iteritems(): 77 | self.weights[key] -= update * eligibility 78 | return newAction 79 | 80 | 81 | -------------------------------------------------------------------------------- /algorithms/trading.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Jun 18 15:41:31 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | import matplotlib as mpl 10 | import matplotlib.pyplot as plt 11 | import numpy as np 12 | from tgym.core import Env 13 | from tgym.utils import calc_spread 14 | 15 | plt.style.use('dark_background') 16 | mpl.rcParams.update( 17 | { 18 | "font.size": 15, 19 | "axes.labelsize": 15, 20 | "lines.linewidth": 1, 21 | "lines.markersize": 8 22 | } 23 | ) 24 | 25 | 26 | class SpreadTrading(Env): 27 | """Class for a discrete (buy/hold/sell) spread trading environment. 28 | """ 29 | 30 | _actions = { 31 | 'hold': np.array([1, 0, 0]), 32 | 'buy': np.array([0, 1, 0]), 33 | 'sell': np.array([0, 0, 1]) 34 | } 35 | 36 | _positions = { 37 | 'flat': np.array([1, 0, 0]), 38 | 'long': np.array([0, 1, 0]), 39 | 'short': np.array([0, 0, 1]) 40 | } 41 | 42 | def __init__(self, data_generator, spread_coefficients, episode_length=1000, trading_fee=0, time_fee=0, history_length=2): 43 | """Initialisation function 44 | 45 | Args: 46 | data_generator (tgym.core.DataGenerator): A data 47 | generator object yielding a 1D array of bid-ask prices. 48 | spread_coefficients (list): A list of signed integers defining 49 | how much of each product to buy (positive) or sell (negative) 50 | when buying or selling the spread. 51 | episode_length (int): number of steps to play the game for 52 | trading_fee (float): penalty for trading 53 | time_fee (float): time fee 54 | history_length (int): number of historical states to stack in the 55 | observation vector. 
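        Example (illustrative, mirroring the usage in
        chapter4/market_making_example.py; `generator` stands for any
        tgym data generator such as CSVStreamer):

            environment = SpreadTrading(spread_coefficients=[1],
                                        data_generator=generator,
                                        trading_fee=0.2,
                                        time_fee=0,
                                        history_length=2)
            observation = environment.reset()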
56 | """ 57 | 58 | assert data_generator.n_products == len(spread_coefficients) 59 | assert history_length > 0 60 | self._data_generator = data_generator 61 | self._spread_coefficients = spread_coefficients 62 | self._first_render = True 63 | self._trading_fee = trading_fee 64 | self._time_fee = time_fee 65 | self._episode_length = episode_length 66 | self.n_actions = 3 67 | self._prices_history = [] 68 | self._history_length = history_length 69 | self.reset() 70 | 71 | def reset(self): 72 | """Reset the trading environment. Reset rewards, data generator... 73 | 74 | Returns: 75 | observation (numpy.array): observation of the state 76 | """ 77 | self._iteration = 0 78 | self._data_generator.rewind() 79 | self._total_reward = 0 80 | self._total_pnl = 0 81 | self._position = self._positions['flat'] 82 | self._entry_price = 0 83 | self._exit_price = 0 84 | self._closed_plot = False 85 | 86 | for i in range(self._history_length): 87 | self._prices_history.append(self._data_generator.next()) 88 | 89 | observation = self._get_observation() 90 | self.state_shape = observation.shape 91 | self._action = self._actions['hold'] 92 | return observation 93 | 94 | def step(self, action): 95 | """Take an action (buy/sell/hold) and computes the immediate reward. 96 | 97 | Args: 98 | action (numpy.array): Action to be taken, one-hot encoded. 99 | 100 | Returns: 101 | tuple: 102 | - observation (numpy.array): Agent's observation of the current environment. 103 | - reward (float) : Amount of reward returned after previous action. 104 | - done (bool): Whether the episode has ended, in which case further step() calls will return undefined results. 105 | - info (dict): Contains auxiliary diagnostic information (helpful for debugging, and sometimes learning). 106 | 107 | """ 108 | 109 | assert any([(action == x).all() for x in self._actions.values()]) 110 | self._action = action 111 | self._iteration += 1 112 | done = False 113 | instant_pnl = 0 114 | info = {} 115 | reward = -self._time_fee 116 | if all(action == self._actions['buy']): 117 | reward -= self._trading_fee 118 | if all(self._position == self._positions['flat']): 119 | self._position = self._positions['long'] 120 | self._entry_price = calc_spread( 121 | self._prices_history[-1], self._spread_coefficients)[1] # Ask 122 | elif all(self._position == self._positions['short']): 123 | self._exit_price = calc_spread( 124 | self._prices_history[-1], self._spread_coefficients)[1] # Ask 125 | instant_pnl = self._entry_price - self._exit_price 126 | self._position = self._positions['flat'] 127 | self._entry_price = 0 128 | elif all(action == self._actions['sell']): 129 | reward -= self._trading_fee 130 | if all(self._position == self._positions['flat']): 131 | self._position = self._positions['short'] 132 | self._entry_price = calc_spread( 133 | self._prices_history[-1], self._spread_coefficients)[0] # Bid 134 | elif all(self._position == self._positions['long']): 135 | self._exit_price = calc_spread( 136 | self._prices_history[-1], self._spread_coefficients)[0] # Bid 137 | instant_pnl = self._exit_price - self._entry_price 138 | self._position = self._positions['flat'] 139 | self._entry_price = 0 140 | 141 | reward += instant_pnl 142 | self._total_pnl += instant_pnl 143 | self._total_reward += reward 144 | 145 | # Game over logic 146 | try: 147 | self._prices_history.append(self._data_generator.next()) 148 | except StopIteration: 149 | done = True 150 | info['status'] = 'No more data.' 
151 | if self._iteration >= self._episode_length: 152 | done = True 153 | info['status'] = 'Time out.' 154 | if self._closed_plot: 155 | info['status'] = 'Closed plot' 156 | 157 | observation = self._get_observation() 158 | return observation, reward, done, info 159 | 160 | def _handle_close(self, evt): 161 | self._closed_plot = True 162 | 163 | def render(self, savefig=False, filename='myfig'): 164 | """Matlplotlib rendering of each step. 165 | 166 | Args: 167 | savefig (bool): Whether to save the figure as an image or not. 168 | filename (str): Name of the image file. 169 | """ 170 | if self._first_render: 171 | self._f, self._ax = plt.subplots( 172 | len(self._spread_coefficients) + int(len(self._spread_coefficients) > 1), 173 | sharex=True 174 | ) 175 | if len(self._spread_coefficients) == 1: 176 | self._ax = [self._ax] 177 | self._f.set_size_inches(12, 6) 178 | self._first_render = False 179 | self._f.canvas.mpl_connect('close_event', self._handle_close) 180 | if len(self._spread_coefficients) > 1: 181 | # TODO: To be checked 182 | for prod_i in range(len(self._spread_coefficients)): 183 | bid = self._prices_history[-1][2 * prod_i] 184 | ask = self._prices_history[-1][2 * prod_i + 1] 185 | self._ax[prod_i].plot([self._iteration, self._iteration + 1], 186 | [bid, bid], color='white') 187 | self._ax[prod_i].plot([self._iteration, self._iteration + 1], 188 | [ask, ask], color='white') 189 | self._ax[prod_i].set_title('Product {} (spread coef {})'.format( 190 | prod_i, str(self._spread_coefficients[prod_i]))) 191 | 192 | # Spread price 193 | prices = self._prices_history[-1] 194 | bid, ask = calc_spread(prices, self._spread_coefficients) 195 | self._ax[-1].plot([self._iteration, self._iteration + 1], 196 | [bid, bid], color='white') 197 | self._ax[-1].plot([self._iteration, self._iteration + 1], 198 | [ask, ask], color='white') 199 | ymin, ymax = self._ax[-1].get_ylim() 200 | yrange = ymax - ymin 201 | if (self._action == self._actions['sell']).all(): 202 | self._ax[-1].scatter(self._iteration + 0.5, bid + 0.03 * 203 | yrange, color='orangered', marker='v') 204 | elif (self._action == self._actions['buy']).all(): 205 | self._ax[-1].scatter(self._iteration + 0.5, ask - 0.03 * 206 | yrange, color='lawngreen', marker='^') 207 | plt.suptitle('Cumulated Reward: ' + "%.2f" % self._total_reward + ' ~ ' + 208 | 'Cumulated PnL: ' + "%.2f" % self._total_pnl + ' ~ ' + 209 | 'Position: ' + ['flat', 'long', 'short'][list(self._position).index(1)] + ' ~ ' + 210 | 'Entry Price: ' + "%.2f" % self._entry_price) 211 | self._f.tight_layout() 212 | plt.xticks(range(self._iteration)[::5]) 213 | plt.xlim([max(0, self._iteration - 80.5), self._iteration + 0.5]) 214 | plt.subplots_adjust(top=0.85) 215 | plt.pause(0.01) 216 | if savefig: 217 | plt.savefig(filename) 218 | 219 | def _get_observation(self): 220 | """Concatenate all necessary elements to create the observation. 221 | 222 | Returns: 223 | numpy.array: observation array. 224 | """ 225 | return np.concatenate( 226 | [prices for prices in self._prices_history[-self._history_length:]] + 227 | [ 228 | np.array([self._entry_price]), 229 | np.array(self._position) 230 | ] 231 | ) 232 | 233 | @staticmethod 234 | def random_action_fun(): 235 | """The default random action for exploration. 236 | We hold 80% of the time and buy or sell 10% of the time each. 237 | 238 | Returns: 239 | numpy.array: array with a 1 on the action index, 0 elsewhere. 
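        Example (illustrative draw): np.random.multinomial(1, [0.8, 0.1, 0.1])
        may return array([1, 0, 0]), which corresponds to the 'hold' action
        defined in `_actions` above.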
240 | """ 241 | return np.random.multinomial(1, [0.8, 0.1, 0.1]) 242 | -------------------------------------------------------------------------------- /algorithms/trading.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/algorithms/trading.pyc -------------------------------------------------------------------------------- /chapter1/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter1/__init__.py -------------------------------------------------------------------------------- /chapter1/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter1/__init__.pyc -------------------------------------------------------------------------------- /chapter1/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter1/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /chapter1/__pycache__/open_ai_gym_example.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter1/__pycache__/open_ai_gym_example.cpython-36.pyc -------------------------------------------------------------------------------- /chapter1/open_ai_gym_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Jan 28 23:18:17 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | import gym 10 | 11 | def cartpole(): 12 | environment = gym.make('CartPole-v1') 13 | environment.reset() 14 | for _ in range(1000): 15 | environment.render() 16 | action = environment.action_space.sample() 17 | observation, reward, done, info = environment.step(action) 18 | print("Step {}:".format(_)) 19 | print("action: {}".format(action)) 20 | print("observation: {}".format(observation)) 21 | print("reward: {}".format(reward)) 22 | print("done: {}".format(done)) 23 | print("info: {}".format(info)) 24 | 25 | if __name__ == '__main__': 26 | 27 | cartpole() -------------------------------------------------------------------------------- /chapter2/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter2/.DS_Store -------------------------------------------------------------------------------- /chapter2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter2/__init__.py -------------------------------------------------------------------------------- /chapter2/__init__.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter2/__init__.pyc -------------------------------------------------------------------------------- /chapter2/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter2/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /chapter2/__pycache__/super_mario_example.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter2/__pycache__/super_mario_example.cpython-36.pyc -------------------------------------------------------------------------------- /chapter2/cart_pole_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Feb 20 13:50:58 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | import gym, numpy as np, matplotlib.pyplot as plt 10 | from neural_networks.policy_gradient_utilities import PolicyGradient 11 | 12 | #Parameters 13 | n_units = 5 14 | gamma = .99 15 | batch_size = 50 16 | learning_rate = 1e-3 17 | n_episodes = 10000 18 | render = False 19 | goal = 190 20 | n_layers = 2 21 | n_classes = 2 22 | environment = gym.make('CartPole-v1') 23 | environment_dimension = len(environment.reset()) 24 | 25 | def calculate_discounted_reward(reward, gamma=gamma): 26 | output = [reward[i] * gamma**i for i in range(0, len(reward))] 27 | return output[::-1] 28 | 29 | def score_model(model, n_tests, render=render): 30 | scores = [] 31 | for _ in range(n_tests): 32 | environment.reset() 33 | observation = environment.reset() 34 | reward_sum = 0 35 | while True: 36 | if render: 37 | environment.render() 38 | 39 | state = np.reshape(observation, [1, environment_dimension]) 40 | predict = model.predict([state])[0] 41 | action = np.argmax(predict) 42 | observation, reward, done, _ = environment.step(action) 43 | reward_sum += reward 44 | if done: 45 | break 46 | scores.append(reward_sum) 47 | 48 | environment.close() 49 | return np.mean(scores) 50 | 51 | def cart_pole_game(environment, policy_model, model_predictions): 52 | loss = [] 53 | n_episode, reward_sum, score, episode_done = 0, 0, 0, False 54 | n_actions = environment.action_space.n 55 | observation = environment.reset() 56 | 57 | states = np.empty(0).reshape(0, environment_dimension) 58 | actions = np.empty(0).reshape(0, 1) 59 | rewards = np.empty(0).reshape(0, 1) 60 | discounted_rewards = np.empty(0).reshape(0, 1) 61 | 62 | while n_episode < n_episodes: 63 | 64 | state = np.reshape(observation, [1, environment_dimension]) 65 | prediction = model_predictions.predict([state])[0] 66 | action = np.random.choice(range(environment.action_space.n), p=prediction) 67 | states = np.vstack([states, state]) 68 | actions = np.vstack([actions, action]) 69 | 70 | observation, reward, episode_done, info = environment.step(action) 71 | reward_sum += reward 72 | rewards = np.vstack([rewards, reward]) 73 | 74 | if episode_done == True: 75 | 76 | discounted_reward = calculate_discounted_reward(rewards) 77 | 
discounted_rewards = np.vstack([discounted_rewards, discounted_reward]) 78 | rewards = np.empty(0).reshape(0, 1) 79 | 80 | if (n_episode + 1) % batch_size == 0: 81 | 82 | discounted_rewards -= discounted_rewards.mean() 83 | discounted_rewards /= discounted_rewards.std() 84 | discounted_rewards = discounted_rewards.squeeze() 85 | actions = actions.squeeze().astype(int) 86 | 87 | train_actions = np.zeros([len(actions), n_actions]) 88 | train_actions[np.arange(len(actions)), actions] = 1 89 | 90 | error = policy_model.train_on_batch([states, discounted_rewards], train_actions) 91 | loss.append(error) 92 | 93 | states = np.empty(0).reshape(0, environment_dimension) 94 | actions = np.empty(0).reshape(0, 1) 95 | discounted_rewards = np.empty(0).reshape(0, 1) 96 | 97 | score = score_model(model=model_predictions, n_tests=10) 98 | 99 | print('''\nEpisode: %s \nAverage Reward: %s \nScore: %s \nError: %s''' 100 | )%(n_episode+1, reward_sum/float(batch_size), score, np.mean(loss[-batch_size:])) 101 | 102 | if score >= goal: 103 | break 104 | 105 | reward_sum = 0 106 | 107 | n_episode += 1 108 | observation = environment.reset() 109 | 110 | plt.title('Policy Gradient Error plot over %s Episodes'%(n_episode+1)) 111 | plt.xlabel('N batches') 112 | plt.ylabel('Error Rate') 113 | plt.plot(loss) 114 | plt.show() 115 | 116 | if __name__ == '__main__': 117 | 118 | 119 | mlp_model = PolicyGradient(n_units=n_units, 120 | n_layers=n_layers, 121 | n_columns=environment_dimension, 122 | n_outputs=n_classes, 123 | learning_rate=learning_rate, 124 | hidden_activation='selu', 125 | output_activation='softmax', 126 | loss_function='log_likelihood') 127 | 128 | policy_model, model_predictions = mlp_model.create_policy_model(input_shape=(environment_dimension, )) 129 | 130 | policy_model.summary() 131 | 132 | cart_pole_game(environment=environment, 133 | policy_model=policy_model, 134 | model_predictions=model_predictions) 135 | -------------------------------------------------------------------------------- /chapter2/cart_pole_example.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter2/cart_pole_example.pyc -------------------------------------------------------------------------------- /chapter2/super_mario_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Mar 10 21:00:57 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | import numpy as np 10 | from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv 11 | import gym_super_mario_bros 12 | from gym_super_mario_bros.actions import SIMPLE_MOVEMENT 13 | from algorithms.actor_critic_utilities import train_model 14 | from neural_networks.models import ActorCriticModel 15 | 16 | #Parameters 17 | environment = gym_super_mario_bros.make('SuperMarioBros-v0') 18 | environment = BinarySpaceToDiscreteSpaceEnv(environment, SIMPLE_MOVEMENT) 19 | observation = environment.reset() 20 | learning_rate = 1e-4 21 | gamma = 0.96 22 | epsilon = 0.9 23 | n_episodes = 10000 24 | n_steps = 2048 25 | max_steps = int(1e7) 26 | _lambda = 0.95 27 | value_coefficient = 0.5 28 | entropy_coefficient = 0.01 29 | max_grad_norm = 0.5 30 | log_interval = 10 31 | 32 | def play_super_mario(model, environment=environment): 33 | 34 | observations = environment.reset() 35 | score, n_step, done = 0, 0, False 36 | scores = 
[] 37 | 38 | for _ in range(100): 39 | 40 | while done: 41 | 42 | actions, values = model.step(observations) 43 | observations, rewards, done, info = environment.step(actions) 44 | score += rewards 45 | environment.render() 46 | n_step += 1 47 | scores.append(score) 48 | 49 | print('Step: %s \nScore: %s '%(n_step, score)) 50 | environment.reset() 51 | 52 | print(np.mean(scores)) 53 | 54 | if __name__ == '__main__': 55 | 56 | model = train_model(policy_model=ActorCriticModel, 57 | environment=environment, 58 | n_steps=n_steps, 59 | max_steps=max_steps, 60 | gamma=gamma, 61 | _lambda=_lambda, 62 | value_coefficient=value_coefficient, 63 | entropy_coefficient=entropy_coefficient, 64 | learning_rate=learning_rate, 65 | max_grad_norm=max_grad_norm, 66 | log_interval=log_interval) 67 | 68 | play_super_mario(model=model, 69 | environment=environment) -------------------------------------------------------------------------------- /chapter3/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter3/__init__.py -------------------------------------------------------------------------------- /chapter3/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter3/__init__.pyc -------------------------------------------------------------------------------- /chapter3/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter3/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /chapter3/__pycache__/doom_example.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter3/__pycache__/doom_example.cpython-36.pyc -------------------------------------------------------------------------------- /chapter3/__pycache__/frozen_lake_example.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter3/__pycache__/frozen_lake_example.cpython-36.pyc -------------------------------------------------------------------------------- /chapter3/basic.cfg: -------------------------------------------------------------------------------- 1 | doom_scenario_path = basic.wad 2 | doom_map = map01 3 | 4 | # Rewards 5 | living_reward = -1 6 | 7 | # Rendering options 8 | screen_resolution = RES_160X120 9 | screen_format = GRAY8 10 | render_hud = True 11 | render_crosshair = false 12 | render_weapon = true 13 | render_decals = true 14 | render_particles = true 15 | window_visible = true 16 | 17 | # make episodes start after 20 tics (after unholstering the gun) 18 | episode_start_time = 14 19 | 20 | # make episodes finish after 300 actions (tics) 21 | episode_timeout = 300 22 | 23 | # Available buttons 24 | available_buttons = 25 | { 26 | MOVE_LEFT 27 | MOVE_RIGHT 28 | ATTACK 29 | } 30 | 31 | # Game variables that will be in the 
state 32 | available_game_variables = { AMMO2} 33 | 34 | mode = PLAYER 35 | doom_skill = 5 36 | -------------------------------------------------------------------------------- /chapter3/basic.wad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter3/basic.wad -------------------------------------------------------------------------------- /chapter3/doom_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Mar 18 10:50:31 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | import warnings, random, time, tensorflow as tf, numpy as np, matplotlib.pyplot as plt 10 | from neural_networks.models import DeepQNetwork 11 | from algorithms.dql_utilities import create_environment, stack_frames, Memory 12 | from chapter3.frozen_lake_example import exploit_explore 13 | from collections import deque 14 | 15 | #Parameters 16 | stack_size = 4 17 | gamma = 0.95 18 | memory_size = int(1e7) 19 | train = True 20 | episode_render = False 21 | n_units = 500 22 | n_classes = 3 23 | learning_rate = 2e-4 24 | stride = 4 25 | kernel = 8 26 | n_filters = 3 27 | n_episodes = 1 28 | max_steps = 100 29 | batch_size = 64 30 | environment, possible_actions = create_environment() 31 | state_size = [84, 84, 4] 32 | action_size = 3 #environment.get_avaiable_buttons_size() 33 | explore_start = 1.0 34 | explore_stop = 0.01 35 | decay_rate = 1e-4 36 | pretrain_length = batch_size 37 | warnings.filterwarnings('ignore') 38 | #writer = tf.summary.FileWriter("/tensorboard/dqn/1") 39 | write_op = tf.summary.merge_all() 40 | 41 | def exploit_explore(session, model, explore_start, explore_stop, decay_rate, decay_step, state, actions): 42 | exp_exp_tradeoff = np.random.rand() 43 | explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step) 44 | 45 | if (explore_probability > exp_exp_tradeoff): 46 | action = random.choice(possible_actions) 47 | 48 | else: 49 | Qs = session.run(model.output, feed_dict = {model.input_matrix: state.reshape((1, *state.shape))}) 50 | choice = np.argmax(Qs) 51 | action = possible_actions[int(choice)] 52 | 53 | return action, explore_probability 54 | 55 | def train_model(model, environment): 56 | tf.summary.scalar('Loss', model.error_rate) 57 | saver = tf.train.Saver() 58 | stacked_frames = deque([np.zeros((84,84), dtype=np.int) for i in range(stack_size)], maxlen=4) 59 | memory = Memory(max_size=memory_size) 60 | scores = [] 61 | 62 | with tf.Session() as sess: 63 | sess.run(tf.global_variables_initializer()) 64 | decay_step = 0 65 | environment.init() 66 | 67 | for episode in range(n_episodes): 68 | step, reward_sum = 0, [] 69 | environment.new_episode() 70 | state = environment.get_state().screen_buffer 71 | state, stacked_frames = stack_frames(stacked_frames, state, True) 72 | 73 | while step < max_steps: 74 | step += 1; decay_step += 1 75 | 76 | action, explore_probability = exploit_explore(session=sess, 77 | model=model, 78 | explore_start=explore_start, 79 | explore_stop=explore_stop, 80 | decay_rate=decay_rate, 81 | decay_step=decay_step, 82 | state=state, 83 | actions=possible_actions) 84 | 85 | reward = environment.make_action(action) 86 | done = environment.is_episode_finished() 87 | reward_sum.append(reward) 88 | 89 | if done: 90 | 91 | next_state = np.zeros((84,84), 
dtype=np.int) 92 | 93 | next_state, stacked_frames = stack_frames(stacked_frames=stacked_frames, 94 | state=next_state, 95 | new_episode=False) 96 | step = max_steps 97 | 98 | total_reward = np.sum(reward_sum) 99 | 100 | scores.append(total_reward) 101 | 102 | 103 | print('Episode: {}'.format(episode), 104 | 'Total reward: {}'.format(total_reward), 105 | 'Explore P: {:.4f}'.format(explore_probability)) 106 | 107 | memory.add((state, action, reward, next_state, done)) 108 | 109 | else: 110 | next_state = environment.get_state().screen_buffer 111 | next_state, stacked_frames = stack_frames(stacked_frames, next_state, False) 112 | memory.add((state, action, reward, next_state, done)) 113 | state = next_state 114 | 115 | 116 | batch = memory.sample(batch_size) 117 | states = np.array([each[0] for each in batch], ndmin=3) 118 | actions = np.array([each[1] for each in batch]) 119 | rewards = np.array([each[2] for each in batch]) 120 | next_states = np.array([each[3] for each in batch], ndmin=3) 121 | dones = np.array([each[4] for each in batch]) 122 | 123 | target_Qs_batch = [] 124 | 125 | Qs_next_state = sess.run(model.predicted_Q, feed_dict={model.input_matrix: next_states, model.actions: actions}) 126 | 127 | for i in range(0, len(batch)): 128 | terminal = dones[i] 129 | 130 | if terminal: 131 | target_Qs_batch.append(rewards[i]) 132 | 133 | else: 134 | target = rewards[i] + gamma * np.max(Qs_next_state[i]) 135 | target_Qs_batch.append(target) 136 | 137 | 138 | targets = np.array([each for each in target_Qs_batch]) 139 | 140 | error_rate, _ = sess.run([model.error_rate, model.optimizer], 141 | feed_dict={model.input_matrix: states, 142 | model.target_Q: targets, 143 | model.actions: actions}) 144 | ''' 145 | # Write TF Summaries 146 | summary = sess.run(write_op, feed_dict={model.inputs_: states, 147 | model.target_Q: targets, 148 | model.actions_: actions}) 149 | 150 | writer.add_summary(summary, episode) 151 | writer.flush() 152 | 153 | 154 | if episode % 5 == 0: 155 | #saver.save(sess, filepath+'/models/model.ckpt') 156 | #print("Model Saved") 157 | ''' 158 | 159 | plt.plot(scores) 160 | plt.title('DQN Performance During Training') 161 | plt.xlabel('N Episodes') 162 | plt.ylabel('Score Value') 163 | plt.show() 164 | plt.waitforbuttonpress() 165 | plt.close() 166 | return model 167 | 168 | 169 | def play_doom(model, environment): 170 | 171 | stacked_frames = deque([np.zeros((84,84), dtype=np.int) for i in range(stack_size)], maxlen=4) 172 | scores = [] 173 | 174 | with tf.Session() as sess: 175 | 176 | sess.run(tf.global_variables_initializer()) 177 | totalScore = 0 178 | 179 | for _ in range(100): 180 | 181 | done = False 182 | environment.new_episode() 183 | 184 | state = environment.get_state().screen_buffer 185 | state, stacked_frames = stack_frames(stacked_frames, state, True) 186 | 187 | while not environment.is_episode_finished(): 188 | 189 | Q_matrix = sess.run(model.output, feed_dict = {model.input_matrix: state.reshape((1, *state.shape))}) 190 | choice = np.argmax(Q_matrix) 191 | action = possible_actions[int(choice)] 192 | 193 | environment.make_action(action) 194 | done = environment.is_episode_finished() 195 | score = environment.get_total_reward() 196 | scores.append(score) 197 | time.sleep(0.01) 198 | 199 | if done: 200 | break 201 | 202 | score = environment.get_total_reward() 203 | print("Score: ", score) 204 | 205 | environment.close() 206 | 207 | plt.plot(scores) 208 | plt.title('DQN Performance After Training') 209 | plt.xlabel('N Episodes') 210 | plt.ylabel('Score 
Value') 211 | plt.show() 212 | plt.waitforbuttonpress() 213 | plt.close() 214 | 215 | if __name__ == '__main__': 216 | 217 | 218 | model = DeepQNetwork(n_units=n_units, 219 | n_classes=n_classes, 220 | n_filters=n_filters, 221 | stride=stride, 222 | kernel=kernel, 223 | state_size=state_size, 224 | action_size=action_size, 225 | learning_rate=learning_rate) 226 | 227 | trained_model = train_model(model=model, 228 | environment=environment) 229 | 230 | play_doom(model=trained_model, 231 | environment=environment) -------------------------------------------------------------------------------- /chapter3/frozen_lake_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Mar 13 00:58:25 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | import os, time, gym, numpy as np 10 | 11 | #Parameters 12 | learning_rate = 1e-2 13 | gamma = 0.96 14 | epsilon = 0.9 15 | n_episodes = 10000 16 | max_steps = 100 17 | environment = gym.make('FrozenLake-v0') 18 | Q_matrix = np.zeros((environment.observation_space.n, environment.action_space.n)) 19 | 20 | def choose_action(state): 21 | ''' 22 | To be used after Q table has been updated, returns an action 23 | 24 | Parameters: 25 | 26 | state - int - the current state of the agent 27 | 28 | :return: int 29 | ''' 30 | return np.argmax(Q_matrix[state, :]) 31 | 32 | def exploit_explore(prior_state, epsilon=epsilon, Q_matrix=Q_matrix): 33 | ''' 34 | One half of the exploit-explore paradigm that we will utilize 35 | 36 | Parameters 37 | 38 | prior_state - int - the prior state of the environment at a given iteration 39 | epsilon - float - parameter that we use to determine whether we will try a new or current best action 40 | 41 | :return: int 42 | ''' 43 | 44 | if np.random.uniform(0, 1) < epsilon: 45 | return environment.action_space.sample() 46 | else: 47 | return np.argmax(Q_matrix[prior_state, :]) 48 | 49 | 50 | def update_q_matrix(prior_state, observation , reward, action): 51 | ''' 52 | Algorithm that updates the values in the Q table to reflect knowledge acquired by the agent 53 | 54 | Parameters 55 | 56 | prior_state - int - the prior state of the environment before the current timestemp 57 | observation - int - the current state of the environment 58 | reward - int - the reward yielded from the environment after an action 59 | action - int - the action suggested by the epsilon greedy algorithm 60 | 61 | :return: None 62 | ''' 63 | 64 | prediction = Q_matrix[prior_state, action] 65 | actual_label = reward + gamma * np.max(Q_matrix[observation, :]) 66 | Q_matrix[prior_state, action] = Q_matrix[prior_state, action] + learning_rate*(actual_label - prediction) 67 | 68 | 69 | def populate_q_matrix(render=False, n_episodes=n_episodes): 70 | ''' 71 | Directly implementing Q Learning (Greedy Epsilon) on the Frozen Lake Game 72 | This function populations the empty Q matrix 73 | Parameters 74 | 75 | prior_state - int - the prior state of the environment before the current timestemp 76 | observation - int - the current state of the environment 77 | reward - int - the reward yielded from the environment after an action 78 | action - int - the action suggested by the epsilon greedy algorithm 79 | 80 | :return: None 81 | ''' 82 | 83 | for episode in range(n_episodes): 84 | prior_state = environment.reset() 85 | _ = 0 86 | 87 | while _ < max_steps: 88 | 89 | if render == True: environment.render() 90 | action = exploit_explore(prior_state) 91 | 
observation, reward, done, info = environment.step(action) 92 | 93 | update_q_matrix(prior_state=prior_state, 94 | observation=observation, 95 | reward=reward, 96 | action=action) 97 | 98 | prior_state = observation 99 | _ += 1 100 | 101 | if done: 102 | break 103 | 104 | 105 | def play_frozen_lake(n_episodes): 106 | 107 | ''' 108 | Directly implementing Q Learning (Greedy Epsilon) on the Frozen Lake Game 109 | This function uses the already populated Q Matrix and displays the game being used 110 | 111 | Parameters 112 | 113 | prior_state - int - the prior state of the environment before the current timestemp 114 | observation - int - the current state of the environment 115 | reward - int - the reward yielded from the environment after an action 116 | action - int - the action suggested by the epsilon greedy algorithm 117 | 118 | :return: None 119 | ''' 120 | 121 | for episode in range(n_episodes): 122 | print('Episode: %s'%episode+1) 123 | prior_state = environment.reset() 124 | done = False 125 | 126 | while not done: 127 | environment.render() 128 | action = choose_action(prior_state) 129 | observation, reward, done, info = environment.step(action) 130 | prior_state = observation 131 | if reward == 0: 132 | time.sleep(0.5) 133 | else: 134 | print('You have won on episode %s!'%(episode+1)) 135 | time.sleep(5) 136 | os.system('clear') 137 | 138 | if done and reward == -1: 139 | print('You have lost this episode... :-/') 140 | time.sleep(5) 141 | os.system('clear') 142 | break 143 | 144 | 145 | 146 | if __name__ == '__main__': 147 | 148 | 149 | populate_q_matrix(render=False) 150 | play_frozen_lake(n_episodes=10) 151 | -------------------------------------------------------------------------------- /chapter4/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter4/.DS_Store -------------------------------------------------------------------------------- /chapter4/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter4/__init__.py -------------------------------------------------------------------------------- /chapter4/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter4/__init__.pyc -------------------------------------------------------------------------------- /chapter4/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter4/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /chapter4/__pycache__/market_making_example.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter4/__pycache__/market_making_example.cpython-36.pyc -------------------------------------------------------------------------------- /chapter4/market_making_example.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Mar 25 15:00:05 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | import random, tensorflow as tf, numpy as np, matplotlib.pyplot as plt 10 | from tgym.envs import SpreadTrading 11 | from tgym.gens.deterministic import WavySignal 12 | from neural_networks.market_making_models import DeepQNetworkMM, Memory 13 | from chapter2.cart_pole_example import calculate_discounted_reward 14 | from neural_networks.policy_gradient_utilities import PolicyGradient 15 | from tgym.gens.csvstream import CSVStreamer 16 | 17 | #Parameters 18 | np.random.seed(2018) 19 | n_episodes = 1 20 | trading_fee = .2 21 | time_fee = 0 22 | history_length = 2 23 | memory_size = 2000 24 | gamma = 0.96 25 | epsilon_min = 0.01 26 | batch_size = 64 27 | action_size = len(SpreadTrading._actions) 28 | learning_rate = 1e-2 29 | n_layers = 4 30 | n_units = 500 31 | n_classes = 3 32 | goal = 190 33 | max_steps = 1000 34 | explore_start = 1.0 35 | explore_stop = 0.01 36 | decay_rate = 1e-4 37 | _lambda = 0.95 38 | value_coefficient = 0.5 39 | entropy_coefficient = 0.01 40 | max_grad_norm = 0.5 41 | log_interval = 10 42 | hold = np.array([1, 0, 0]) 43 | buy = np.array([0, 1, 0]) 44 | sell = np.array([0, 0, 1]) 45 | possible_actions = [hold, buy, sell] 46 | 47 | #Classes and variables 48 | generator = CSVStreamer(filename='/Users/tawehbeysolow/Downloads/amazon_order_book_data2.csv') 49 | #generator = WavySignal(period_1=25, period_2=50, epsilon=-0.5) 50 | 51 | memory = Memory(max_size=memory_size) 52 | 53 | environment = SpreadTrading(spread_coefficients=[1], 54 | data_generator=generator, 55 | trading_fee=trading_fee, 56 | time_fee=time_fee, 57 | history_length=history_length) 58 | 59 | state_size = len(environment.reset()) 60 | 61 | 62 | def baseline_model(n_actions, info, random=False): 63 | 64 | if random == True: 65 | action = np.random.choice(range(n_actions), p=np.repeat(1/float(n_actions), 3)) 66 | action = possible_actions[action] 67 | 68 | else: 69 | 70 | if len(info) == 0: 71 | action = np.random.choice(range(n_actions), p=np.repeat(1/float(n_actions), 3)) 72 | action = possible_actions[action] 73 | 74 | elif info['action'] == 'sell': 75 | action = buy 76 | 77 | else: 78 | action = sell 79 | 80 | return action 81 | 82 | 83 | def score_model(model, n_tests): 84 | scores = [] 85 | for _ in range(n_tests): 86 | environment.reset() 87 | observation = environment.reset() 88 | reward_sum = 0 89 | while True: 90 | '' 91 | #environment.render() 92 | 93 | predict = model.predict([observation.reshape(1, 8)])[0] 94 | action = possible_actions[np.argmax(predict)] 95 | observation, reward, done, _ = environment.step(action) 96 | reward_sum += reward 97 | if done: 98 | break 99 | scores.append(reward_sum) 100 | 101 | return np.mean(scores) 102 | 103 | 104 | def exploit_explore(session, model, explore_start, explore_stop, decay_rate, decay_step, state, actions): 105 | exp_exp_tradeoff = np.random.rand() 106 | explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step) 107 | 108 | if (explore_probability > exp_exp_tradeoff): 109 | action = random.choice(possible_actions) 110 | 111 | else: 112 | Qs = session.run(model.output_layer, feed_dict = {model.input_matrix: state.reshape((1, 8))}) 113 | choice = np.argmax(Qs) 114 | action = possible_actions[int(choice)] 115 | 116 | return action, explore_probability 117 | 118 | 119 | def 
train_model(environment, dql=None, pg=None, baseline=None): 120 | scores = [] 121 | done = False 122 | error_rate, step = 0, 0 123 | info = {} 124 | n_episode, reward_sum, score, episode_done = 0, 0, 0, False 125 | n_actions = len(SpreadTrading._actions) 126 | observation = environment.reset() 127 | states = np.empty(0).reshape(0, state_size) 128 | actions = np.empty(0).reshape(0, len(SpreadTrading._actions)) 129 | rewards = np.empty(0).reshape(0, 1) 130 | discounted_rewards = np.empty(0).reshape(0, 1) 131 | observation = environment.reset() 132 | 133 | if baseline == True: 134 | 135 | 136 | for episode in range(n_episodes): 137 | 138 | for _ in range(100): 139 | action = baseline_model(n_actions=n_actions, 140 | info=info) 141 | 142 | state, reward, done, info = environment.step(action) 143 | reward_sum += reward 144 | 145 | next_state = np.zeros((state_size,), dtype=np.int) 146 | step = max_steps 147 | scores.append(reward_sum) 148 | memory.add((state, action, reward, next_state, done)) 149 | 150 | print('Episode: {}'.format(episode), 151 | 'Total reward: {}'.format(reward_sum)) 152 | 153 | reward_sum = 0 154 | 155 | environment.reset() 156 | 157 | print(np.mean(scores)) 158 | plt.hist(scores) 159 | plt.xlabel('Distribution of Scores') 160 | plt.ylabel('Relative Frequency') 161 | plt.show() 162 | plt.waitforbuttonpress() 163 | plt.close() 164 | 165 | 166 | elif dql == True: 167 | 168 | loss = [] 169 | 170 | model = DeepQNetworkMM(n_units=n_units, 171 | n_classes=n_classes, 172 | state_size=state_size, 173 | action_size=action_size, 174 | learning_rate=learning_rate) 175 | 176 | #tf.summary.scalar('Loss', model.error_rate) 177 | 178 | 179 | with tf.Session() as sess: 180 | 181 | sess.run(tf.global_variables_initializer()) 182 | decay_step = 0 183 | 184 | for episode in range(n_episodes): 185 | 186 | current_step, reward_sum = 0, [] 187 | state = np.reshape(observation, [1, state_size]) 188 | 189 | while current_step < max_steps: 190 | 191 | current_step += 1; decay_step += 1 192 | 193 | action, explore_probability = exploit_explore(session=sess, 194 | model=model, 195 | explore_start=explore_start, 196 | explore_stop=explore_stop, 197 | decay_rate=decay_rate, 198 | decay_step=decay_step, 199 | state=state, 200 | actions=possible_actions) 201 | 202 | state, reward, done, info = environment.step(action) 203 | reward_sum.append(reward) 204 | 205 | if current_step >= max_steps: 206 | done = True 207 | 208 | if done == True: 209 | 210 | next_state = np.zeros((state_size,), dtype=np.int) 211 | step = max_steps 212 | total_reward = np.sum(reward_sum) 213 | scores.append(total_reward) 214 | memory.add((state, action, reward, next_state, done)) 215 | 216 | print('Episode: {}'.format(episode), 217 | 'Total reward: {}'.format(total_reward), 218 | 'Loss: {}'.format(error_rate), 219 | 'Explore P: {:.4f}'.format(explore_probability)) 220 | 221 | loss.append(error_rate) 222 | 223 | elif done != True: 224 | 225 | next_state = environment.reset() 226 | state = next_state 227 | memory.add((state, action, reward, next_state, done)) 228 | 229 | batch = memory.sample(batch_size) 230 | states = np.array([each[0] for each in batch]) 231 | actions = np.array([each[1] for each in batch]) 232 | rewards = np.array([each[2] for each in batch]) 233 | next_states = np.array([each[3] for each in batch]) 234 | dones = np.array([each[4] for each in batch]) 235 | 236 | target_Qs_batch = [] 237 | 238 | Qs_next_state = sess.run(model.predicted_Q, feed_dict={model.input_matrix: next_states, model.actions: actions}) 239 | 240 
| for i in range(0, len(batch)): 241 | terminal = dones[i] 242 | 243 | if terminal: 244 | target_Qs_batch.append(rewards[i]) 245 | 246 | else: 247 | target = rewards[i] + gamma * np.max(Qs_next_state[i]) 248 | target_Qs_batch.append(target) 249 | 250 | 251 | targets = np.array([each for each in target_Qs_batch]) 252 | 253 | error_rate, _ = sess.run([model.error_rate, model.optimizer], 254 | feed_dict={model.input_matrix: states, 255 | model.target_Q: targets, 256 | model.actions: actions}) 257 | if episode == n_episodes - 1: 258 | 259 | market_making(model=model, 260 | environment=environment, 261 | sess=sess, 262 | state=state, 263 | dpl=True) 264 | 265 | elif pg == True: 266 | 267 | loss = [] 268 | 269 | mlp_model = PolicyGradient(n_units=n_units, 270 | n_layers=n_layers, 271 | n_columns=8, 272 | n_outputs=n_classes, 273 | learning_rate=learning_rate, 274 | hidden_activation='selu', 275 | output_activation='softmax', 276 | loss_function='categorical_crossentropy') 277 | 278 | policy_model, model_predictions = mlp_model.create_policy_model(input_shape=(len(observation), )) 279 | 280 | policy_model.summary() 281 | 282 | while n_episode < n_episodes: 283 | 284 | state = observation.reshape(1, 8) 285 | prediction = model_predictions.predict([state])[0] 286 | action = np.random.choice(range(len(SpreadTrading._actions)), p=prediction) 287 | action = possible_actions[action] 288 | states = np.vstack([states, state]) 289 | actions = np.vstack([actions, action]) 290 | 291 | observation, reward, episode_done, info = environment.step(action) 292 | reward_sum += reward 293 | rewards = np.vstack([rewards, reward]) 294 | step += 1 295 | 296 | if step == max_steps: 297 | episode_done = True 298 | 299 | if episode_done == True: 300 | 301 | discounted_reward = calculate_discounted_reward(rewards, gamma=gamma) 302 | discounted_rewards = np.vstack([discounted_rewards, discounted_reward]) 303 | 304 | discounted_rewards -= discounted_rewards.mean() 305 | discounted_rewards /= discounted_rewards.std() 306 | discounted_rewards = discounted_rewards.squeeze() 307 | actions = actions.squeeze().astype(int) 308 | 309 | #train_actions = np.zeros([len(actions), n_actions]) 310 | #train_actions[np.arange(len(actions)), actions] = 1 311 | 312 | error = policy_model.train_on_batch([states, discounted_rewards], actions) 313 | loss.append(error) 314 | 315 | states = np.empty(0).reshape(0, 8) 316 | actions = np.empty(0).reshape(0, 3) 317 | rewards = np.empty(0).reshape(0, 1) 318 | discounted_rewards = np.empty(0).reshape(0, 1) 319 | 320 | score = score_model(model=model_predictions, n_tests=10) 321 | 322 | print('''\nEpisode: %s \nAverage Reward: %s \nScore: %s \nError: %s''' 323 | )%(n_episode+1, reward_sum/float(batch_size), score, np.mean(loss[-batch_size:])) 324 | 325 | if score >= goal: 326 | break 327 | 328 | reward_sum = 0 329 | 330 | n_episode += 1 331 | observation = environment.reset() 332 | 333 | if n_episode == n_episodes - 1: 334 | 335 | market_making(model=model_predictions, 336 | environment=environment, 337 | sess=None, 338 | state=state, 339 | pg=True) 340 | 341 | if baseline != True: 342 | 343 | plt.title('Policy Gradient Error plot over %s Episodes'%(n_episode+1)) 344 | plt.xlabel('N batches') 345 | plt.ylabel('Error Rate') 346 | plt.plot(loss) 347 | plt.show() 348 | plt.waitforbuttonpress() 349 | return model 350 | 351 | def market_making(model, environment, sess, state, dpl=None, pg=None): 352 | 353 | scores = [] 354 | total_reward = 0 355 | environment.reset() 356 | 357 | for _ in range(1000): 358 | 
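        # The evaluation below acts greedily (argmax over the network's Q-values),
        # with no epsilon-greedy exploration. For reference, the targets fitted in
        # train_model above follow the Q-learning backup
        #     target = r + gamma * max_a' Q(s', a')
        # e.g. with r = 0.3, gamma = 0.96 and max_a' Q(s', a') = 1.5
        # (illustrative numbers) the target is 0.3 + 0.96 * 1.5 = 1.74.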
359 | for __ in range(100): 360 | 361 | state = np.reshape(state, [1, state_size]) 362 | 363 | if dpl == True: 364 | Q_matrix = sess.run(model.output_layer, feed_dict = {model.input_matrix: state.reshape((1, 8))}) 365 | choice = np.argmax(Q_matrix) 366 | action = possible_actions[int(choice)] 367 | 368 | elif pg == True: 369 | state = np.reshape(state, [1, 8]) 370 | predict = model.predict([state])[0] 371 | action = np.argmax(predict) 372 | action = possible_actions[int(action)] 373 | 374 | state, reward, done, info = environment.step(action) 375 | total_reward += reward 376 | 377 | 378 | print('Episode: {}'.format(_), 379 | 'Total reward: {}'.format(total_reward)) 380 | scores.append(total_reward) 381 | state = environment.reset() 382 | 383 | print(np.mean(scores)) 384 | plt.hist(scores) 385 | plt.xlabel('Distribution of Scores') 386 | plt.ylabel('Relative Frequency') 387 | plt.show() 388 | plt.waitforbuttonpress() 389 | plt.close() 390 | 391 | 392 | if __name__ == '__main__': 393 | 394 | 395 | train_model(environment=environment, dql=True) 396 | 397 | 398 | 399 | 400 | 401 | 402 | -------------------------------------------------------------------------------- /chapter5/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter5/.DS_Store -------------------------------------------------------------------------------- /chapter5/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter5/__init__.py -------------------------------------------------------------------------------- /chapter5/create_environment.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri May 10 10:44:23 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | import cv2, gym, numpy as np 10 | from retro_contest.local import make 11 | from retro import make as make_retro 12 | from baselines.common.atari_wrappers import FrameStack 13 | 14 | cv2.ocl.setUseOpenCL(False) 15 | 16 | class PreprocessFrame(gym.ObservationWrapper): 17 | """ 18 | Grayscaling image from three dimensional RGB pixelated images 19 | - Set frame to gray 20 | - Resize the frame to 96x96x1 21 | """ 22 | def __init__(self, environment, width, height): 23 | gym.ObservationWrapper.__init__(self, environment) 24 | self.width = width 25 | self.height = height 26 | self.observation_space = gym.spaces.Box(low=0, 27 | high=255, 28 | shape=(self.height, self.width, 1), 29 | dtype=np.uint8) 30 | 31 | def observation(self, image): 32 | image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) 33 | image = cv2.resize(image, (self.width, self.height), interpolation=cv2.INTER_AREA) 34 | image = image[:, :, None] 35 | return image 36 | 37 | 38 | class ActionsDiscretizer(gym.ActionWrapper): 39 | """ 40 | Wrap a gym-retro environment and make it use discrete 41 | actions for the Sonic game. 
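    Example (illustrative): the combo ['LEFT', 'DOWN'] from the `actions` list
    below becomes a 12-element boolean array with True only at the indices of
    'DOWN' (5) and 'LEFT' (6) in `buttons`, so each discrete action maps to one
    fixed button mask.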
42 | """ 43 | def __init__(self, env): 44 | super(ActionsDiscretizer, self).__init__(env) 45 | buttons = ["B", "A", "MODE", "START", "UP", "DOWN", "LEFT", "RIGHT", "C", "Y", "X", "Z"] 46 | actions = [['LEFT'], ['RIGHT'], ['LEFT', 'DOWN'], ['RIGHT', 'DOWN'], ['DOWN'], 47 | ['DOWN', 'B'], ['B']] 48 | self._actions = [] 49 | 50 | """ 51 | What we do in this loop: 52 | For each action in actions 53 | - Create an array of 12 False (12 = nb of buttons) 54 | For each button in action: (for instance ['LEFT']) we need to make that left button index = True 55 | - Then the button index = LEFT = True 56 | In fact at the end we will have an array where each array is an action and each elements True of this array 57 | are the buttons clicked. 58 | """ 59 | for action in actions: 60 | _actions = np.array([False] * len(buttons)) 61 | for button in action: 62 | _actions[buttons.index(button)] = True 63 | self._actions.append(_actions) 64 | self.action_space = gym.spaces.Discrete(len(self._actions)) 65 | 66 | def action(self, a): 67 | return self._actions[a].copy() 68 | 69 | class RewardScaler(gym.RewardWrapper): 70 | """ 71 | Bring rewards to a reasonable scale for PPO. 72 | This is incredibly important and effects performance 73 | drastically. 74 | """ 75 | def reward(self, reward): 76 | 77 | return reward * 0.01 78 | 79 | class AllowBacktracking(gym.Wrapper): 80 | """ 81 | Use deltas in max(X) as the reward, rather than deltas 82 | in X. This way, agents are not discouraged too heavily 83 | from exploring backwards if there is no way to advance 84 | head-on in the level. 85 | """ 86 | def __init__(self, environment): 87 | super(AllowBacktracking, self).__init__(environment) 88 | self.curent_reward = 0 89 | self.max_reward = 0 90 | 91 | def reset(self, **kwargs): 92 | self.current_reward = 0 93 | self.max_reward = 0 94 | return self.env.reset(**kwargs) 95 | 96 | def step(self, action): 97 | observation, reward, done, info = self.environment.step(action) 98 | self.current_reward += reward 99 | reward = max(0, self.current_reward - self.max_reward) 100 | self.max_reward = max(self.max_reward, self.current_reward) 101 | return observation, reward, done, info 102 | 103 | def wrap_environment(environment, n_frames=4): 104 | environment = ActionsDiscretizer(environment) 105 | environment = RewardScaler(environment) 106 | environment = PreprocessFrame(environment) 107 | environment = FrameStack(environment, n_frames) 108 | environment = AllowBacktracking(environment) 109 | return environment 110 | 111 | def create_new_environment(environment_index, n_frames=4): 112 | """ 113 | Create an environment with some standard wrappers. 
114 | """ 115 | 116 | dictionary = [ 117 | {'game': 'SonicTheHedgehog-Genesis', 'state': 'SpringYardZone.Act3'}, 118 | {'game': 'SonicTheHedgehog-Genesis', 'state': 'SpringYardZone.Act2'}, 119 | {'game': 'SonicTheHedgehog-Genesis', 'state': 'GreenHillZone.Act3'}, 120 | {'game': 'SonicTheHedgehog-Genesis', 'state': 'GreenHillZone.Act1'}, 121 | {'game': 'SonicTheHedgehog-Genesis', 'state': 'StarLightZone.Act2'}, 122 | {'game': 'SonicTheHedgehog-Genesis', 'state': 'StarLightZone.Act1'}, 123 | {'game': 'SonicTheHedgehog-Genesis', 'state': 'MarbleZone.Act2'}, 124 | {'game': 'SonicTheHedgehog-Genesis', 'state': 'MarbleZone.Act1'}, 125 | {'game': 'SonicTheHedgehog-Genesis', 'state': 'MarbleZone.Act3'}, 126 | {'game': 'SonicTheHedgehog-Genesis', 'state': 'ScrapBrainZone.Act2'}, 127 | {'game': 'SonicTheHedgehog-Genesis', 'state': 'LabyrinthZone.Act2'}, 128 | {'game': 'SonicTheHedgehog-Genesis', 'state': 'LabyrinthZone.Act1'}, 129 | {'game': 'SonicTheHedgehog-Genesis', 'state': 'LabyrinthZone.Act3'}] 130 | 131 | print(dictionary[environment_index]['game']) 132 | print(dictionary[environment_index]['state']) 133 | 134 | environment = make(game=dictionary[environment_index]['game'], 135 | state=dictionary[environment_index]['state'], 136 | bk2dir="./records") 137 | 138 | environment = wrap_environment(environment=environment, 139 | n_frames=n_frames) 140 | 141 | return environment 142 | 143 | 144 | def make_test_level_Green(): 145 | return make_test() 146 | 147 | 148 | def make_test(n_frames=4): 149 | """ 150 | Create an environment with some standard wrappers. 151 | """ 152 | 153 | environment = make_retro(game='SonicTheHedgehog-Genesis', 154 | state='GreenHillZone.Act2', 155 | record="./records") 156 | 157 | environment = wrap_environment(environment=environment, 158 | n_frames=n_frames) 159 | 160 | return environment 161 | 162 | -------------------------------------------------------------------------------- /chapter5/sonic_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun May 12 06:18:09 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | from algorithms.actor_critic_utilities import Model 10 | from chapter5.create_environment import create_new_environment 11 | 12 | class Worker(): 13 | def __init__(self,game,name,s_size,a_size,trainer,model_path,global_episodes): 14 | self.name = "worker_" + str(name) 15 | self.number = name 16 | self.model_path = model_path 17 | self.trainer = trainer 18 | self.global_episodes = global_episodes 19 | self.increment = self.global_episodes.assign_add(1) 20 | self.episode_rewards = [] 21 | self.episode_lengths = [] 22 | self.episode_mean_values = [] 23 | self.summary_writer = tf.summary.FileWriter("train_"+str(self.number)) 24 | 25 | #Create the local copy of the network and the tensorflow op to copy global paramters to local network 26 | self.local_AC = AC_Network(s_size,a_size,self.name,trainer) 27 | self.update_local_ops = update_target_graph('global',self.name) 28 | 29 | def train(self,rollout,sess,gamma,bootstrap_value): 30 | rollout = np.array(rollout) 31 | observations = rollout[:,0] 32 | actions = rollout[:,1] 33 | rewards = rollout[:,2] 34 | next_observations = rollout[:,3] 35 | values = rollout[:,5] 36 | 37 | # Here we take the rewards and values from the rollout, and use them to 38 | # generate the advantage and discounted returns. 
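        # Note: the discount() helper called below is not defined in this file;
        # a minimal sketch of the discounted cumulative sum it is assumed to
        # compute (the name and exact implementation are assumptions):
        #
        #     def discount(x, gamma):
        #         out, running = [], 0.0
        #         for value in reversed(list(x)):
        #             running = value + gamma * running
        #             out.append(running)
        #         return np.array(out[::-1])
        #
        #     e.g. discount([1.0, 1.0, 1.0], 0.99) -> [2.9701, 1.99, 1.0]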
39 | # The advantage function uses "Generalized Advantage Estimation" 40 | self.rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value]) 41 | discounted_rewards = discount(self.rewards_plus,gamma)[:-1] 42 | self.value_plus = np.asarray(values.tolist() + [bootstrap_value]) 43 | advantages = rewards + gamma * self.value_plus[1:] - self.value_plus[:-1] 44 | advantages = discount(advantages,gamma) 45 | 46 | # Update the global network using gradients from loss 47 | # Generate network statistics to periodically save 48 | feed_dict = {self.local_AC.target_v:discounted_rewards, 49 | self.local_AC.inputs:np.vstack(observations), 50 | self.local_AC.actions:actions, 51 | self.local_AC.advantages:advantages, 52 | self.local_AC.state_in[0]:self.batch_rnn_state[0], 53 | self.local_AC.state_in[1]:self.batch_rnn_state[1]} 54 | 55 | v_l,p_l,e_l,g_n,v_n, self.batch_rnn_state,_ = sess.run([self.local_AC.value_loss, 56 | self.local_AC.policy_loss, 57 | self.local_AC.entropy, 58 | self.local_AC.grad_norms, 59 | self.local_AC.var_norms, 60 | self.local_AC.state_out, 61 | self.local_AC.apply_grads], 62 | feed_dict=feed_dict) 63 | 64 | return v_l / len(rollout),p_l / len(rollout),e_l / len(rollout), g_n,v_n 65 | 66 | def work(self,max_episode_length,gamma,sess,coord,saver): 67 | episode_count = sess.run(self.global_episodes) 68 | total_steps = 0 69 | print ("Starting worker " + str(self.number)) 70 | with sess.as_default(), sess.graph.as_default(): 71 | while not coord.should_stop(): 72 | sess.run(self.update_local_ops) 73 | episode_buffer = [] 74 | episode_values = [] 75 | episode_frames = [] 76 | episode_reward = 0 77 | episode_step_count = 0 78 | d = False 79 | 80 | self.env.new_episode() 81 | prior_state = self.env.get_state().screen_buffer 82 | episode_frames.append(prior_state) 83 | prior_state = process_frame(prior_state) 84 | rnn_state = self.local_AC.state_init 85 | self.batch_rnn_state = rnn_state 86 | while self.env.is_episode_finished() == False: 87 | #Take an action using probabilities from policy network output. 88 | action_dist, value_function, rnn_state = sess.run([self.local_AC.policy, self.local_AC.value,self.local_AC.state_out], 89 | feed_dict={self.local_AC.inputs:[prior_state], 90 | self.local_AC.state_in[0]:rnn_state[0], 91 | self.local_AC.state_in[1]:rnn_state[1]}) 92 | 93 | action = np.random.choice(action_dist[0], p=action_dist[0]) 94 | action = np.argmax(action_dist == action) 95 | 96 | reward = self.env.make_action(self.actions[action]) / 100.0 97 | done = self.env.is_episode_finished() 98 | if done == False: 99 | current_state = self.env.get_state().screen_buffer 100 | episode_frames.append(current_state) 101 | prior_state = process_frame(current_state) 102 | else: 103 | current_state = prior_state 104 | 105 | episode_buffer.append([prior_state, action, reward, current_state, done, value[0,0]]) 106 | episode_values.append(value[0,0]) 107 | 108 | episode_reward += r 109 | s = s1 110 | total_steps += 1 111 | episode_step_count += 1 112 | 113 | # If the episode hasn't ended, but the experience buffer is full, then we 114 | # make an update step using that experience rollout. 115 | if len(episode_buffer) == 30 and d != True and episode_step_count != max_episode_length - 1: 116 | # Since we don't know what the true final return is, we "bootstrap" from our current 117 | # value estimation. 
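                        # Concretely, the n-step return for each entry in the buffer is
                        # approximated as
                        #     R_t = r_t + gamma * r_{t+1} + ... + gamma^{n-1} * r_{t+n-1}
                        #           + gamma^n * V(s_{t+n}),
                        # where V(s_{t+n}) is the critic's estimate v1 computed below, so the
                        # unfinished tail of the episode is replaced by the value prediction
                        # rather than being treated as zero.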
118 | v1 = sess.run(self.local_AC.value, 119 | feed_dict={self.local_AC.inputs:[s], 120 | self.local_AC.state_in[0]:rnn_state[0], 121 | self.local_AC.state_in[1]:rnn_state[1]})[0,0] 122 | v_l,p_l,e_l,g_n,v_n = self.train(episode_buffer,sess,gamma,v1) 123 | episode_buffer = [] 124 | sess.run(self.update_local_ops) 125 | if d == True: 126 | break 127 | 128 | self.episode_rewards.append(episode_reward) 129 | self.episode_lengths.append(episode_step_count) 130 | self.episode_mean_values.append(np.mean(episode_values)) 131 | 132 | # Update the network using the episode buffer at the end of the episode. 133 | if len(episode_buffer) != 0: 134 | v_l,p_l,e_l,g_n,v_n = self.train(episode_buffer,sess,gamma,0.0) 135 | 136 | 137 | # Periodically save gifs of episodes, model parameters, and summary statistics. 138 | if episode_count % 5 == 0 and episode_count != 0: 139 | if self.name == 'worker_0' and episode_count % 25 == 0: 140 | time_per_step = 0.05 141 | images = np.array(episode_frames) 142 | make_gif(images,'./frames/image'+str(episode_count)+'.gif', 143 | duration=len(images)*time_per_step,true_image=True,salience=False) 144 | if episode_count % 250 == 0 and self.name == 'worker_0': 145 | saver.save(sess,self.model_path+'/model-'+str(episode_count)+'.cptk') 146 | print ("Saved Model") 147 | 148 | mean_reward = np.mean(self.episode_rewards[-5:]) 149 | mean_length = np.mean(self.episode_lengths[-5:]) 150 | mean_value = np.mean(self.episode_mean_values[-5:]) 151 | summary = tf.Summary() 152 | summary.value.add(tag='Perf/Reward', simple_value=float(mean_reward)) 153 | summary.value.add(tag='Perf/Length', simple_value=float(mean_length)) 154 | summary.value.add(tag='Perf/Value', simple_value=float(mean_value)) 155 | summary.value.add(tag='Losses/Value Loss', simple_value=float(v_l)) 156 | summary.value.add(tag='Losses/Policy Loss', simple_value=float(p_l)) 157 | summary.value.add(tag='Losses/Entropy', simple_value=float(e_l)) 158 | summary.value.add(tag='Losses/Grad Norm', simple_value=float(g_n)) 159 | summary.value.add(tag='Losses/Var Norm', simple_value=float(v_n)) 160 | self.summary_writer.add_summary(summary, episode_count) 161 | 162 | self.summary_writer.flush() 163 | if self.name == 'worker_0': 164 | sess.run(self.increment) 165 | episode_count += 1 166 | 167 | def play_sonic(policy, environment_index): 168 | 169 | 170 | environment = create_new_environment(environment_index=environment_index) 171 | observation = environment.observation_space 172 | actions = environment.action_space 173 | 174 | 175 | model = Model(policy=policy, 176 | ob_space=observation, 177 | action_space=actions, 178 | n_environments=1, 179 | n_steps=1, 180 | entropy_coefficient=0, 181 | value_coefficient=0, 182 | max_grad_norm=0) 183 | 184 | observation = environment.reset() 185 | score = 0 186 | boom = 0 187 | done = False 188 | 189 | with tf.device("/cpu:0"): 190 | master_network = AC_Network(s_size,a_size,'global',None) # Generate global network 191 | num_workers = multiprocessing.cpu_count() # Set workers ot number of available CPU threads 192 | workers = [] 193 | # Create worker classes 194 | for i in range(num_workers): 195 | 196 | workers.append(Worker(environment=environment, 197 | name=i, 198 | s_size=s_size, 199 | a_sizse=a_size, 200 | trainer=trainer, 201 | saver=saver, 202 | model_path)) 203 | 204 | with tf.Session() as sess: 205 | 206 | coord = tf.train.Coordinator() 207 | if load_model == True: 208 | print 'Loading Model...' 
209 | ckpt = tf.train.get_checkpoint_state(model_path) 210 | saver.restore(sess,ckpt.model_checkpoint_path) 211 | else: 212 | sess.run(tf.global_variables_initializer()) 213 | 214 | # This is where the asynchronous magic happens. 215 | # Start the "work" process for each worker in a separate threat. 216 | worker_threads = [] 217 | for worker in workers: 218 | worker_work = lambda: worker.work(max_episode_length=max_episode_length, 219 | gamma=gamma, 220 | master_network=master_network, 221 | sess=sess, 222 | coord=coord) 223 | 224 | 225 | t = threading.Thread(target=(worker_work)) 226 | t.start() 227 | worker_threads.append(t) 228 | coord.join(worker_threads) 229 | 230 | while done == False: 231 | 232 | actions, values = model.step(observation) 233 | observation, rewards, done, _ = environment.step(actions) 234 | score += rewards 235 | environment.render() 236 | boom +=1 237 | 238 | 239 | print("Score ", score) 240 | environment.close() 241 | -------------------------------------------------------------------------------- /errata.md: -------------------------------------------------------------------------------- 1 | # Errata for *Book Title* 2 | 3 | On **page xx** [Summary of error]: 4 | 5 | Details of error here. Highlight key pieces in **bold**. 6 | 7 | *** 8 | 9 | On **page xx** [Summary of error]: 10 | 11 | Details of error here. Highlight key pieces in **bold**. 12 | 13 | *** -------------------------------------------------------------------------------- /neural_networks/Figure_1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/neural_networks/Figure_1-1.png -------------------------------------------------------------------------------- /neural_networks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/neural_networks/__init__.py -------------------------------------------------------------------------------- /neural_networks/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/neural_networks/__init__.pyc -------------------------------------------------------------------------------- /neural_networks/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/neural_networks/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /neural_networks/__pycache__/models.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/neural_networks/__pycache__/models.cpython-36.pyc -------------------------------------------------------------------------------- /neural_networks/gym_utilities.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Apr 1 00:30:39 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 
| 9 | import tensorflow as tf 10 | import numpy as np 11 | import baselines.common.tf_util as U 12 | from baselines.a2c.utils import fc 13 | from tensorflow.python.ops import math_ops 14 | 15 | class Pd(object): 16 | """ 17 | A particular probability distribution 18 | """ 19 | def flatparam(self): 20 | raise NotImplementedError 21 | def mode(self): 22 | raise NotImplementedError 23 | def neglogp(self, x): 24 | # Usually it's easier to define the negative logprob 25 | raise NotImplementedError 26 | def kl(self, other): 27 | raise NotImplementedError 28 | def entropy(self): 29 | raise NotImplementedError 30 | def sample(self): 31 | raise NotImplementedError 32 | def logp(self, x): 33 | return - self.neglogp(x) 34 | def get_shape(self): 35 | return self.flatparam().shape 36 | @property 37 | def shape(self): 38 | return self.get_shape() 39 | def __getitem__(self, idx): 40 | return self.__class__(self.flatparam()[idx]) 41 | 42 | class PdType(object): 43 | """ 44 | Parametrized family of probability distributions 45 | """ 46 | def pdclass(self): 47 | raise NotImplementedError 48 | def pdfromflat(self, flat): 49 | return self.pdclass()(flat) 50 | def pdfromlatent(self, latent_vector, init_scale, init_bias): 51 | raise NotImplementedError 52 | def param_shape(self): 53 | raise NotImplementedError 54 | def sample_shape(self): 55 | raise NotImplementedError 56 | def sample_dtype(self): 57 | raise NotImplementedError 58 | 59 | def param_placeholder(self, prepend_shape, name=None): 60 | return tf.placeholder(dtype=tf.float32, shape=prepend_shape+self.param_shape(), name=name) 61 | def sample_placeholder(self, prepend_shape, name=None): 62 | return tf.placeholder(dtype=self.sample_dtype(), shape=prepend_shape+self.sample_shape(), name=name) 63 | 64 | def __eq__(self, other): 65 | return (type(self) == type(other)) and (self.__dict__ == other.__dict__) 66 | 67 | class CategoricalPdType(PdType): 68 | def __init__(self, ncat): 69 | self.ncat = ncat 70 | def pdclass(self): 71 | return CategoricalPd 72 | def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0): 73 | pdparam = _matching_fc(latent_vector, 'pi', self.ncat, init_scale=init_scale, init_bias=init_bias) 74 | return self.pdfromflat(pdparam), pdparam 75 | 76 | def param_shape(self): 77 | return [self.ncat] 78 | def sample_shape(self): 79 | return [] 80 | def sample_dtype(self): 81 | return tf.int32 82 | 83 | 84 | class MultiCategoricalPdType(PdType): 85 | def __init__(self, nvec): 86 | self.ncats = nvec.astype('int32') 87 | assert (self.ncats > 0).all() 88 | def pdclass(self): 89 | return MultiCategoricalPd 90 | def pdfromflat(self, flat): 91 | return MultiCategoricalPd(self.ncats, flat) 92 | 93 | def pdfromlatent(self, latent, init_scale=1.0, init_bias=0.0): 94 | pdparam = _matching_fc(latent, 'pi', self.ncats.sum(), init_scale=init_scale, init_bias=init_bias) 95 | return self.pdfromflat(pdparam), pdparam 96 | 97 | def param_shape(self): 98 | return [sum(self.ncats)] 99 | def sample_shape(self): 100 | return [len(self.ncats)] 101 | def sample_dtype(self): 102 | return tf.int32 103 | 104 | class DiagGaussianPdType(PdType): 105 | def __init__(self, size): 106 | self.size = size 107 | def pdclass(self): 108 | return DiagGaussianPd 109 | 110 | def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0): 111 | mean = _matching_fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias) 112 | logstd = tf.get_variable(name='pi/logstd', shape=[1, self.size], initializer=tf.zeros_initializer()) 113 | pdparam = 
tf.concat([mean, mean * 0.0 + logstd], axis=1) 114 | return self.pdfromflat(pdparam), mean 115 | 116 | def param_shape(self): 117 | return [2*self.size] 118 | def sample_shape(self): 119 | return [self.size] 120 | def sample_dtype(self): 121 | return tf.float32 122 | 123 | class BernoulliPdType(PdType): 124 | def __init__(self, size): 125 | self.size = size 126 | def pdclass(self): 127 | return BernoulliPd 128 | def param_shape(self): 129 | return [self.size] 130 | def sample_shape(self): 131 | return [self.size] 132 | def sample_dtype(self): 133 | return tf.int32 134 | def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0): 135 | pdparam = _matching_fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias) 136 | return self.pdfromflat(pdparam), pdparam 137 | 138 | # WRONG SECOND DERIVATIVES 139 | # class CategoricalPd(Pd): 140 | # def __init__(self, logits): 141 | # self.logits = logits 142 | # self.ps = tf.nn.softmax(logits) 143 | # @classmethod 144 | # def fromflat(cls, flat): 145 | # return cls(flat) 146 | # def flatparam(self): 147 | # return self.logits 148 | # def mode(self): 149 | # return U.argmax(self.logits, axis=-1) 150 | # def logp(self, x): 151 | # return -tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, x) 152 | # def kl(self, other): 153 | # return tf.nn.softmax_cross_entropy_with_logits(other.logits, self.ps) \ 154 | # - tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 155 | # def entropy(self): 156 | # return tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 157 | # def sample(self): 158 | # u = tf.random_uniform(tf.shape(self.logits)) 159 | # return U.argmax(self.logits - tf.log(-tf.log(u)), axis=-1) 160 | 161 | class CategoricalPd(Pd): 162 | def __init__(self, logits): 163 | self.logits = logits 164 | def flatparam(self): 165 | return self.logits 166 | def mode(self): 167 | return tf.argmax(self.logits, axis=-1) 168 | 169 | @property 170 | def mean(self): 171 | return tf.nn.softmax(self.logits) 172 | def neglogp(self, x): 173 | # return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x) 174 | # Note: we can't use sparse_softmax_cross_entropy_with_logits because 175 | # the implementation does not allow second-order derivatives... 
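        # For a single integer label a this reduces to the usual expression
        #     neglogp(a) = -log(softmax(logits)[a]) = logsumexp(logits) - logits[a];
        # routing it through a one-hot encoding and softmax_cross_entropy_with_logits_v2
        # keeps the graph twice-differentiable, which the sparse variant does not allow.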
176 | if x.dtype in {tf.uint8, tf.int32, tf.int64}: 177 | # one-hot encoding 178 | x_shape_list = x.shape.as_list() 179 | logits_shape_list = self.logits.get_shape().as_list()[:-1] 180 | for xs, ls in zip(x_shape_list, logits_shape_list): 181 | if xs is not None and ls is not None: 182 | assert xs == ls, 'shape mismatch: {} in x vs {} in logits'.format(xs, ls) 183 | 184 | x = tf.one_hot(x, self.logits.get_shape().as_list()[-1]) 185 | else: 186 | # already encoded 187 | assert x.shape.as_list() == self.logits.shape.as_list() 188 | 189 | return tf.nn.softmax_cross_entropy_with_logits_v2( 190 | logits=self.logits, 191 | labels=x) 192 | def kl(self, other): 193 | a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keepdims=True) 194 | a1 = other.logits - tf.reduce_max(other.logits, axis=-1, keepdims=True) 195 | ea0 = tf.exp(a0) 196 | ea1 = tf.exp(a1) 197 | z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True) 198 | z1 = tf.reduce_sum(ea1, axis=-1, keepdims=True) 199 | p0 = ea0 / z0 200 | return tf.reduce_sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=-1) 201 | def entropy(self): 202 | a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keepdims=True) 203 | ea0 = tf.exp(a0) 204 | z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True) 205 | p0 = ea0 / z0 206 | return tf.reduce_sum(p0 * (tf.log(z0) - a0), axis=-1) 207 | def sample(self): 208 | u = tf.random_uniform(tf.shape(self.logits), dtype=self.logits.dtype) 209 | return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=-1) 210 | @classmethod 211 | def fromflat(cls, flat): 212 | return cls(flat) 213 | 214 | class MultiCategoricalPd(Pd): 215 | def __init__(self, nvec, flat): 216 | self.flat = flat 217 | self.categoricals = list(map(CategoricalPd, 218 | tf.split(flat, np.array(nvec, dtype=np.int32), axis=-1))) 219 | def flatparam(self): 220 | return self.flat 221 | def mode(self): 222 | return tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1), tf.int32) 223 | def neglogp(self, x): 224 | return tf.add_n([p.neglogp(px) for p, px in zip(self.categoricals, tf.unstack(x, axis=-1))]) 225 | def kl(self, other): 226 | return tf.add_n([p.kl(q) for p, q in zip(self.categoricals, other.categoricals)]) 227 | def entropy(self): 228 | return tf.add_n([p.entropy() for p in self.categoricals]) 229 | def sample(self): 230 | return tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1), tf.int32) 231 | @classmethod 232 | def fromflat(cls, flat): 233 | raise NotImplementedError 234 | 235 | class DiagGaussianPd(Pd): 236 | def __init__(self, flat): 237 | self.flat = flat 238 | mean, logstd = tf.split(axis=len(flat.shape)-1, num_or_size_splits=2, value=flat) 239 | self.mean = mean 240 | self.logstd = logstd 241 | self.std = tf.exp(logstd) 242 | def flatparam(self): 243 | return self.flat 244 | def mode(self): 245 | return self.mean 246 | def neglogp(self, x): 247 | return 0.5 * tf.reduce_sum(tf.square((x - self.mean) / self.std), axis=-1) \ 248 | + 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[-1]) \ 249 | + tf.reduce_sum(self.logstd, axis=-1) 250 | def kl(self, other): 251 | assert isinstance(other, DiagGaussianPd) 252 | return tf.reduce_sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=-1) 253 | def entropy(self): 254 | return tf.reduce_sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), axis=-1) 255 | def sample(self): 256 | return self.mean + self.std * tf.random_normal(tf.shape(self.mean)) 257 | @classmethod 258 | def fromflat(cls, flat): 259 | return 
cls(flat) 260 | 261 | 262 | class BernoulliPd(Pd): 263 | def __init__(self, logits): 264 | self.logits = logits 265 | self.ps = tf.sigmoid(logits) 266 | def flatparam(self): 267 | return self.logits 268 | @property 269 | def mean(self): 270 | return self.ps 271 | def mode(self): 272 | return tf.round(self.ps) 273 | def neglogp(self, x): 274 | return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=-1) 275 | def kl(self, other): 276 | return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=-1) - tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1) 277 | def entropy(self): 278 | return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1) 279 | def sample(self): 280 | u = tf.random_uniform(tf.shape(self.ps)) 281 | return tf.to_float(math_ops.less(u, self.ps)) 282 | @classmethod 283 | def fromflat(cls, flat): 284 | return cls(flat) 285 | 286 | def make_pdtype(ac_space): 287 | from gym import spaces 288 | if isinstance(ac_space, spaces.Box): 289 | assert len(ac_space.shape) == 1 290 | return DiagGaussianPdType(ac_space.shape[0]) 291 | elif isinstance(ac_space, spaces.Discrete): 292 | return CategoricalPdType(ac_space.n) 293 | elif isinstance(ac_space, spaces.MultiDiscrete): 294 | return MultiCategoricalPdType(ac_space.nvec) 295 | elif isinstance(ac_space, spaces.MultiBinary): 296 | return BernoulliPdType(ac_space.n) 297 | else: 298 | raise NotImplementedError 299 | 300 | def shape_el(v, i): 301 | maybe = v.get_shape()[i] 302 | if maybe is not None: 303 | return maybe 304 | else: 305 | return tf.shape(v)[i] 306 | 307 | @U.in_session 308 | def test_probtypes(): 309 | np.random.seed(0) 310 | 311 | pdparam_diag_gauss = np.array([-.2, .3, .4, -.5, .1, -.5, .1, 0.8]) 312 | diag_gauss = DiagGaussianPdType(pdparam_diag_gauss.size // 2) #pylint: disable=E1101 313 | validate_probtype(diag_gauss, pdparam_diag_gauss) 314 | 315 | pdparam_categorical = np.array([-.2, .3, .5]) 316 | categorical = CategoricalPdType(pdparam_categorical.size) #pylint: disable=E1101 317 | validate_probtype(categorical, pdparam_categorical) 318 | 319 | nvec = [1,2,3] 320 | pdparam_multicategorical = np.array([-.2, .3, .5, .1, 1, -.1]) 321 | multicategorical = MultiCategoricalPdType(nvec) #pylint: disable=E1101 322 | validate_probtype(multicategorical, pdparam_multicategorical) 323 | 324 | pdparam_bernoulli = np.array([-.2, .3, .5]) 325 | bernoulli = BernoulliPdType(pdparam_bernoulli.size) #pylint: disable=E1101 326 | validate_probtype(bernoulli, pdparam_bernoulli) 327 | 328 | 329 | def validate_probtype(probtype, pdparam): 330 | N = 100000 331 | # Check to see if mean negative log likelihood == differential entropy 332 | Mval = np.repeat(pdparam[None, :], N, axis=0) 333 | M = probtype.param_placeholder([N]) 334 | X = probtype.sample_placeholder([N]) 335 | pd = probtype.pdfromflat(M) 336 | calcloglik = U.function([X, M], pd.logp(X)) 337 | calcent = U.function([M], pd.entropy()) 338 | Xval = tf.get_default_session().run(pd.sample(), feed_dict={M:Mval}) 339 | logliks = calcloglik(Xval, Mval) 340 | entval_ll = - logliks.mean() #pylint: disable=E1101 341 | entval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101 342 | entval = calcent(Mval).mean() #pylint: disable=E1101 343 | assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr # within 3 sigmas 344 | 345 | # Check to see if kldiv[p,q] = - ent[p] - E_p[log q] 346 | M2 = 
probtype.param_placeholder([N]) 347 | pd2 = probtype.pdfromflat(M2) 348 | q = pdparam + np.random.randn(pdparam.size) * 0.1 349 | Mval2 = np.repeat(q[None, :], N, axis=0) 350 | calckl = U.function([M, M2], pd.kl(pd2)) 351 | klval = calckl(Mval, Mval2).mean() #pylint: disable=E1101 352 | logliks = calcloglik(Xval, Mval2) 353 | klval_ll = - entval - logliks.mean() #pylint: disable=E1101 354 | klval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101 355 | assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr # within 3 sigmas 356 | print('ok on', probtype, pdparam) 357 | 358 | 359 | def _matching_fc(tensor, name, size, init_scale, init_bias): 360 | if tensor.shape[-1] == size: 361 | return tensor 362 | else: 363 | return fc(tensor, name, size, init_scale=init_scale, init_bias=init_bias) 364 | -------------------------------------------------------------------------------- /neural_networks/gym_utilities.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/neural_networks/gym_utilities.pyc -------------------------------------------------------------------------------- /neural_networks/market_making_models.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Mar 25 21:42:23 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | import tensorflow as tf, numpy as np 10 | from collections import deque 11 | 12 | 13 | activation_dictionary = {'elu': tf.nn.elu, 14 | 'relu': tf.nn.relu, 15 | 'selu': tf.nn.selu, 16 | 'sigmoid': tf.nn.sigmoid, 17 | 'softmax': tf.nn.softmax, 18 | None: None} 19 | 20 | def fully_connected_layer(inputs, units, activation, gain=np.sqrt(2)): 21 | 22 | return tf.layers.dense(inputs=inputs, 23 | units=units, 24 | activation=activation_dictionary[activation], 25 | kernel_initializer=tf.orthogonal_initializer(gain)) 26 | 27 | class Memory(): 28 | 29 | def __init__(self, max_size): 30 | self.buffer = deque(maxlen = max_size) 31 | 32 | def add(self, experience): 33 | self.buffer.append(experience) 34 | 35 | def sample(self, batch_size): 36 | buffer_size = len(self.buffer) 37 | index = np.random.choice(np.arange(buffer_size), 38 | size=batch_size, 39 | replace=True) 40 | 41 | return [self.buffer[i] for i in index] 42 | 43 | 44 | class DeepQNetworkMM(): 45 | 46 | def __init__(self, n_units, n_classes, state_size, action_size, learning_rate): 47 | self.state_size = state_size 48 | self.action_size = action_size 49 | self.learning_rate = learning_rate 50 | self.n_units = n_units 51 | self.n_classes = n_classes 52 | 53 | self.input_matrix = tf.placeholder(tf.float32, [None, state_size]) 54 | self.actions = tf.placeholder(tf.float32, [None, n_classes]) 55 | self.target_Q = tf.placeholder(tf.float32, [None]) 56 | 57 | 58 | self.layer1 = fully_connected_layer(inputs=self.input_matrix, 59 | units=self.n_units, 60 | activation='selu') 61 | 62 | self.hidden_layer = fully_connected_layer(inputs=self.layer1, 63 | units=self.n_units, 64 | activation='selu') 65 | 66 | self.output_layer = fully_connected_layer(inputs=self.hidden_layer, 67 | units=n_classes, 68 | activation=None) 69 | 70 | self.predicted_Q = tf.reduce_sum(tf.multiply(self.output_layer, self.actions), axis=1) 71 | 72 | self.error_rate = tf.reduce_mean(tf.square(self.target_Q - self.predicted_Q)) 73 | 74 | self.optimizer = 
tf.train.RMSPropOptimizer(self.learning_rate).minimize(self.error_rate) 75 | 76 | -------------------------------------------------------------------------------- /neural_networks/market_making_models.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/neural_networks/market_making_models.pyc -------------------------------------------------------------------------------- /neural_networks/models.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Feb 20 21:49:13 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | import tensorflow as tf, numpy as np 10 | from baselines.common.distributions import make_pdtype 11 | 12 | activation_dictionary = {'elu': tf.nn.elu, 13 | 'relu': tf.nn.relu, 14 | 'selu': tf.nn.selu, 15 | 'sigmoid': tf.nn.sigmoid, 16 | 'softmax': tf.nn.softmax, 17 | None: None} 18 | 19 | def normalized_columns_initializer(standard_deviation=1.0): 20 | def initializer(shape, dtype=None, partition_info=None): 21 | output = np.random.randn(*shape).astype(np.float32) 22 | output *= standard_deviation/float(np.sqrt(np.square(output).sum(axis=0, keepdims=True))) 23 | return tf.constant(output) 24 | return initializer 25 | 26 | def linear_operation(x, size, name, initializer=None, bias_init=0): 27 | with tf.variable_scope(name): 28 | weights = tf.get_variable("w", [x.get_shape()[1], size], initializer=initializer) 29 | biases = tf.get_variable("b", [size], initializer=tf.constant_initializer(bias_init)) 30 | return tf.matmul(x, weights) + biases 31 | 32 | def convolution_layer(inputs, dimensions, filters, kernel_size, strides, gain=np.sqrt(2), activation='relu'): 33 | 34 | if dimensions == 3: 35 | 36 | return tf.layers.conv1d(inputs=inputs, 37 | filters=filters, 38 | kernel_size=kernel_size, 39 | kernel_initializer=tf.orthogonal_initializer(gain), 40 | strides=(strides), 41 | activation=activation_dictionary[activation]) 42 | elif dimensions == 4: 43 | 44 | return tf.layers.conv2d(inputs=inputs, 45 | filters=filters, 46 | kernel_size=kernel_size, 47 | kernel_initializer=tf.orthogonal_initializer(gain), 48 | strides=(strides), 49 | activation=activation_dictionary[activation]) 50 | 51 | 52 | def fully_connected_layer(inputs, units, activation, gain=np.sqrt(2)): 53 | return tf.layers.dense(inputs=inputs, 54 | units=units, 55 | activation=activation_dictionary[activation], 56 | kernel_initializer=tf.orthogonal_initializer(gain)) 57 | 58 | def lstm_layer(input, size, actions, apply_softmax=False): 59 | input = tf.expand_dims(input, [0]) 60 | lstm = tf.contrib.rnn.BasicLSTMCell(size, state_is_tuple=True) 61 | state_size = lstm.state_size 62 | step_size = tf.shape(input)[:1] 63 | cell_init = np.zeros((1, state_size.c), np.float32) 64 | hidden_init = np.zeros((1, state_size.h), np.float32) 65 | initial_state = [cell_init, hidden_init] 66 | cell_state = tf.placeholder(tf.float32, [1, state_size.c]) 67 | hidden_state = tf.placeholder(tf.float32, [1, state_size.h]) 68 | input_state = tf.contrib.rnn.LSTMStateTuple(cell_state, hidden_state) 69 | 70 | _outputs, states = tf.nn.dynamic_rnn(cell=lstm, 71 | inupts=input, 72 | initial_state=input_state, 73 | sequence_length=step_size, 74 | time_major=False) 75 | _cell_state, _hidden_state = states 76 | output = tf.reshape(_outputs, [-1, size]) 77 | output_state = [_cell_state[:1, :], 
_hidden_state[:1, :]] 78 | output = linear_operation(output, actions, "logits", normalized_columns_initializer(0.01)) 79 | output = tf.nn.softmax(output, dim=-1) 80 | return output, _cell_state, _hidden_state 81 | 82 | def create_weights_biases(n_layers, n_units, n_columns, n_outputs): 83 | ''' 84 | Creates dictionaries of variable length for differing neural network models 85 | 86 | Arguments 87 | 88 | n_layers - int - number of layers 89 | n_units - int - number of neurons within each individual layer 90 | n_columns - int - number of columns within dataset 91 | 92 | :return: dict (int), dict (int) 93 | ''' 94 | weights, biases = {}, {} 95 | for i in range(n_layers): 96 | if i == 0: 97 | weights['layer'+str(i)] = tf.Variable(tf.random_normal([n_columns, n_units])) 98 | biases['layer'+str(i)] = tf.Variable(tf.random_normal([n_columns])) 99 | elif i != 0 and i != n_layers-1: 100 | weights['layer'+str(i)] = tf.Variable(tf.random_normal([n_units, n_units])) 101 | biases['layer'+str(i)] = tf.Variable(tf.random_normal([n_units])) 102 | elif i != 0 and i == n_layers-1: 103 | weights['output_layer'] = tf.Variable(tf.random_normal([n_units, n_outputs])) 104 | biases['output_layer'] = tf.Variable(tf.random_normal([n_outputs])) 105 | 106 | return weights, biases 107 | 108 | def create_input_output(input_dtype, output_dtype, n_columns, n_outputs): 109 | ''' 110 | Create placeholder variables for tensorflow graph 111 | 112 | ''' 113 | X = tf.placeholder(shape=(None, n_columns), dtype=input_dtype) 114 | Y = tf.placeholder(shape=(None, n_outputs), dtype=output_dtype) 115 | return X, Y 116 | 117 | 118 | class DeepQNetwork(): 119 | 120 | def __init__(self, n_units, n_classes, n_filters, stride, kernel, state_size, action_size, learning_rate): 121 | self.state_size = state_size 122 | self.action_size = action_size 123 | self.learning_rate = learning_rate 124 | self.n_units = n_units 125 | self.n_classes = n_classes 126 | self.n_filters = n_filters 127 | self.stride = stride 128 | self.kernel = kernel 129 | 130 | self.input_matrix = tf.placeholder(tf.float32, [None, state_size]) 131 | self.actions = tf.placeholder(tf.float32, [None, n_classes]) 132 | self.target_Q = tf.placeholder(tf.float32, [None]) 133 | 134 | 135 | self.network1 = convolution_layer(inputs=self.input_matrix, 136 | filters=self.n_filters, 137 | kernel_size=self.kernel, 138 | strides=self.stride, 139 | dimensions=4, 140 | activation='elu') 141 | 142 | self.network1 = tf.layers.batch_normalization(self.network1, 143 | training=True, 144 | epsilon=1e-5) 145 | 146 | self.network2 = convolution_layer(inputs=self.network1, 147 | filters=self.n_filters*2, 148 | kernel_size=int(self.kernel/2), 149 | strides=int(self.stride/2), 150 | dimensions=4, 151 | activation='elu') 152 | 153 | self.network2 = tf.layers.batch_normalization(inputs=self.network2, 154 | training=True, 155 | epsilon=1e-5) 156 | 157 | self.network3 = convolution_layer(inputs=self.network2, 158 | filters=self.n_filters*4, 159 | kernel_size=int(self.kernel/2), 160 | strides=int(self.stride/2), 161 | dimensions=4, 162 | activation='elu') 163 | 164 | self.network3 = tf.layers.batch_normalization(inputs=self.network3, 165 | training=True, 166 | epsilon=1e-5) 167 | 168 | self.network3 = tf.layers.flatten(inputs=self.network3) 169 | 170 | self.output = fully_connected_layer(inputs=self.network3, 171 | units=self.n_units, 172 | activation='elu') 173 | 174 | self.output = fully_connected_layer(inputs=self.output, 175 | units=n_classes, 176 | activation=None) 177 | 178 | self.predicted_Q = 
tf.reduce_sum(tf.multiply(self.output, self.actions), axis=1) 179 | 180 | self.error_rate = tf.reduce_mean(tf.square(self.target_Q - self.predicted_Q)) 181 | 182 | self.optimizer = tf.train.RMSPropOptimizer(self.learning_rate).minimize(self.error_rate) 183 | 184 | 185 | class ActorCriticModel(): 186 | 187 | def __init__(self, session, environment, action_space, n_batches, n_steps, reuse=False): 188 | 189 | session.run(tf.global_variables_initializer()) 190 | self.distribution_type = make_pdtype(action_space) 191 | height, weight, channel = environment.shape 192 | inputs_ = tf.placeholder(tf.float32, [height, weight, channel], name='inputs') 193 | scaled_images = tf.cast(inputs_, tf.float32)/float(255) 194 | 195 | with tf.variable_scope('model', reuse=reuse): 196 | 197 | layer1 = tf.layers.batch_normalization(convolution_layer(inputs=scaled_images, 198 | filters=32, 199 | kernel_size=8, 200 | strides=4, 201 | dimensions=3)) 202 | 203 | layer2 = tf.layers.batch_normalization(convolution_layer(inputs=tf.nn.relu(layer1), 204 | filters=64, 205 | kernel_size=4, 206 | strides=2, 207 | dimensions=3)) 208 | 209 | layer3 = tf.layers.batch_normalization(convolution_layer(inputs=tf.nn.relu(layer2), 210 | filters=64, 211 | kernel_size=3, 212 | strides=1, 213 | dimensions=3)) 214 | 215 | layer3 = tf.layers.flatten(inputs=layer3) 216 | output_layer = fully_connected_layer(inputs=layer3, units=512, activation='softmax') 217 | self.distribution, self.logits = self.distribution_type.pdfromlatent(output_layer, init_scale=0.01) 218 | value_function = fully_connected_layer(output_layer, units=1, activation=None)[:, 0] 219 | 220 | self.initial_state = None 221 | sampled_action = self.distribution.sample() 222 | 223 | def step(current_state, *_args, **_kwargs): 224 | action, value = session.run([sampled_action, value_function], {inputs_: current_state}) 225 | return action, value 226 | 227 | def value(current_state, *_args, **_kwargs): 228 | return session.run(value_function, {inputs_: current_state}) 229 | 230 | def select_action(current_state, *_args, **_kwargs): 231 | return session.run(sampled_action, {inputs_: current_state}) 232 | 233 | self.inputs_ = inputs_ 234 | self.value_function = value_function 235 | self.step = step 236 | self.value = value 237 | self.select_action = select_action 238 | 239 | 240 | class A3CModel(): 241 | 242 | def __init__(self, s_size, a_size, scope, trainer): 243 | 244 | with tf.variable_scope(scope): 245 | 246 | self.input_layer = tf.placeholder(shape=[None, s_size], 247 | dtype=tf.float32) 248 | 249 | self.input_layer = tf.reshape(self.input_layer, 250 | shape=[-1,84,84,1]) 251 | 252 | self.layer1 = tf.layers.batch_normalization(convolution_layer(inputs=input_layer, 253 | filters=32, 254 | kernel_size=8, 255 | strides=4, 256 | dimensions=3)) 257 | 258 | self.layer2 = tf.layers.batch_normalization(convolution_layer(inputs=tf.nn.relu(layer1), 259 | filters=64, 260 | kernel_size=4, 261 | strides=2, 262 | dimensions=3)) 263 | 264 | layer3 = tf.layers.flatten(inputs=layer3) 265 | 266 | output_layer = fully_connected_layer(inputs=layer3, 267 | units=512, 268 | activation='softmax') 269 | 270 | outputs, cell_state, hidden_state = lstm_layer(input=hidden, 271 | size=s_size, 272 | actions=a_size, 273 | apply_softmax=False) 274 | 275 | self.state_out = (cell_state[:1, :], hidden_state[:1, :]) 276 | ouptut_layer = tf.reshape(outputs, [-1, 256]) 277 | 278 | self.policy = slim.fully_connected(input=output_layer, 279 | n_units=a_size, 280 | activation_fn=tf.nn.softmax, 281 | 
weights_initializer=normalized_columns_initializer(0.01), 282 | biases_initializer=None) 283 | 284 | self.value = slim.fully_connected(input=rnn_out, 285 | n_units=1, 286 | activation_fn=None, 287 | weights_initializer=normalized_columns_initializer(1.0), 288 | biases_initializer=None) 289 | 290 | if scope != 'global': 291 | self.actions = tf.placeholder(shape=[None],dtype=tf.int32) 292 | self.actions_onehot = tf.one_hot(self.actions,a_size,dtype=tf.float32) 293 | self.target_v = tf.placeholder(shape=[None],dtype=tf.float32) 294 | self.advantages = tf.placeholder(shape=[None],dtype=tf.float32) 295 | 296 | self.responsible_outputs = tf.reduce_sum(self.policy * self.actions_onehot, [1]) 297 | 298 | self.value_loss = 0.5 * tf.reduce_sum(tf.square(self.target_v - tf.reshape(self.value,[-1]))) 299 | self.entropy = - tf.reduce_sum(self.policy * tf.log(self.policy)) 300 | self.policy_loss = -tf.reduce_sum(tf.log(self.responsible_outputs)*self.advantages) 301 | self.loss = 0.5 * self.value_loss + self.policy_loss - self.entropy * 0.01 302 | 303 | local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope) 304 | self.gradients = tf.gradients(self.loss, local_vars) 305 | self.var_norms = tf.global_norm(local_vars) 306 | grads,self.grad_norms = tf.clip_by_global_norm(self.gradients,40.0) 307 | 308 | global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global') 309 | self.apply_grads = trainer.apply_gradients(zip(grads,global_vars)) 310 | 311 | 312 | 313 | -------------------------------------------------------------------------------- /neural_networks/models.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/neural_networks/models.pyc -------------------------------------------------------------------------------- /neural_networks/policy_gradient_utilities.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Mar 25 15:22:27 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | import keras.layers as layers 10 | from keras import backend 11 | from keras.models import Model 12 | from keras.optimizers import Adam 13 | from keras.initializers import glorot_uniform 14 | 15 | class PolicyGradient(): 16 | 17 | def __init__(self, n_units, n_layers, n_columns, n_outputs, learning_rate, hidden_activation, output_activation, loss_function): 18 | self.n_units = n_units 19 | self.n_layers = n_layers 20 | self.n_columns = n_columns 21 | self.n_outputs = n_outputs 22 | self.hidden_activation = hidden_activation 23 | self.output_activation = output_activation 24 | self.learning_rate = learning_rate 25 | self.loss_function = loss_function 26 | 27 | def create_policy_model(self, input_shape): 28 | input_layer = layers.Input(shape=input_shape) 29 | advantages = layers.Input(shape=[1]) 30 | 31 | hidden_layer = layers.Dense(units=self.n_units, 32 | activation=self.hidden_activation, 33 | use_bias=False, 34 | kernel_initializer=glorot_uniform(seed=42))(input_layer) 35 | 36 | output_layer = layers.Dense(units=self.n_outputs, 37 | activation=self.output_activation, 38 | use_bias=False, 39 | kernel_initializer=glorot_uniform(seed=42))(hidden_layer) 40 | 41 | def log_likelihood_loss(actual_labels, predicted_labels): 42 | log_likelihood = backend.log(actual_labels * (actual_labels - predicted_labels) + 43 | (1 - actual_labels) * 
(actual_labels + predicted_labels)) 44 | return backend.mean(log_likelihood * advantages, keepdims=True) 45 | 46 | if self.loss_function == 'log_likelihood': 47 | self.loss_function = log_likelihood_loss 48 | else: 49 | self.loss_function = 'categorical_crossentropy' 50 | 51 | policy_model = Model(inputs=[input_layer, advantages], outputs=output_layer) 52 | policy_model.compile(loss=self.loss_function, optimizer=Adam(self.learning_rate)) 53 | model_prediction = Model(input=[input_layer], outputs=output_layer) 54 | return policy_model, model_prediction 55 | -------------------------------------------------------------------------------- /neural_networks/policy_gradient_utilities.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/neural_networks/policy_gradient_utilities.pyc -------------------------------------------------------------------------------- /neural_networks/untitled4.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Feb 25 12:16:24 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | 12 | import gym 13 | env = gym.make("CartPole-v0") 14 | 15 | # Constants defining our neural network 16 | hidden_layer_neurons = 8 17 | gamma = .99 18 | dimen = len(env.reset()) 19 | print_every = 100 20 | batch_size = 50 21 | num_episodes = 10000 22 | render = False 23 | lr = 1e-2 24 | goal = 190 25 | 26 | import keras.layers as layers 27 | from keras.models import Model 28 | from keras.optimizers import Adam 29 | import keras.backend as K 30 | from keras.initializers import glorot_uniform 31 | 32 | def get_policy_model(env, hidden_layer_neurons, lr): 33 | dimen = env.reset().shape 34 | num_actions = env.action_space.n 35 | inp = layers.Input(shape=dimen,name="input_x") 36 | adv = layers.Input(shape=[1], name="advantages") 37 | x = layers.Dense(hidden_layer_neurons, 38 | activation="relu", 39 | use_bias=False, 40 | kernel_initializer=glorot_uniform(seed=42), 41 | name="dense_1")(inp) 42 | out = layers.Dense(num_actions, 43 | activation="softmax", 44 | kernel_initializer=glorot_uniform(seed=42), 45 | use_bias=False, 46 | name="out")(x) 47 | 48 | def custom_loss(y_true, y_pred): 49 | # actual: 0 predict: 0 -> log(0 * (0 - 0) + (1 - 0) * (0 + 0)) = -inf 50 | # actual: 1 predict: 1 -> log(1 * (1 - 1) + (1 - 1) * (1 + 1)) = -inf 51 | # actual: 1 predict: 0 -> log(1 * (1 - 0) + (1 - 1) * (1 + 0)) = 0 52 | # actual: 0 predict: 1 -> log(0 * (0 - 1) + (1 - 0) * (0 + 1)) = 0 53 | log_lik = K.log(y_true * (y_true - y_pred) + (1 - y_true) * (y_true + y_pred)) 54 | return K.mean(log_lik * adv, keepdims=True) 55 | 56 | model_train = Model(inputs=[inp, adv], outputs=out) 57 | model_train.compile(loss=custom_loss, optimizer=Adam(lr)) 58 | model_predict = Model(inputs=[inp], outputs=out) 59 | return model_train, model_predict 60 | 61 | def discount_rewards(r, gamma=0.99): 62 | """Takes 1d float array of rewards and computes discounted reward 63 | e.g. 
f([1, 1, 1], 0.99) -> [2.9701, 1.99, 1] 64 | """ 65 | prior = 0 66 | out = [] 67 | for val in r: 68 | new_val = val + prior * gamma 69 | out.append(new_val) 70 | prior = new_val 71 | return np.array(out[::-1]) 72 | 73 | # See our trained bot in action 74 | def score_model(model, num_tests, render=False): 75 | scores = [] 76 | for num_test in range(num_tests): 77 | observation = env.reset() 78 | reward_sum = 0 79 | while True: 80 | if render: 81 | env.render() 82 | 83 | state = np.reshape(observation, [1, dimen]) 84 | predict = model.predict([state])[0] 85 | action = np.argmax(predict) 86 | observation, reward, done, _ = env.step(action) 87 | reward_sum += reward 88 | if done: 89 | break 90 | scores.append(reward_sum) 91 | env.close() 92 | return np.mean(scores) 93 | 94 | model_train, model_predict = get_policy_model(env, hidden_layer_neurons, lr) 95 | model_predict.summary() 96 | 97 | reward_sum = 0 98 | 99 | num_actions = env.action_space.n 100 | 101 | # Placeholders for our observations, outputs and rewards 102 | states = np.empty(0).reshape(0,dimen) 103 | actions = np.empty(0).reshape(0,1) 104 | rewards = np.empty(0).reshape(0,1) 105 | discounted_rewards = np.empty(0).reshape(0,1) 106 | 107 | # Setting up our environment 108 | observation = env.reset() 109 | 110 | num_episode = 0 111 | 112 | losses = [] 113 | 114 | while num_episode < num_episodes: 115 | # Append the observations to our batch 116 | state = np.reshape(observation, [1, dimen]) 117 | 118 | predict = model_predict.predict([state])[0] 119 | action = np.random.choice(range(num_actions),p=predict) 120 | 121 | # Append the observations and outputs for learning 122 | states = np.vstack([states, state]) 123 | actions = np.vstack([actions, action]) 124 | 125 | # Determine the oucome of our action 126 | observation, reward, done, _ = env.step(action) 127 | reward_sum += reward 128 | rewards = np.vstack([rewards, reward]) 129 | 130 | if done: 131 | # Determine standardized rewards 132 | discounted_rewards_episode = discount_rewards(rewards, gamma) 133 | discounted_rewards = np.vstack([discounted_rewards, discounted_rewards_episode]) 134 | 135 | rewards = np.empty(0).reshape(0,1) 136 | 137 | if (num_episode + 1) % batch_size == 0: 138 | discounted_rewards -= discounted_rewards.mean() 139 | discounted_rewards /= discounted_rewards.std() 140 | discounted_rewards = discounted_rewards.squeeze() 141 | actions = actions.squeeze().astype(int) 142 | 143 | actions_train = np.zeros([len(actions), num_actions]) 144 | actions_train[np.arange(len(actions)), actions] = 1 145 | 146 | loss = model_train.train_on_batch([states, discounted_rewards], actions_train) 147 | losses.append(loss) 148 | 149 | # Clear out game variables 150 | states = np.empty(0).reshape(0,dimen) 151 | actions = np.empty(0).reshape(0,1) 152 | discounted_rewards = np.empty(0).reshape(0,1) 153 | 154 | 155 | # Print periodically 156 | if (num_episode + 1) % print_every == 0: 157 | # Print status 158 | score = score_model(model_predict,10) 159 | print("Average reward for training episode {}: {:0.2f} Test Score: {:0.2f} Loss: {:0.6f} ".format( 160 | (num_episode + 1), reward_sum/print_every, 161 | score, 162 | np.mean(losses[-print_every:]))) 163 | 164 | if score >= goal: 165 | print("Solved in {} episodes!".format(num_episode)) 166 | break 167 | reward_sum = 0 168 | 169 | num_episode += 1 170 | observation = env.reset() 171 | 172 | 173 | 174 | 175 | 176 | 177 | -------------------------------------------------------------------------------- /requirements.txt: 
-------------------------------------------------------------------------------- 1 | gym 2 | box2d-py 3 | vizdoom 4 | tensorflow-gpu 5 | baselines 6 | # collections is part of the Python standard library and does not need to be installed via pip 7 | keras 8 | --------------------------------------------------------------------------------
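A quick, optional sanity check (not part of the repository) that the dependencies listed above resolve in the active interpreter; the import names, such as Box2D for box2d-py and tensorflow for tensorflow-gpu, are assumptions about how each package is imported rather than entries from requirements.txt:

    import importlib

    # requirement name -> module name it is typically imported as
    modules = {'gym': 'gym', 'box2d-py': 'Box2D', 'vizdoom': 'vizdoom',
               'tensorflow-gpu': 'tensorflow', 'baselines': 'baselines', 'keras': 'keras'}

    for requirement, module in modules.items():
        try:
            importlib.import_module(module)
            print('ok: {} ({})'.format(requirement, module))
        except ImportError as error:
            print('missing: {} -> {}'.format(requirement, error))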