├── .DS_Store ├── .gitattributes ├── 9781484251263.jpg ├── Contributing.md ├── LICENSE.txt ├── README.md ├── __init__.py ├── _vizdoom.ini ├── algorithms ├── __init__.py ├── __init__.pyc ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── actor_critic_utilities.cpython-36.pyc │ ├── dql_utilities.cpython-36.pyc │ ├── order_book_data.cpython-36.pyc │ └── trading.cpython-36.pyc ├── actor_critic_utilities.py ├── distributions.py ├── distributions.pyc ├── dql_utilities.py ├── dql_utilities.pyc ├── order_book_data.py ├── order_book_data.pyc ├── policy_gradient_utilities.py ├── sarsa_algorithm.py ├── trading.py └── trading.pyc ├── chapter1 ├── __init__.py ├── __init__.pyc ├── __pycache__ │ ├── __init__.cpython-36.pyc │ └── open_ai_gym_example.cpython-36.pyc └── open_ai_gym_example.py ├── chapter2 ├── .DS_Store ├── __init__.py ├── __init__.pyc ├── __pycache__ │ ├── __init__.cpython-36.pyc │ └── super_mario_example.cpython-36.pyc ├── cart_pole_example.py ├── cart_pole_example.pyc └── super_mario_example.py ├── chapter3 ├── __init__.py ├── __init__.pyc ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── doom_example.cpython-36.pyc │ └── frozen_lake_example.cpython-36.pyc ├── basic.cfg ├── basic.wad ├── doom_example.py └── frozen_lake_example.py ├── chapter4 ├── .DS_Store ├── __init__.py ├── __init__.pyc ├── __pycache__ │ ├── __init__.cpython-36.pyc │ └── market_making_example.cpython-36.pyc └── market_making_example.py ├── chapter5 ├── .DS_Store ├── __init__.py ├── create_environment.py └── sonic_example.py ├── errata.md ├── neural_networks ├── Figure_1-1.png ├── __init__.py ├── __init__.pyc ├── __pycache__ │ ├── __init__.cpython-36.pyc │ └── models.cpython-36.pyc ├── gym_utilities.py ├── gym_utilities.pyc ├── market_making_models.py ├── market_making_models.pyc ├── models.py ├── models.pyc ├── policy_gradient_utilities.py ├── policy_gradient_utilities.pyc └── untitled4.py └── requirements.txt /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/.DS_Store -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /9781484251263.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/9781484251263.jpg -------------------------------------------------------------------------------- /Contributing.md: -------------------------------------------------------------------------------- 1 | # Contributing to Apress Source Code 2 | 3 | Copyright for Apress source code belongs to the author(s). However, under fair use you are encouraged to fork and contribute minor corrections and updates for the benefit of the author(s) and other readers. 4 | 5 | ## How to Contribute 6 | 7 | 1. Make sure you have a GitHub account. 8 | 2. Fork the repository for the relevant book. 9 | 3. Create a new branch on which to make your change, e.g. 10 | `git checkout -b my_code_contribution` 11 | 4. Commit your change. Include a commit message describing the correction. 
Please note that if your commit message is not clear, the correction will not be accepted. 12 | 5. Submit a pull request. 13 | 14 | Thank you for your contribution! -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Freeware License, some rights reserved 2 | 3 | Copyright (c) 2019 Taweh Beysolow 4 | 5 | Permission is hereby granted, free of charge, to anyone obtaining a copy 6 | of this software and associated documentation files (the "Software"), 7 | to work with the Software within the limits of freeware distribution and fair use. 8 | This includes the rights to use, copy, and modify the Software for personal use. 9 | Users are also allowed and encouraged to submit corrections and modifications 10 | to the Software for the benefit of other users. 11 | 12 | It is not allowed to reuse, modify, or redistribute the Software for 13 | commercial use in any way, or for a user’s educational materials such as books 14 | or blog articles without prior permission from the copyright holder. 15 | 16 | The above copyright notice and this permission notice need to be included 17 | in all copies or substantial portions of the software. 18 | 19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | AUTHORS OR COPYRIGHT HOLDERS OR APRESS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | SOFTWARE. 26 | 27 | 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Apress Source Code 2 | 3 | This repository accompanies [*Applied Reinforcement Learning with Python*](https://www.apress.com/9781484251263) by Taweh Beysolow (Apress, 2019). 4 | 5 | [comment]: #cover 6 | ![Cover image](9781484251263.jpg) 7 | 8 | Download the files as a zip using the green button, or clone the repository to your machine using Git. 9 | 10 | ## Releases 11 | 12 | Release v1.0 corresponds to the code in the published book, without corrections or updates. 13 | 14 | ## Contributions 15 | 16 | See the file Contributing.md for more information on how you can contribute to this repository. -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | -------------------------------------------------------------------------------- /_vizdoom.ini: -------------------------------------------------------------------------------- 1 | # This file was generated by ViZDoom 1.1.7 (ZDOOM 2.8.1) on Fri Mar 22 14:26:42 2019 2 | 3 | # These are the directories to automatically search for IWADs. 4 | # Each directory should be on a separate line, preceded by Path= 5 | [IWADSearch.Directories] 6 | Path=. 
7 | Path=$DOOMWADDIR 8 | Path=/Users/tawehbeysolow/Documents/_vizdoom 9 | Path=/Users/tawehbeysolow/Library/Application Support/_vizdoom 10 | Path=$PROGDIR 11 | Path=/Library/Application Support/_vizdoom 12 | 13 | # These are the directories to search for wads added with the -file 14 | # command line parameter, if they cannot be found with the path 15 | # as-is. Layout is the same as for IWADSearch.Directories 16 | [FileSearch.Directories] 17 | Path=$PROGDIR 18 | Path=/Library/Application Support/_vizdoom 19 | Path=$DOOMWADDIR 20 | 21 | # Files to automatically execute when running the corresponding game. 22 | # Each file should be on its own line, preceded by Path= 23 | 24 | [Doom.AutoExec] 25 | Path=/Users/tawehbeysolow/Documents/_vizdoom/autoexec.cfg 26 | 27 | [Heretic.AutoExec] 28 | Path=/Users/tawehbeysolow/Documents/_vizdoom/autoexec.cfg 29 | 30 | [Hexen.AutoExec] 31 | Path=/Users/tawehbeysolow/Documents/_vizdoom/autoexec.cfg 32 | 33 | [Strife.AutoExec] 34 | Path=/Users/tawehbeysolow/Documents/_vizdoom/autoexec.cfg 35 | 36 | [Chex.AutoExec] 37 | Path=/Users/tawehbeysolow/Documents/_vizdoom/autoexec.cfg 38 | 39 | # WAD files to always load. These are loaded after the IWAD but before 40 | # any files added with -file. Place each file on its own line, preceded 41 | # by Path= 42 | [Global.Autoload] 43 | 44 | # Wad files to automatically load depending on the game and IWAD you are 45 | # playing. You may have have files that are loaded for all similar IWADs 46 | # (the game) and files that are only loaded for particular IWADs. For example, 47 | # any files listed under 'doom.Autoload' will be loaded for any version of Doom, 48 | # but files listed under 'doom.doom2.Autoload' will only load when you are 49 | # playing a Doom 2 based game (doom2.wad, tnt.wad or plutonia.wad), and files listed under 50 | # 'doom.doom2.commercial.Autoload' only when playing doom2.wad. 
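# Hypothetical example (not an entry from the original config): a line such as
# Path=mymod.wad placed under [doom.doom2.Autoload] below would load for any
# Doom 2 based game (doom2.wad, tnt.wad or plutonia.wad), while the same line
# under [doom.Autoload] would load for every version of Doom.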
51 | 52 | [doom.Autoload] 53 | 54 | [doom.doom2.Autoload] 55 | 56 | [doom.doom2.commercial.Autoload] 57 | 58 | [doom.doom2.bfg.Autoload] 59 | 60 | [doom.doom2.plutonia.Autoload] 61 | 62 | [doom.doom2.tnt.Autoload] 63 | 64 | [doom.doom1.Autoload] 65 | 66 | [doom.doom1.registered.Autoload] 67 | 68 | [doom.doom1.ultimate.Autoload] 69 | 70 | [doom.doom1.bfg.Autoload] 71 | 72 | [doom.freedoom.Autoload] 73 | 74 | [doom.freedoom.demo.Autoload] 75 | 76 | [doom.freedoom.phase1.Autoload] 77 | 78 | [doom.freedoom.phase2.Autoload] 79 | 80 | [doom.freedoom.freedm.Autoload] 81 | 82 | [heretic.Autoload] 83 | 84 | [heretic.heretic.Autoload] 85 | 86 | [heretic.shadow.Autoload] 87 | 88 | [blasphemer.Autoload] 89 | 90 | [hexen.Autoload] 91 | 92 | [hexen.deathkings.Autoload] 93 | 94 | [hexen.hexen.Autoload] 95 | 96 | [strife.Autoload] 97 | 98 | [chex.Autoload] 99 | 100 | [chex.chex1.Autoload] 101 | 102 | [chex.chex3.Autoload] 103 | 104 | [urbanbrawl.Autoload] 105 | 106 | [hacx.Autoload] 107 | 108 | [hacx.hacx1.Autoload] 109 | 110 | [hacx.hacx2.Autoload] 111 | 112 | [harmony.Autoload] 113 | 114 | [square.Autoload] 115 | 116 | [square.squareware.Autoload] 117 | 118 | [square.square.Autoload] 119 | 120 | [LastRun] 121 | Version=211 122 | 123 | [GlobalSettings] 124 | gus_memsize=0 125 | midi_dmxgus=true 126 | gus_patchdir= 127 | midi_voices=32 128 | midi_config=timidity.cfg 129 | snd_efx=true 130 | snd_aldevice=Default 131 | wildmidi_enhanced_resampling=true 132 | wildmidi_reverb=false 133 | wildmidi_frequency=0 134 | wildmidi_config= 135 | fluid_chorus_type=0 136 | fluid_chorus_depth=8 137 | fluid_chorus_speed=0.3 138 | fluid_chorus_level=1 139 | fluid_chorus_voices=3 140 | fluid_reverb_level=0.57 141 | fluid_reverb_width=0.76 142 | fluid_reverb_damping=0.23 143 | fluid_reverb_roomsize=0.61 144 | fluid_threads=1 145 | fluid_samplerate=0 146 | fluid_interp=1 147 | fluid_voices=128 148 | fluid_chorus=true 149 | fluid_reverb=true 150 | fluid_gain=0.5 151 | fluid_patchset= 152 | opl_core=0 153 | opl_numchips=2 154 | timidity_frequency=44100 155 | timidity_pipe=90 156 | timidity_mastervolume=1 157 | timidity_byteswap=false 158 | timidity_8bit=false 159 | timidity_stereo=true 160 | timidity_reverb=0 161 | timidity_chorus=0 162 | timidity_extargs= 163 | timidity_exe=timidity 164 | snd_mididevice=-1 165 | spc_amp=1.875 166 | mod_dumb_mastervolume=1 167 | mod_autochip_scan_threshold=12 168 | mod_autochip_size_scan=500 169 | mod_autochip_size_force=100 170 | mod_autochip=false 171 | mod_interp=2 172 | mod_volramp=2 173 | mod_samplerate=0 174 | mod_dumb=true 175 | snd_sfxvolume=1 176 | snd_backend=openal 177 | snd_output=default 178 | snd_buffersize=0 179 | snd_samplerate=0 180 | snd_musicvolume=0.5 181 | snd_waterlp=250 182 | snd_midipatchset= 183 | snd_output_format=PCM-16 184 | snd_speakermode=Auto 185 | snd_resampler=Linear 186 | snd_waterreverb=true 187 | snd_hrtf=false 188 | snd_buffercount=0 189 | snd_driver=0 190 | opl_fullpan=true 191 | vid_tft=true 192 | m_showinputgrid=false 193 | m_show_backbutton=0 194 | m_use_mouse=1 195 | show_messages=true 196 | mouse_sensitivity=1 197 | map_point_coordinates=true 198 | vid_aspect=3 199 | vid_nowidescreen=false 200 | vid_refreshrate=0 201 | vid_vsync=false 202 | vid_defbits=8 203 | vid_defheight=480 204 | vid_defwidth=640 205 | Gamma=1 206 | statfile=zdoomstat.txt 207 | savestatistics=0 208 | snd_flipstereo=false 209 | snd_channels=32 210 | r_columnmethod=1 211 | r_quakeintensity=1 212 | cl_predict_lerpthreshold=2 213 | cl_predict_lerpscale=0.05 214 | 
cl_predict_specials=true 215 | cl_noprediction=false 216 | telezoom=true 217 | r_fakecontrast=1 218 | chase_dist=90 219 | chase_height=-8 220 | gl_cachetime=0.6 221 | gl_cachenodes=true 222 | nomonsterinterpolation=false 223 | png_gamma=0 224 | png_level=5 225 | screenshot_dir= 226 | screenshot_type=png 227 | screenshot_quiet=false 228 | use_joystick=false 229 | autosavecount=4 230 | disableautosave=0 231 | autosavenum=0 232 | smooth_mouse=false 233 | m_side=2 234 | m_forward=1 235 | m_yaw=1 236 | m_pitch=1 237 | lookstrafe=false 238 | freelook=false 239 | invertmouse=false 240 | cl_run=false 241 | demo_compress=true 242 | cl_waitforsave=true 243 | save_dir= 244 | longsavemessages=true 245 | storesavepic=true 246 | nofilecompression=false 247 | cl_capfps=true 248 | defaultiwad= 249 | queryiwad=true 250 | con_ctrl_d= 251 | con_buffersize=-1 252 | osx_additional_parameters= 253 | showendoom=0 254 | bgamma=1 255 | ggamma=1 256 | rgamma=1 257 | vid_forcesurface=false 258 | vid_displaybits=32 259 | vid_adapter=0 260 | mouse_capturemode=1 261 | m_filter=false 262 | m_noprescale=false 263 | use_mouse=false 264 | vid_winscale=1 265 | fullscreen=false 266 | vid_maxfps=200 267 | 268 | [GlobalSettings.Unknown] 269 | 270 | [Doom.Player] 271 | wi_noautostartmap=false 272 | playerclass=Fighter 273 | stillbob=0 274 | movebob=0.25 275 | neverswitchonpickup=false 276 | gender=male 277 | team=255 278 | skin=base 279 | colorset=0 280 | color=40 cf 00 281 | name=Player 282 | autoaim=35 283 | 284 | [Doom.ConsoleVariables] 285 | r_drawfuzz=1 286 | vid_nopalsubstitutions=false 287 | snd_pitched=false 288 | menu_screenratios=-1 289 | snd_menuvolume=0.6 290 | show_obituaries=true 291 | am_showmaplabel=2 292 | crosshairgrow=false 293 | crosshairscale=false 294 | crosshairhealth=true 295 | crosshaircolor=ff 00 00 296 | crosshairforce=false 297 | crosshair=0 298 | st_scale=true 299 | paletteflash=0 300 | hudcolor_stats=3 301 | hudcolor_statnames=6 302 | hudcolor_xyco=3 303 | hudcolor_ttim=5 304 | hudcolor_ltim=8 305 | hudcolor_time=6 306 | hudcolor_titl=10 307 | hud_berserk_health=true 308 | hud_armor_green=100 309 | hud_armor_yellow=50 310 | hud_armor_red=25 311 | hud_health_green=100 312 | hud_health_yellow=50 313 | hud_health_red=25 314 | hud_ammo_yellow=50 315 | hud_ammo_red=25 316 | hud_showlag=0 317 | hud_timecolor=5 318 | hud_showtime=0 319 | hud_showammo=2 320 | hud_showweapons=true 321 | hud_showscore=false 322 | hud_showstats=false 323 | hud_showitems=false 324 | hud_showmonsters=true 325 | hud_showsecrets=true 326 | hud_althud=false 327 | hud_althudscale=2 328 | st_oldouch=false 329 | cl_maxdecals=1024 330 | cl_spreaddecals=true 331 | transsouls=0.75 332 | wi_showtotaltime=true 333 | wi_percents=true 334 | dimcolor=ff d7 00 335 | dimamount=-1 336 | hud_scale=true 337 | allcheats=false 338 | r_stretchsky=true 339 | r_shadercolormaps=true 340 | screenblocks=10 341 | r_deathcamera=false 342 | cl_showsecretmessage=true 343 | cl_bloodtype=1 344 | cl_pufftype=0 345 | addrocketexplosion=false 346 | cl_missiledecals=true 347 | cl_doautoaim=false 348 | cl_bloodsplats=true 349 | cl_showmultikills=false 350 | cl_showsprees=false 351 | r_maxparticles=4092 352 | r_rail_trailsparsity=1 353 | r_rail_spiralsparsity=1 354 | r_rail_smartspiral=false 355 | cl_rockettrails=3 356 | dlg_musicvolume=1 357 | sb_teamdeathmatch_headingcolor=6 358 | sb_teamdeathmatch_enable=true 359 | sb_deathmatch_otherplayercolor=2 360 | sb_deathmatch_yourplayercolor=3 361 | sb_deathmatch_headingcolor=6 362 | sb_deathmatch_enable=true 363 | 
sb_cooperative_otherplayercolor=2 364 | sb_cooperative_yourplayercolor=3 365 | sb_cooperative_headingcolor=6 366 | sb_cooperative_enable=true 367 | nametagcolor=5 368 | displaynametags=0 369 | language=auto 370 | compatmode=0 371 | vid_cursor=None 372 | wipetype=0 373 | dehload=0 374 | chat_substitution=false 375 | chatmacro0=No 376 | chatmacro9=Yes 377 | chatmacro8=I'll take care of it. 378 | chatmacro7=Come here! 379 | chatmacro6=Next time, scumbag... 380 | chatmacro5=You suck! 381 | chatmacro4=Help! 382 | chatmacro3=I'm not looking too good! 383 | chatmacro2=I'm OK. 384 | chatmacro1=I'm ready to kick butt! 385 | lookspring=true 386 | con_midtime=0 387 | msgmidcolor2=4 388 | msgmidcolor=5 389 | msg4color=3 390 | msg3color=3 391 | msg2color=2 392 | msg1color=5 393 | msg0color=6 394 | msg=0 395 | con_alpha=0.75 396 | con_scaletext=0 397 | con_centernotify=false 398 | con_notifytime=0 399 | con_notablist=false 400 | cl_bbannounce=false 401 | am_followplayer=true 402 | am_textured=true 403 | am_ovthingcolor_citem=e8 88 00 404 | am_ovthingcolor_item=e8 88 00 405 | am_ovthingcolor_ncmonster=e8 88 00 406 | am_ovthingcolor_monster=e8 88 00 407 | am_ovthingcolor_friend=e8 88 00 408 | am_ovthingcolor=e8 88 00 409 | am_ovsecretsectorcolor=00 ff ff 410 | am_ovinterlevelcolor=ff ff 00 411 | am_ovtelecolor=ff ff 00 412 | am_ovunseencolor=00 22 6e 413 | am_ovcdwallcolor=00 88 44 414 | am_ovfdwallcolor=00 88 44 415 | am_ovefwallcolor=00 88 44 416 | am_ovlockedcolor=00 88 44 417 | am_ovotherwallscolor=00 88 44 418 | am_ovspecialwallcolor=ff ff ff 419 | am_ovsecretwallcolor=00 88 44 420 | am_ovwallcolor=00 ff 00 421 | am_ovyourcolor=fc e8 d8 422 | am_thingcolor_citem=fc fc fc 423 | am_thingcolor_item=fc fc fc 424 | am_thingcolor_ncmonster=fc fc fc 425 | am_thingcolor_monster=fc fc fc 426 | am_thingcolor_friend=fc fc fc 427 | am_secretsectorcolor=ff 00 ff 428 | am_interlevelcolor=ff 00 00 429 | am_intralevelcolor=00 00 ff 430 | am_lockedcolor=00 78 00 431 | am_notseencolor=6c 6c 6c 432 | am_xhaircolor=80 80 80 433 | am_gridcolor=8b 5a 2b 434 | am_thingcolor=fc fc fc 435 | am_efwallcolor=66 55 55 436 | am_cdwallcolor=4c 38 20 437 | am_fdwallcolor=88 70 58 438 | am_tswallcolor=88 88 88 439 | am_specialwallcolor=ff ff ff 440 | am_secretwallcolor=00 00 00 441 | am_wallcolor=2c 18 08 442 | am_yourcolor=fc e8 d8 443 | am_backcolor=6c 54 40 444 | am_showthingsprites=0 445 | am_showtriggerlines=true 446 | am_showkeys=true 447 | am_drawmapback=0 448 | am_map_secrets=1 449 | am_customcolors=true 450 | am_colorset=0 451 | am_showtotaltime=false 452 | am_showtime=false 453 | am_showitems=false 454 | am_showmonsters=false 455 | am_showsecrets=false 456 | am_overlay=0 457 | am_rotate=0 458 | 459 | [Doom.LocalServerInfo] 460 | sv_corpsequeuesize=64 461 | forcewater=false 462 | sv_smartaim=0 463 | sv_disableautohealth=false 464 | sv_dropstyle=0 465 | compatflags2=0 466 | compatflags=0 467 | 468 | [Doom.UnknownConsoleVariables] 469 | 470 | [Doom.ConsoleAliases] 471 | 472 | [Doom.Bindings] 473 | 1=slot 1 474 | 2=slot 2 475 | 3=slot 3 476 | 4=slot 4 477 | 5=slot 5 478 | 6=slot 6 479 | 7=slot 7 480 | 8=slot 8 481 | 9=slot 9 482 | 0=slot 0 483 | -=sizedown 484 | Equals=sizeup 485 | tab=togglemap 486 | t=messagemode 487 | LeftBracket=invprev 488 | RightBracket=invnext 489 | enter=invuse 490 | ctrl=+attack 491 | `=toggleconsole 492 | shift=+speed 493 | \=+showscores 494 | ,=+moveleft 495 | .=+moveright 496 | alt=+strafe 497 | space=+use 498 | capslock=toggle cl_run 499 | f1=menu_help 500 | f2=menu_save 501 | f3=menu_load 502 | 
f4=menu_options 503 | f5=menu_display 504 | f6=quicksave 505 | f7=menu_endgame 506 | f8=togglemessages 507 | f9=quickload 508 | f10=menu_quit 509 | f11=bumpgamma 510 | f12=spynext 511 | sysrq=screenshot 512 | pause=pause 513 | home=land 514 | uparrow=+forward 515 | pgup=+moveup 516 | leftarrow=+left 517 | rightarrow=+right 518 | end=centerview 519 | downarrow=+back 520 | pgdn=+lookup 521 | ins=+movedown 522 | del=+lookdown 523 | mouse1=+attack 524 | mouse2=+strafe 525 | mouse3=+forward 526 | mouse4=+speed 527 | joy1=+attack 528 | joy2=+strafe 529 | joy3=+speed 530 | joy4=+use 531 | mwheelup=weapprev 532 | mwheeldown=weapnext 533 | mwheelright=invnext 534 | mwheelleft=invprev 535 | dpadup=togglemap 536 | dpaddown=invuse 537 | dpadleft=invprev 538 | dpadright=invnext 539 | pad_start=pause 540 | pad_back=menu_main 541 | lthumb=crouch 542 | lshoulder=weapprev 543 | rshoulder=weapnext 544 | ltrigger=+altattack 545 | rtrigger=+attack 546 | pad_a=+use 547 | pad_y=+jump 548 | 549 | [Doom.DoubleBindings] 550 | 551 | [Doom.AutomapBindings] 552 | 0=am_gobig 553 | -=+am_zoomout 554 | Equals=+am_zoomin 555 | p=am_toggletexture 556 | f=am_togglefollow 557 | g=am_togglegrid 558 | c=am_clearmarks 559 | m=am_setmark 560 | kp-=+am_zoomout 561 | kp+=+am_zoomin 562 | uparrow=+am_panup 563 | leftarrow=+am_panleft 564 | rightarrow=+am_panright 565 | downarrow=+am_pandown 566 | mwheelup=am_zoom 1.2 567 | mwheeldown=am_zoom -1.2 568 | 569 | -------------------------------------------------------------------------------- /algorithms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/algorithms/__init__.py -------------------------------------------------------------------------------- /algorithms/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/algorithms/__init__.pyc -------------------------------------------------------------------------------- /algorithms/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/algorithms/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /algorithms/__pycache__/actor_critic_utilities.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/algorithms/__pycache__/actor_critic_utilities.cpython-36.pyc -------------------------------------------------------------------------------- /algorithms/__pycache__/dql_utilities.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/algorithms/__pycache__/dql_utilities.cpython-36.pyc -------------------------------------------------------------------------------- /algorithms/__pycache__/order_book_data.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/algorithms/__pycache__/order_book_data.cpython-36.pyc -------------------------------------------------------------------------------- /algorithms/__pycache__/trading.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/algorithms/__pycache__/trading.cpython-36.pyc -------------------------------------------------------------------------------- /algorithms/actor_critic_utilities.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Mar 16 06:54:29 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | import time, tensorflow as tf, numpy as np 10 | from baselines.common.runners import AbstractEnvRunner 11 | from baselines.common import explained_variance 12 | 13 | def mse(pred, target): 14 | return tf.square(pred-target)/2. 15 | 16 | def find_trainable_variables(key): 17 | with tf.variable_scope(key): 18 | return tf.trainable_variables() 19 | 20 | def swap_flatten_axes(array): 21 | return arrary.swapaxes(0, 1).reshape(array.shape[0] * array.shape[1], * array.shape[2:]) 22 | 23 | class Model(object): 24 | 25 | def __init__(self, session, policy_model, observation_space, action_space, n_environments, 26 | n_steps, entropy_coefficient, value_coefficient, max_grad_norm): 27 | 28 | session.run(tf.global_variables_initializer()) 29 | actions_ = tf.placeholder(tf.int32, [None], name='actions') 30 | advantages_ = tf.placeholder(tf.float32, [None], name='advantages') 31 | rewards_ = tf.placeholder(tf.float32, [None], name='rewards') 32 | learning_rate = tf.placeholder(tf.float32, name='learning_rate') 33 | step_model = policy_model(session, observation_space, action_space, n_environments, 1, reuse=False) 34 | train_model = policy_model(session, observation_space, action_space, n_environments*n_steps, n_steps, reuse=tf.AUTO_REUSE) 35 | 36 | error_rate = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.logits, labels=actions_) 37 | mean_squared_error = tf.reduce_mean(advantages_ * error_rate) 38 | 39 | value_loss = tf.reduce_mean(mse(tf.squeeze(train_model.value_function),rewards_)) 40 | entropy = tf.reduce_mean(train_model.distribution.entropy()) 41 | loss = mean_squared_error - entropy * entropy_coefficient + value_loss * value_coefficient 42 | 43 | params = find_trainable_variables('model') 44 | gradients = tf.gradients(loss, params) 45 | if max_grad_norm is not None: 46 | gradients, grad_norm = tf.clip_by_global_norm(gradients, max_grad_norm) 47 | 48 | gradients = list(zip(gradients, params)) 49 | trainer = tf.train.RMSPropOptimizer(learning_rate=learning_rate, decay=0.99, epsilon=1e-5) 50 | _train = trainer.apply_gradients(gradients) 51 | 52 | def train(states_in, actions, returns, values, learning_rate): 53 | advantages = returns - values 54 | 55 | dictionary = {train_model.inputs_: states_in, 56 | actions_: actions, 57 | advantages_: advantages, 58 | rewards_: returns, 59 | learning_rate: learning_rate} 60 | 61 | with tf.Session() as session: 62 | _policy_loss, _value_loss, _policy_entropy, _= session.run([mean_squared_error, 63 | value_loss, 64 | entropy, 65 | _train], dictionary) 66 | return _policy_loss, _value_loss, _policy_entropy 67 | 68 | def save(save_path): 69 | saver = 
tf.train.Saver() 70 | saver.save(session, save_path) 71 | 72 | def load(load_path): 73 | saver = tf.train.Saver() 74 | print('Loading ' + load_path) 75 | saver.restore(session, load_path) 76 | 77 | self.train = train 78 | self.train_model = train_model 79 | self.step_model = step_model 80 | self.step = step_model.step 81 | self.value = step_model.value 82 | self.initial_state = step_model.initial_state 83 | self.save = save 84 | self.load = load 85 | tf.global_variables_initializer().run(session=tf.Session()) 86 | 87 | class ModelTrainer(AbstractEnvRunner): 88 | 89 | def __init__(self, environment, model, n_steps, n_timesteps, gamma, _lambda): 90 | self.environment = environment 91 | self.model = model 92 | self.n_steps = n_steps 93 | self.gamma = gamma 94 | self._lambda = _lambda 95 | self.n_timesteps = n_timesteps 96 | self.observations = environment.reset() 97 | self.dones = False 98 | 99 | def step(self): 100 | 101 | _observations, _actions, _rewards, _values, _dones = [],[],[],[],[] 102 | 103 | for _ in range(self.n_steps): 104 | actions, values = self.model.step(self.observations, self.dones) 105 | _observations.append(np.copy(self.observations)) 106 | _actions.append(actions) 107 | _values.append(values) 108 | _dones.append(self.dones) 109 | if self.dones: self.environment.reset() 110 | 111 | for action in actions: 112 | self.environment.render() 113 | self.observations[:], rewards, self.dones, _ = self.environment.step(action) 114 | _rewards.append(rewards) 115 | 116 | #batch of steps to batch of rollouts 117 | _observations = np.asarray(_observations, dtype=np.uint8) 118 | _rewards = np.asarray(_rewards, dtype=np.float32) 119 | _actions = np.asarray(_actions, dtype=np.int32) 120 | _values = np.asarray(_values, dtype=np.float32) 121 | _dones = np.asarray(_dones, dtype=np.bool) 122 | last_values = self.model.value(self.observations) 123 | _returns = np.zeros_like(_rewards) 124 | _advantages = np.zeros_like(_rewards) 125 | last_lambda = 0 126 | 127 | for t in reversed(range(self.n_steps)): 128 | if t == self.nsteps - 1: 129 | next_nonterminal = 1.0 - self.dones 130 | next_values = last_values 131 | else: 132 | next_nonterminal = 1.0 - _dones[t+1] 133 | next_values = _values[t+1] 134 | 135 | delta = _rewards[t] + self.gamma * nextvalues * nextnonterminal - _values[t] 136 | _advantages[t] = last_lambda = delta + self.gamma * self._lambda * nextnonterminal * last_lambda 137 | 138 | _returns = _advantages + _values 139 | return map(swap_flatten_axes, (_observations, _actions, _returns, _values)) 140 | 141 | 142 | def train_model(policy_model, environment, n_steps, max_steps, gamma, _lambda, 143 | value_coefficient, entropy_coefficient, learning_rate, max_grad_norm, log_interval): 144 | 145 | n_epochs = 4 146 | n_batches = 8 147 | n_environments = 1 #environment.num_envs 148 | observation_space = environment.observation_space 149 | action_space = environment.action_space 150 | batch_size = n_environments * n_steps 151 | batch_train_size = batch_size // n_batches 152 | assert batch_size % n_batches == 0 153 | session = tf.Session() 154 | 155 | model = Model(session=session, 156 | policy_model=policy_model, 157 | observation_space=observation_space, 158 | action_space=action_space, 159 | n_environments=1, 160 | n_steps=1, 161 | entropy_coefficient=0, 162 | value_coefficient=0, 163 | max_grad_norm=0) 164 | 165 | model_trainer = ModelTrainer(environment=environment, 166 | model=model, 167 | n_steps=n_steps, 168 | n_timesteps=max_steps, 169 | gamma=gamma, 170 | _lambda=_lambda) 171 | 172 | 
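    # A minimal clarifying sketch (restated with consistent, illustrative
    # variable names; not part of the original source) of the generalized
    # advantage estimation (GAE) recursion that ModelTrainer.step() above is
    # intended to compute over each rollout:
    #
    #   last_lambda = 0.0
    #   for t in reversed(range(n_steps)):
    #       if t == n_steps - 1:
    #           next_nonterminal = 1.0 - done_after_rollout
    #           next_values = value_of_last_observation
    #       else:
    #           next_nonterminal = 1.0 - dones[t + 1]
    #           next_values = values[t + 1]
    #       delta = rewards[t] + gamma * next_values * next_nonterminal - values[t]
    #       advantages[t] = last_lambda = delta + gamma * _lambda * next_nonterminal * last_lambda
    #   returns = advantages + values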
initial_start_time = time.time() 173 | 174 | 175 | for update in range(1, max_steps//batch_size+1): 176 | 177 | timer_start = time.time() 178 | observations, actions, returns, values = model_trainer.step() 179 | mb_losses = [] 180 | total_batches_train = 0 181 | indices = np.arange(batch_size) 182 | 183 | for _ in range(n_epochs): 184 | np.random.shuffle(indices) 185 | for start in range(0, batch_size, batch_train_size): 186 | end = start + batch_train_size 187 | mbinds = indices[start:end] 188 | slices = (arr[mbinds] for arr in (obs, actions, returns, values)) 189 | mb_losses.append(model.train(*slices, lr)) 190 | 191 | loss = np.mean(mb_losses, axis=0) 192 | frames_per_second = int(batch_size / (time.time() - initial_start_time)) 193 | 194 | if update % log_interval == 0 or update == 1: 195 | 196 | """ 197 | Computes fraction of variance that ypred explains about y. 198 | Returns 1 - Var[y-ypred] / Var[y] 199 | interpretation: 200 | explained_variance = 0 => might as well have predicted zero 201 | explained_variance = 1 => perfect prediction 202 | explained_variance < 0 => worse than just predicting zero 203 | """ 204 | _explained_variance = explained_variance(values, returns) 205 | logger.record_tabular("nupdates", update) 206 | logger.record_tabular("total_timesteps", update*batch_size) 207 | logger.record_tabular("fps", frames_per_second) 208 | logger.record_tabular("policy_loss", float(loss[0])) 209 | logger.record_tabular("policy_entropy", float(loss[2])) 210 | logger.record_tabular("value_loss", float(loss[1])) 211 | logger.record_tabular("explained_variance", float(_explained_variance)) 212 | logger.record_tabular("time elapsed", float(time.time() - initial_start_time)) 213 | logger.dump_tabular() 214 | 215 | savepath = "./models/" + str(update) + "/model.ckpt" 216 | model.save(savepath) 217 | print('Saving to', savepath) 218 | 219 | environment.close() 220 | return model 221 | -------------------------------------------------------------------------------- /algorithms/distributions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Jun 18 15:41:31 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | import tensorflow as tf 10 | import numpy as np 11 | from baselines.a2c.utils import fc 12 | from tensorflow.python.ops import math_ops 13 | #import baselines.common.tf_util as U 14 | 15 | class Pd(object): 16 | """ 17 | A particular probability distribution 18 | """ 19 | def flatparam(self): 20 | raise NotImplementedError 21 | def mode(self): 22 | raise NotImplementedError 23 | def neglogp(self, x): 24 | # Usually it's easier to define the negative logprob 25 | raise NotImplementedError 26 | def kl(self, other): 27 | raise NotImplementedError 28 | def entropy(self): 29 | raise NotImplementedError 30 | def sample(self): 31 | raise NotImplementedError 32 | def logp(self, x): 33 | return - self.neglogp(x) 34 | def get_shape(self): 35 | return self.flatparam().shape 36 | @property 37 | def shape(self): 38 | return self.get_shape() 39 | def __getitem__(self, idx): 40 | return self.__class__(self.flatparam()[idx]) 41 | 42 | class PdType(object): 43 | """ 44 | Parametrized family of probability distributions 45 | """ 46 | def pdclass(self): 47 | raise NotImplementedError 48 | def pdfromflat(self, flat): 49 | return self.pdclass()(flat) 50 | def pdfromlatent(self, latent_vector, init_scale, init_bias): 51 | raise NotImplementedError 52 | def param_shape(self): 53 | raise 
NotImplementedError 54 | def sample_shape(self): 55 | raise NotImplementedError 56 | def sample_dtype(self): 57 | raise NotImplementedError 58 | 59 | def param_placeholder(self, prepend_shape, name=None): 60 | return tf.placeholder(dtype=tf.float32, shape=prepend_shape+self.param_shape(), name=name) 61 | def sample_placeholder(self, prepend_shape, name=None): 62 | return tf.placeholder(dtype=self.sample_dtype(), shape=prepend_shape+self.sample_shape(), name=name) 63 | 64 | def __eq__(self, other): 65 | return (type(self) == type(other)) and (self.__dict__ == other.__dict__) 66 | 67 | class CategoricalPdType(PdType): 68 | def __init__(self, ncat): 69 | self.ncat = ncat 70 | def pdclass(self): 71 | return CategoricalPd 72 | def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0): 73 | pdparam = _matching_fc(latent_vector, 'pi', self.ncat, init_scale=init_scale, init_bias=init_bias) 74 | return self.pdfromflat(pdparam), pdparam 75 | 76 | def param_shape(self): 77 | return [self.ncat] 78 | def sample_shape(self): 79 | return [] 80 | def sample_dtype(self): 81 | return tf.int32 82 | 83 | 84 | class MultiCategoricalPdType(PdType): 85 | def __init__(self, nvec): 86 | self.ncats = nvec.astype('int32') 87 | assert (self.ncats > 0).all() 88 | def pdclass(self): 89 | return MultiCategoricalPd 90 | def pdfromflat(self, flat): 91 | return MultiCategoricalPd(self.ncats, flat) 92 | 93 | def pdfromlatent(self, latent, init_scale=1.0, init_bias=0.0): 94 | pdparam = _matching_fc(latent, 'pi', self.ncats.sum(), init_scale=init_scale, init_bias=init_bias) 95 | return self.pdfromflat(pdparam), pdparam 96 | 97 | def param_shape(self): 98 | return [sum(self.ncats)] 99 | def sample_shape(self): 100 | return [len(self.ncats)] 101 | def sample_dtype(self): 102 | return tf.int32 103 | 104 | class DiagGaussianPdType(PdType): 105 | def __init__(self, size): 106 | self.size = size 107 | def pdclass(self): 108 | return DiagGaussianPd 109 | 110 | def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0): 111 | mean = _matching_fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias) 112 | logstd = tf.get_variable(name='pi/logstd', shape=[1, self.size], initializer=tf.zeros_initializer()) 113 | pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) 114 | return self.pdfromflat(pdparam), mean 115 | 116 | def param_shape(self): 117 | return [2*self.size] 118 | def sample_shape(self): 119 | return [self.size] 120 | def sample_dtype(self): 121 | return tf.float32 122 | 123 | class BernoulliPdType(PdType): 124 | def __init__(self, size): 125 | self.size = size 126 | def pdclass(self): 127 | return BernoulliPd 128 | def param_shape(self): 129 | return [self.size] 130 | def sample_shape(self): 131 | return [self.size] 132 | def sample_dtype(self): 133 | return tf.int32 134 | def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0): 135 | pdparam = _matching_fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias) 136 | return self.pdfromflat(pdparam), pdparam 137 | 138 | # WRONG SECOND DERIVATIVES 139 | # class CategoricalPd(Pd): 140 | # def __init__(self, logits): 141 | # self.logits = logits 142 | # self.ps = tf.nn.softmax(logits) 143 | # @classmethod 144 | # def fromflat(cls, flat): 145 | # return cls(flat) 146 | # def flatparam(self): 147 | # return self.logits 148 | # def mode(self): 149 | # return U.argmax(self.logits, axis=-1) 150 | # def logp(self, x): 151 | # return -tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, x) 152 | # 
def kl(self, other): 153 | # return tf.nn.softmax_cross_entropy_with_logits(other.logits, self.ps) \ 154 | # - tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 155 | # def entropy(self): 156 | # return tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 157 | # def sample(self): 158 | # u = tf.random_uniform(tf.shape(self.logits)) 159 | # return U.argmax(self.logits - tf.log(-tf.log(u)), axis=-1) 160 | 161 | class CategoricalPd(Pd): 162 | def __init__(self, logits): 163 | self.logits = logits 164 | def flatparam(self): 165 | return self.logits 166 | def mode(self): 167 | return tf.argmax(self.logits, axis=-1) 168 | 169 | @property 170 | def mean(self): 171 | return tf.nn.softmax(self.logits) 172 | def neglogp(self, x): 173 | # return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x) 174 | # Note: we can't use sparse_softmax_cross_entropy_with_logits because 175 | # the implementation does not allow second-order derivatives... 176 | if x.dtype in {tf.uint8, tf.int32, tf.int64}: 177 | # one-hot encoding 178 | x_shape_list = x.shape.as_list() 179 | logits_shape_list = self.logits.get_shape().as_list()[:-1] 180 | for xs, ls in zip(x_shape_list, logits_shape_list): 181 | if xs is not None and ls is not None: 182 | assert xs == ls, 'shape mismatch: {} in x vs {} in logits'.format(xs, ls) 183 | 184 | x = tf.one_hot(x, self.logits.get_shape().as_list()[-1]) 185 | else: 186 | # already encoded 187 | assert x.shape.as_list() == self.logits.shape.as_list() 188 | 189 | return tf.nn.softmax_cross_entropy_with_logits_v2( 190 | logits=self.logits, 191 | labels=x) 192 | def kl(self, other): 193 | a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keepdims=True) 194 | a1 = other.logits - tf.reduce_max(other.logits, axis=-1, keepdims=True) 195 | ea0 = tf.exp(a0) 196 | ea1 = tf.exp(a1) 197 | z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True) 198 | z1 = tf.reduce_sum(ea1, axis=-1, keepdims=True) 199 | p0 = ea0 / z0 200 | return tf.reduce_sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=-1) 201 | def entropy(self): 202 | a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keepdims=True) 203 | ea0 = tf.exp(a0) 204 | z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True) 205 | p0 = ea0 / z0 206 | return tf.reduce_sum(p0 * (tf.log(z0) - a0), axis=-1) 207 | def sample(self): 208 | u = tf.random_uniform(tf.shape(self.logits), dtype=self.logits.dtype) 209 | return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=-1) 210 | @classmethod 211 | def fromflat(cls, flat): 212 | return cls(flat) 213 | 214 | class MultiCategoricalPd(Pd): 215 | def __init__(self, nvec, flat): 216 | self.flat = flat 217 | self.categoricals = list(map(CategoricalPd, 218 | tf.split(flat, np.array(nvec, dtype=np.int32), axis=-1))) 219 | def flatparam(self): 220 | return self.flat 221 | def mode(self): 222 | return tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1), tf.int32) 223 | def neglogp(self, x): 224 | return tf.add_n([p.neglogp(px) for p, px in zip(self.categoricals, tf.unstack(x, axis=-1))]) 225 | def kl(self, other): 226 | return tf.add_n([p.kl(q) for p, q in zip(self.categoricals, other.categoricals)]) 227 | def entropy(self): 228 | return tf.add_n([p.entropy() for p in self.categoricals]) 229 | def sample(self): 230 | return tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1), tf.int32) 231 | @classmethod 232 | def fromflat(cls, flat): 233 | raise NotImplementedError 234 | 235 | class DiagGaussianPd(Pd): 236 | def __init__(self, flat): 237 | self.flat = flat 
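        # Descriptive note: `flat` holds [mean, logstd] concatenated along the
        # last axis (see DiagGaussianPdType.pdfromlatent above); the tf.split
        # below recovers the two halves, and std = tf.exp(logstd) gives the
        # per-dimension standard deviation of the diagonal Gaussian.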
238 | mean, logstd = tf.split(axis=len(flat.shape)-1, num_or_size_splits=2, value=flat) 239 | self.mean = mean 240 | self.logstd = logstd 241 | self.std = tf.exp(logstd) 242 | def flatparam(self): 243 | return self.flat 244 | def mode(self): 245 | return self.mean 246 | def neglogp(self, x): 247 | return 0.5 * tf.reduce_sum(tf.square((x - self.mean) / self.std), axis=-1) \ 248 | + 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[-1]) \ 249 | + tf.reduce_sum(self.logstd, axis=-1) 250 | def kl(self, other): 251 | assert isinstance(other, DiagGaussianPd) 252 | return tf.reduce_sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=-1) 253 | def entropy(self): 254 | return tf.reduce_sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), axis=-1) 255 | def sample(self): 256 | return self.mean + self.std * tf.random_normal(tf.shape(self.mean)) 257 | @classmethod 258 | def fromflat(cls, flat): 259 | return cls(flat) 260 | 261 | 262 | class BernoulliPd(Pd): 263 | def __init__(self, logits): 264 | self.logits = logits 265 | self.ps = tf.sigmoid(logits) 266 | def flatparam(self): 267 | return self.logits 268 | @property 269 | def mean(self): 270 | return self.ps 271 | def mode(self): 272 | return tf.round(self.ps) 273 | def neglogp(self, x): 274 | return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=-1) 275 | def kl(self, other): 276 | return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=-1) - tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1) 277 | def entropy(self): 278 | return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1) 279 | def sample(self): 280 | u = tf.random_uniform(tf.shape(self.ps)) 281 | return tf.to_float(math_ops.less(u, self.ps)) 282 | @classmethod 283 | def fromflat(cls, flat): 284 | return cls(flat) 285 | 286 | def make_pdtype(ac_space): 287 | from gym import spaces 288 | if isinstance(ac_space, spaces.Box): 289 | assert len(ac_space.shape) == 1 290 | return DiagGaussianPdType(ac_space.shape[0]) 291 | elif isinstance(ac_space, spaces.Discrete): 292 | return CategoricalPdType(ac_space.n) 293 | elif isinstance(ac_space, spaces.MultiDiscrete): 294 | return MultiCategoricalPdType(ac_space.nvec) 295 | elif isinstance(ac_space, spaces.MultiBinary): 296 | return BernoulliPdType(ac_space.n) 297 | else: 298 | raise NotImplementedError 299 | 300 | def shape_el(v, i): 301 | maybe = v.get_shape()[i] 302 | if maybe is not None: 303 | return maybe 304 | else: 305 | return tf.shape(v)[i] 306 | 307 | ''' 308 | @U.in_session 309 | def test_probtypes(): 310 | np.random.seed(0) 311 | 312 | pdparam_diag_gauss = np.array([-.2, .3, .4, -.5, .1, -.5, .1, 0.8]) 313 | diag_gauss = DiagGaussianPdType(pdparam_diag_gauss.size // 2) #pylint: disable=E1101 314 | validate_probtype(diag_gauss, pdparam_diag_gauss) 315 | 316 | pdparam_categorical = np.array([-.2, .3, .5]) 317 | categorical = CategoricalPdType(pdparam_categorical.size) #pylint: disable=E1101 318 | validate_probtype(categorical, pdparam_categorical) 319 | 320 | nvec = [1,2,3] 321 | pdparam_multicategorical = np.array([-.2, .3, .5, .1, 1, -.1]) 322 | multicategorical = MultiCategoricalPdType(nvec) #pylint: disable=E1101 323 | validate_probtype(multicategorical, pdparam_multicategorical) 324 | 325 | pdparam_bernoulli = np.array([-.2, .3, .5]) 326 | bernoulli = 
BernoulliPdType(pdparam_bernoulli.size) #pylint: disable=E1101 327 | validate_probtype(bernoulli, pdparam_bernoulli) 328 | 329 | 330 | def validate_probtype(probtype, pdparam): 331 | N = 100000 332 | # Check to see if mean negative log likelihood == differential entropy 333 | Mval = np.repeat(pdparam[None, :], N, axis=0) 334 | M = probtype.param_placeholder([N]) 335 | X = probtype.sample_placeholder([N]) 336 | pd = probtype.pdfromflat(M) 337 | calcloglik = U.function([X, M], pd.logp(X)) 338 | calcent = U.function([M], pd.entropy()) 339 | Xval = tf.get_default_session().run(pd.sample(), feed_dict={M:Mval}) 340 | logliks = calcloglik(Xval, Mval) 341 | entval_ll = - logliks.mean() #pylint: disable=E1101 342 | entval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101 343 | entval = calcent(Mval).mean() #pylint: disable=E1101 344 | assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr # within 3 sigmas 345 | 346 | # Check to see if kldiv[p,q] = - ent[p] - E_p[log q] 347 | M2 = probtype.param_placeholder([N]) 348 | pd2 = probtype.pdfromflat(M2) 349 | q = pdparam + np.random.randn(pdparam.size) * 0.1 350 | Mval2 = np.repeat(q[None, :], N, axis=0) 351 | calckl = U.function([M, M2], pd.kl(pd2)) 352 | klval = calckl(Mval, Mval2).mean() #pylint: disable=E1101 353 | logliks = calcloglik(Xval, Mval2) 354 | klval_ll = - entval - logliks.mean() #pylint: disable=E1101 355 | klval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101 356 | assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr # within 3 sigmas 357 | print('ok on', probtype, pdparam) 358 | ''' 359 | 360 | def _matching_fc(tensor, name, size, init_scale, init_bias): 361 | if tensor.shape[-1] == size: 362 | return tensor 363 | else: 364 | return fc(tensor, name, size, init_scale=init_scale, init_bias=init_bias) 365 | -------------------------------------------------------------------------------- /algorithms/distributions.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/algorithms/distributions.pyc -------------------------------------------------------------------------------- /algorithms/dql_utilities.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Mar 18 10:59:09 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | import numpy as np 10 | from skimage import transform 11 | from collections import deque 12 | from vizdoom import * 13 | 14 | class Memory(): 15 | 16 | def __init__(self, max_size): 17 | self.buffer = deque(maxlen = max_size) 18 | 19 | def add(self, experience): 20 | self.buffer.append(experience) 21 | 22 | def sample(self, batch_size): 23 | buffer_size = len(self.buffer) 24 | index = np.random.choice(np.arange(buffer_size), 25 | size=batch_size, 26 | replace=True) 27 | 28 | return [self.buffer[i] for i in index] 29 | 30 | def create_environment(filepath='/Users/tawehbeysolow/Desktop/applied_rl_python/chapter3/'): 31 | game = DoomGame() 32 | game.load_config(filepath+'basic.cfg') 33 | game.set_doom_scenario_path(filepath+'basic.wad') 34 | game.init() 35 | 36 | left = [1, 0, 0] 37 | right = [0, 1, 0] 38 | shoot = [0, 0, 1] 39 | possible_actions = [left, right, shoot] 40 | return game, possible_actions 41 | 42 | 43 | def preprocess_frame(frame): 44 | cropped_frame = frame[30:-10,30:-30] 45 | normalized_frame = cropped_frame/float(255) 46 
| preprocessed_frame = transform.resize(normalized_frame, [84,84]) 47 | return preprocessed_frame 48 | 49 | def stack_frames(stacked_frames, state, new_episode, stack_size=4): 50 | 51 | frame = preprocess_frame(state) 52 | 53 | if new_episode == True: 54 | 55 | stacked_frames = deque([np.zeros((84,84), dtype=np.int) for i in range(stack_size)], maxlen=4) 56 | for i in range(4): 57 | stacked_frames.append(frame) 58 | 59 | stacked_state = np.stack(stacked_frames, axis=2) 60 | 61 | else: 62 | 63 | stacked_frames.append(frame) 64 | stacked_state = np.stack(stacked_frames, axis=2) 65 | 66 | return stacked_state, stacked_frames -------------------------------------------------------------------------------- /algorithms/dql_utilities.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/algorithms/dql_utilities.pyc -------------------------------------------------------------------------------- /algorithms/order_book_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Mar 25 15:56:09 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | from tgym.core import DataGenerator 10 | import numpy as np, csv 11 | 12 | def remove_non_ascii(obj): 13 | return ''.join([character for character in obj if ord(character) < 128]) 14 | 15 | class bid_ask_data(DataGenerator): 16 | 17 | def __init__(self, **gen_kwargs): 18 | """Initialisation function. The API (gen_kwargs) should be defined in 19 | the function _generator. 20 | """ 21 | self._trainable = False 22 | self.gen_kwargs = gen_kwargs 23 | DataGenerator.rewind(self) 24 | self.n_products = 1 25 | DataGenerator.rewind(self) 26 | 27 | @staticmethod 28 | def _generator(): 29 | 30 | with open('/Users/tawehbeysolow/Downloads/amazon_order_book_data.csv', 'rU') as csvfile: 31 | reader = csv.reader(csvfile) 32 | for row in reader: 33 | row = [float(remove_non_ascii(_row))/ for _row in row] 34 | yield np.array(row, dtype=np.float) 35 | 36 | def _iterator_end(self): 37 | """Rewinds if end of data reached. 38 | """ 39 | print "End of data reached, rewinding." 40 | super(self.__class__, self).rewind() 41 | 42 | 43 | def next(self): 44 | """Return the next element in the generator. 45 | Args: 46 | numpy.array: next row of the generator 47 | """ 48 | try: 49 | return next(self.generator) 50 | except StopIteration as e: 51 | self._iterator_end() 52 | raise(e) 53 | 54 | def rewind(self): 55 | """Rewind the generator. 
56 | """ 57 | self.generator = self._generator() 58 | 59 | 60 | if __name__ == '__main__': 61 | 62 | 63 | generator = bid_ask_data(filename='amazon_order_book_data.csv', filepath='/Users/tawehbeysolow/Downloads/') 64 | prices_time_series = [next(generator.preprocess()) for _ in range(100)] 65 | import pdb; pdb.set_trace() 66 | -------------------------------------------------------------------------------- /algorithms/order_book_data.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/algorithms/order_book_data.pyc -------------------------------------------------------------------------------- /algorithms/policy_gradient_utilities.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Mar 16 06:54:29 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | import tensorflow as tf, numpy as np 10 | from baselines.a2c.utils import cat_entropy, mse 11 | 12 | class Model(object): 13 | """ 14 | We use this object to : 15 | __init__: 16 | - Creates the step_model 17 | - Creates the train_model 18 | train(): 19 | - Make the training part (feedforward and retropropagation of gradients) 20 | save/load(): 21 | - Save load the model 22 | """ 23 | def __init__(self, 24 | policy, 25 | ob_space, 26 | action_space, 27 | nenvs, 28 | nsteps, 29 | ent_coef, 30 | vf_coef, 31 | max_grad_norm): 32 | 33 | sess = tf.get_default_session() 34 | 35 | # Here we create the placeholders 36 | actions_ = tf.placeholder(tf.int32, [None], name="actions_") 37 | advantages_ = tf.placeholder(tf.float32, [None], name="advantages_") 38 | rewards_ = tf.placeholder(tf.float32, [None], name="rewards_") 39 | lr_ = tf.placeholder(tf.float32, name="learning_rate_") 40 | 41 | # Here we create our two models: 42 | # Step_model that is used for sampling 43 | step_model = policy(sess, ob_space, action_space, nenvs, 1, reuse=False) 44 | 45 | # Train model for training 46 | train_model = policy(sess, ob_space, action_space, nenvs*nsteps, nsteps, reuse=True) 47 | 48 | """ 49 | Calculate the loss 50 | Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss 51 | """ 52 | # Policy loss 53 | # Output -log(pi) 54 | neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=actions_) 55 | 56 | # 1/n * sum A(si,ai) * -logpi(ai|si) 57 | pg_loss = tf.reduce_mean(advantages_ * neglogpac) 58 | 59 | # Value loss 1/2 SUM [R - V(s)]^2 60 | vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf),rewards_)) 61 | 62 | # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. 63 | entropy = tf.reduce_mean(train_model.pd.entropy()) 64 | 65 | 66 | loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef 67 | 68 | # Update parameters using loss 69 | # 1. Get the model parameters 70 | params = find_trainable_variables("model") 71 | 72 | # 2. Calculate the gradients 73 | gradients = tf.gradients(loss, params) 74 | if max_grad_norm is not None: 75 | # Clip the gradients (normalize) 76 | gradients, grad_norm = tf.clip_by_global_norm(gradients, max_grad_norm) 77 | 78 | gradients = list(zip(gradients, params)) 79 | # zip aggregate each gradient with parameters associated 80 | # For instance zip(ABCD, xyza) => Ax, By, Cz, Da 81 | 82 | # 3. 
Build our trainer 83 | trainer = tf.train.RMSPropOptimizer(learning_rate=lr_, decay=0.99, epsilon=1e-5) 84 | 85 | # 4. Backpropagation 86 | _train = trainer.apply_gradients(gradients) 87 | 88 | def train(states_in, actions, returns, values, lr): 89 | advantages = returns - values 90 | 91 | # We create the feed dictionary 92 | td_map = {train_model.inputs_: states_in, 93 | actions_: actions, 94 | advantages_: advantages, # Use to calculate our policy loss 95 | rewards_: returns, # Use as a bootstrap for real value 96 | lr_: lr} 97 | 98 | policy_loss, value_loss, policy_entropy, _= sess.run([pg_loss, vf_loss, entropy, _train], td_map) 99 | 100 | return policy_loss, value_loss, policy_entropy 101 | 102 | 103 | def save(save_path): 104 | """ 105 | Save the model 106 | """ 107 | saver = tf.train.Saver() 108 | saver.save(sess, save_path) 109 | 110 | def load(load_path): 111 | """ 112 | Load the model 113 | """ 114 | saver = tf.train.Saver() 115 | print('Loading ' + load_path) 116 | saver.restore(sess, load_path) 117 | 118 | self.train = train 119 | self.train_model = train_model 120 | self.step_model = step_model 121 | self.step = step_model.step 122 | self.value = step_model.value 123 | self.initial_state = step_model.initial_state 124 | self.save = save 125 | self.load = load 126 | tf.global_variables_initializer().run(session=sess) 127 | 128 | class Runner(AbstractEnvRunner): 129 | """ 130 | We use this object to make a mini batch of experiences 131 | 132 | __init__: 133 | - Initialize the runner 134 | run(): 135 | 136 | - Make a mini batch 137 | """ 138 | def __init__(self, env, model, nsteps, total_timesteps, gamma, lam): 139 | super().__init__(env = env, model = model, nsteps = nsteps) 140 | 141 | # Discount rate 142 | self.gamma = gamma 143 | 144 | # Lambda used in GAE (General Advantage Estimation) 145 | self.lam = lam 146 | 147 | # Total timesteps taken 148 | self.total_timesteps = total_timesteps 149 | 150 | def run(self): 151 | # Here, we init the lists that will contain the mb of experiences 152 | mb_obs, mb_actions, mb_rewards, mb_values, mb_dones = [],[],[],[],[] 153 | 154 | # For n in range number of steps 155 | for n in range(self.nsteps): 156 | # Given observations, take action and value (V(s)) 157 | # We already have self.obs because AbstractEnvRunner run self.obs[:] = env.reset() 158 | actions, values = self.model.step(self.obs, self.dones) 159 | 160 | #print("actions runner runner", actions) 161 | 162 | # Append the observations into the mb 163 | mb_obs.append(np.copy(self.obs)) #obs len nenvs (1 step per env) 164 | 165 | # Append the actions taken into the mb 166 | mb_actions.append(actions) 167 | 168 | # Append the values calculated into the mb 169 | mb_values.append(values) 170 | 171 | # Append the dones situations into the mb 172 | mb_dones.append(self.dones) 173 | 174 | # Take actions in env and look the results 175 | self.obs[:], rewards, self.dones, _ = self.env.step(actions) 176 | 177 | mb_rewards.append(rewards) 178 | 179 | #batch of steps to batch of rollouts 180 | mb_obs = np.asarray(mb_obs, dtype=np.uint8) 181 | mb_rewards = np.asarray(mb_rewards, dtype=np.float32) 182 | mb_actions = np.asarray(mb_actions, dtype=np.int32) 183 | mb_values = np.asarray(mb_values, dtype=np.float32) 184 | mb_dones = np.asarray(mb_dones, dtype=np.bool) 185 | last_values = self.model.value(self.obs) 186 | 187 | 188 | ### GENERALIZED ADVANTAGE ESTIMATION 189 | # discount/bootstrap off value fn 190 | # We create mb_returns and mb_advantages 191 | # mb_returns will contain Advantage + 
value 192 | mb_returns = np.zeros_like(mb_rewards) 193 | mb_advantages = np.zeros_like(mb_rewards) 194 | 195 | lastgaelam = 0 196 | 197 | # From last step to first step 198 | for t in reversed(range(self.nsteps)): 199 | # If t == before last step 200 | if t == self.nsteps - 1: 201 | # If a state is done, nextnonterminal = 0 202 | # In fact nextnonterminal allows us to do that logic 203 | 204 | #if done (so nextnonterminal = 0): 205 | # delta = R - V(s) (because self.gamma * nextvalues * nextnonterminal = 0) 206 | # else (not done) 207 | #delta = R + gamma * V(st+1) 208 | nextnonterminal = 1.0 - self.dones 209 | 210 | # V(t+1) 211 | nextvalues = last_values 212 | else: 213 | nextnonterminal = 1.0 - mb_dones[t+1] 214 | 215 | nextvalues = mb_values[t+1] 216 | 217 | # Delta = R(st) + gamma * V(t+1) * nextnonterminal - V(st) 218 | delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t] 219 | 220 | # Advantage = delta + gamma * λ (lambda) * nextnonterminal * lastgaelam 221 | mb_advantages[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam 222 | 223 | # Returns 224 | mb_returns = mb_advantages + mb_values 225 | 226 | return map(swap_flatten_axes, (mb_obs, mb_actions, mb_returns, mb_values)) 227 | 228 | 229 | def swap_flatten_axes(arr): 230 | """ 231 | swap and then flatten axes 0 and 1 232 | """ 233 | s = arr.shape 234 | return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:]) 235 | -------------------------------------------------------------------------------- /algorithms/sarsa_algorithm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Mar 8 13:16:38 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | from collections import defaultdict 10 | import numpy as np 11 | 12 | class EligibilityTrace(object): 13 | """class containing logic for SARSA-lambda eligibility traces 14 | this is basically a wrapper for a dict that 15 | 1) clips its values to lie in the interval [0, 1] 16 | 2) updates all values by a decay constant and throws out those 17 | that fall below some threshold 18 | """ 19 | def __init__(self, decay, threshold): 20 | self.decay = decay 21 | self.threshold = threshold 22 | self.data = defaultdict(float) 23 | 24 | def __getitem__(self, key): 25 | return self.data[key] 26 | 27 | def __setitem__(self, key, val): 28 | self.data[key] = np.clip(val, 0, 1) 29 | 30 | def iteritems(self): 31 | return self.data.iteritems() 32 | 33 | def update(self): 34 | for key in self.data.keys(): 35 | if self.data[key] < self.threshold: 36 | del self.data[key] 37 | else: 38 | self.data[key] = self.data[key] * self.decay 39 | 40 | 41 | class SARSA(Agent): 42 | """impementation of SARSA lambda algorithm. 43 | class SARSA is equivilant to this with lambda = 0, but 44 | we seperate the two out because 45 | 1) it's nice to juxtapose the two algorithms side-by-side 46 | 2) SARSA lambda incurrs the overhead of maintaining 47 | eligibility traces 48 | note that the algorithm isn't explicitly parameterized with lambda. 49 | instead, we provide a decay rate and threshold. On each iteration, 50 | the decay is applied all rewards in the eligibility trace. 
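    For illustration, with the defaults used below (decay=0.98, threshold=0.1):
    a trace entry shrinks by a factor of 0.98 per update, so it falls below 0.1
    after roughly log(0.1)/log(0.98), about 114, updates; in effect the trace
    length is around 114 steps.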
Those 51 | past rewards who have decayed below the threshold are dropped 52 | """ 53 | def __init__(self, featureExtractor, max_gradient, epsilon=0.5, gamma=0.993, stepSize=None, threshold=0.1, decay=0.98): 54 | super(SARSA, self).__init__(featureExtractor, epsilon, gamma, stepSize, max_gradient) 55 | self.eligibility_trace = EligibilityTrace(decay, threshold) 56 | 57 | def update_q_matrix(self, state, action, reward, newState): 58 | """performs a SARSA update. Leverages the eligibility trace to update 59 | parameters towards sum of discounted rewards 60 | """ 61 | self.eligibility_trace.update() 62 | prediction = self.getQ(state, action) 63 | newAction = None 64 | target = reward 65 | for f, v in self.featureExtractor.get_features(state, action).iteritems(): 66 | self.eligibility_trace[f] += v 67 | 68 | if newState != None: 69 | newAction = self.takeAction(newState) 70 | target += self.discount * self.getQ(newState, newAction) 71 | 72 | update = self.getStepSize(self.numIters) * (prediction - target) 73 | # clip gradient - TODO EXPORT TO UTILS? 74 | update = max(-self.max_gradient, update) if update < 0 else min(self.max_gradient, update) 75 | 76 | for key, eligibility in self.eligibility_trace.iteritems(): 77 | self.weights[key] -= update * eligibility 78 | return newAction 79 | 80 | 81 | -------------------------------------------------------------------------------- /algorithms/trading.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Jun 18 15:41:31 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | import matplotlib as mpl 10 | import matplotlib.pyplot as plt 11 | import numpy as np 12 | from tgym.core import Env 13 | from tgym.utils import calc_spread 14 | 15 | plt.style.use('dark_background') 16 | mpl.rcParams.update( 17 | { 18 | "font.size": 15, 19 | "axes.labelsize": 15, 20 | "lines.linewidth": 1, 21 | "lines.markersize": 8 22 | } 23 | ) 24 | 25 | 26 | class SpreadTrading(Env): 27 | """Class for a discrete (buy/hold/sell) spread trading environment. 28 | """ 29 | 30 | _actions = { 31 | 'hold': np.array([1, 0, 0]), 32 | 'buy': np.array([0, 1, 0]), 33 | 'sell': np.array([0, 0, 1]) 34 | } 35 | 36 | _positions = { 37 | 'flat': np.array([1, 0, 0]), 38 | 'long': np.array([0, 1, 0]), 39 | 'short': np.array([0, 0, 1]) 40 | } 41 | 42 | def __init__(self, data_generator, spread_coefficients, episode_length=1000, trading_fee=0, time_fee=0, history_length=2): 43 | """Initialisation function 44 | 45 | Args: 46 | data_generator (tgym.core.DataGenerator): A data 47 | generator object yielding a 1D array of bid-ask prices. 48 | spread_coefficients (list): A list of signed integers defining 49 | how much of each product to buy (positive) or sell (negative) 50 | when buying or selling the spread. 51 | episode_length (int): number of steps to play the game for 52 | trading_fee (float): penalty for trading 53 | time_fee (float): time fee 54 | history_length (int): number of historical states to stack in the 55 | observation vector. 
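        Example (illustrative, mirroring the usage in
        chapter4/market_making_example.py; `generator` stands for any
        tgym data generator such as CSVStreamer):

            environment = SpreadTrading(spread_coefficients=[1],
                                        data_generator=generator,
                                        trading_fee=0.2,
                                        time_fee=0,
                                        history_length=2)
            observation = environment.reset()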
56 | """ 57 | 58 | assert data_generator.n_products == len(spread_coefficients) 59 | assert history_length > 0 60 | self._data_generator = data_generator 61 | self._spread_coefficients = spread_coefficients 62 | self._first_render = True 63 | self._trading_fee = trading_fee 64 | self._time_fee = time_fee 65 | self._episode_length = episode_length 66 | self.n_actions = 3 67 | self._prices_history = [] 68 | self._history_length = history_length 69 | self.reset() 70 | 71 | def reset(self): 72 | """Reset the trading environment. Reset rewards, data generator... 73 | 74 | Returns: 75 | observation (numpy.array): observation of the state 76 | """ 77 | self._iteration = 0 78 | self._data_generator.rewind() 79 | self._total_reward = 0 80 | self._total_pnl = 0 81 | self._position = self._positions['flat'] 82 | self._entry_price = 0 83 | self._exit_price = 0 84 | self._closed_plot = False 85 | 86 | for i in range(self._history_length): 87 | self._prices_history.append(self._data_generator.next()) 88 | 89 | observation = self._get_observation() 90 | self.state_shape = observation.shape 91 | self._action = self._actions['hold'] 92 | return observation 93 | 94 | def step(self, action): 95 | """Take an action (buy/sell/hold) and computes the immediate reward. 96 | 97 | Args: 98 | action (numpy.array): Action to be taken, one-hot encoded. 99 | 100 | Returns: 101 | tuple: 102 | - observation (numpy.array): Agent's observation of the current environment. 103 | - reward (float) : Amount of reward returned after previous action. 104 | - done (bool): Whether the episode has ended, in which case further step() calls will return undefined results. 105 | - info (dict): Contains auxiliary diagnostic information (helpful for debugging, and sometimes learning). 106 | 107 | """ 108 | 109 | assert any([(action == x).all() for x in self._actions.values()]) 110 | self._action = action 111 | self._iteration += 1 112 | done = False 113 | instant_pnl = 0 114 | info = {} 115 | reward = -self._time_fee 116 | if all(action == self._actions['buy']): 117 | reward -= self._trading_fee 118 | if all(self._position == self._positions['flat']): 119 | self._position = self._positions['long'] 120 | self._entry_price = calc_spread( 121 | self._prices_history[-1], self._spread_coefficients)[1] # Ask 122 | elif all(self._position == self._positions['short']): 123 | self._exit_price = calc_spread( 124 | self._prices_history[-1], self._spread_coefficients)[1] # Ask 125 | instant_pnl = self._entry_price - self._exit_price 126 | self._position = self._positions['flat'] 127 | self._entry_price = 0 128 | elif all(action == self._actions['sell']): 129 | reward -= self._trading_fee 130 | if all(self._position == self._positions['flat']): 131 | self._position = self._positions['short'] 132 | self._entry_price = calc_spread( 133 | self._prices_history[-1], self._spread_coefficients)[0] # Bid 134 | elif all(self._position == self._positions['long']): 135 | self._exit_price = calc_spread( 136 | self._prices_history[-1], self._spread_coefficients)[0] # Bid 137 | instant_pnl = self._exit_price - self._entry_price 138 | self._position = self._positions['flat'] 139 | self._entry_price = 0 140 | 141 | reward += instant_pnl 142 | self._total_pnl += instant_pnl 143 | self._total_reward += reward 144 | 145 | # Game over logic 146 | try: 147 | self._prices_history.append(self._data_generator.next()) 148 | except StopIteration: 149 | done = True 150 | info['status'] = 'No more data.' 
151 | if self._iteration >= self._episode_length: 152 | done = True 153 | info['status'] = 'Time out.' 154 | if self._closed_plot: 155 | info['status'] = 'Closed plot' 156 | 157 | observation = self._get_observation() 158 | return observation, reward, done, info 159 | 160 | def _handle_close(self, evt): 161 | self._closed_plot = True 162 | 163 | def render(self, savefig=False, filename='myfig'): 164 | """Matlplotlib rendering of each step. 165 | 166 | Args: 167 | savefig (bool): Whether to save the figure as an image or not. 168 | filename (str): Name of the image file. 169 | """ 170 | if self._first_render: 171 | self._f, self._ax = plt.subplots( 172 | len(self._spread_coefficients) + int(len(self._spread_coefficients) > 1), 173 | sharex=True 174 | ) 175 | if len(self._spread_coefficients) == 1: 176 | self._ax = [self._ax] 177 | self._f.set_size_inches(12, 6) 178 | self._first_render = False 179 | self._f.canvas.mpl_connect('close_event', self._handle_close) 180 | if len(self._spread_coefficients) > 1: 181 | # TODO: To be checked 182 | for prod_i in range(len(self._spread_coefficients)): 183 | bid = self._prices_history[-1][2 * prod_i] 184 | ask = self._prices_history[-1][2 * prod_i + 1] 185 | self._ax[prod_i].plot([self._iteration, self._iteration + 1], 186 | [bid, bid], color='white') 187 | self._ax[prod_i].plot([self._iteration, self._iteration + 1], 188 | [ask, ask], color='white') 189 | self._ax[prod_i].set_title('Product {} (spread coef {})'.format( 190 | prod_i, str(self._spread_coefficients[prod_i]))) 191 | 192 | # Spread price 193 | prices = self._prices_history[-1] 194 | bid, ask = calc_spread(prices, self._spread_coefficients) 195 | self._ax[-1].plot([self._iteration, self._iteration + 1], 196 | [bid, bid], color='white') 197 | self._ax[-1].plot([self._iteration, self._iteration + 1], 198 | [ask, ask], color='white') 199 | ymin, ymax = self._ax[-1].get_ylim() 200 | yrange = ymax - ymin 201 | if (self._action == self._actions['sell']).all(): 202 | self._ax[-1].scatter(self._iteration + 0.5, bid + 0.03 * 203 | yrange, color='orangered', marker='v') 204 | elif (self._action == self._actions['buy']).all(): 205 | self._ax[-1].scatter(self._iteration + 0.5, ask - 0.03 * 206 | yrange, color='lawngreen', marker='^') 207 | plt.suptitle('Cumulated Reward: ' + "%.2f" % self._total_reward + ' ~ ' + 208 | 'Cumulated PnL: ' + "%.2f" % self._total_pnl + ' ~ ' + 209 | 'Position: ' + ['flat', 'long', 'short'][list(self._position).index(1)] + ' ~ ' + 210 | 'Entry Price: ' + "%.2f" % self._entry_price) 211 | self._f.tight_layout() 212 | plt.xticks(range(self._iteration)[::5]) 213 | plt.xlim([max(0, self._iteration - 80.5), self._iteration + 0.5]) 214 | plt.subplots_adjust(top=0.85) 215 | plt.pause(0.01) 216 | if savefig: 217 | plt.savefig(filename) 218 | 219 | def _get_observation(self): 220 | """Concatenate all necessary elements to create the observation. 221 | 222 | Returns: 223 | numpy.array: observation array. 224 | """ 225 | return np.concatenate( 226 | [prices for prices in self._prices_history[-self._history_length:]] + 227 | [ 228 | np.array([self._entry_price]), 229 | np.array(self._position) 230 | ] 231 | ) 232 | 233 | @staticmethod 234 | def random_action_fun(): 235 | """The default random action for exploration. 236 | We hold 80% of the time and buy or sell 10% of the time each. 237 | 238 | Returns: 239 | numpy.array: array with a 1 on the action index, 0 elsewhere. 
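        Example (illustrative draw): np.random.multinomial(1, [0.8, 0.1, 0.1])
        may return array([1, 0, 0]), which corresponds to the 'hold' action
        defined in `_actions` above.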
240 | """ 241 | return np.random.multinomial(1, [0.8, 0.1, 0.1]) 242 | -------------------------------------------------------------------------------- /algorithms/trading.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/algorithms/trading.pyc -------------------------------------------------------------------------------- /chapter1/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter1/__init__.py -------------------------------------------------------------------------------- /chapter1/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter1/__init__.pyc -------------------------------------------------------------------------------- /chapter1/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter1/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /chapter1/__pycache__/open_ai_gym_example.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter1/__pycache__/open_ai_gym_example.cpython-36.pyc -------------------------------------------------------------------------------- /chapter1/open_ai_gym_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Jan 28 23:18:17 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | import gym 10 | 11 | def cartpole(): 12 | environment = gym.make('CartPole-v1') 13 | environment.reset() 14 | for _ in range(1000): 15 | environment.render() 16 | action = environment.action_space.sample() 17 | observation, reward, done, info = environment.step(action) 18 | print("Step {}:".format(_)) 19 | print("action: {}".format(action)) 20 | print("observation: {}".format(observation)) 21 | print("reward: {}".format(reward)) 22 | print("done: {}".format(done)) 23 | print("info: {}".format(info)) 24 | 25 | if __name__ == '__main__': 26 | 27 | cartpole() -------------------------------------------------------------------------------- /chapter2/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter2/.DS_Store -------------------------------------------------------------------------------- /chapter2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter2/__init__.py -------------------------------------------------------------------------------- /chapter2/__init__.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter2/__init__.pyc -------------------------------------------------------------------------------- /chapter2/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter2/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /chapter2/__pycache__/super_mario_example.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter2/__pycache__/super_mario_example.cpython-36.pyc -------------------------------------------------------------------------------- /chapter2/cart_pole_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Feb 20 13:50:58 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | import gym, numpy as np, matplotlib.pyplot as plt 10 | from neural_networks.policy_gradient_utilities import PolicyGradient 11 | 12 | #Parameters 13 | n_units = 5 14 | gamma = .99 15 | batch_size = 50 16 | learning_rate = 1e-3 17 | n_episodes = 10000 18 | render = False 19 | goal = 190 20 | n_layers = 2 21 | n_classes = 2 22 | environment = gym.make('CartPole-v1') 23 | environment_dimension = len(environment.reset()) 24 | 25 | def calculate_discounted_reward(reward, gamma=gamma): 26 | output = [reward[i] * gamma**i for i in range(0, len(reward))] 27 | return output[::-1] 28 | 29 | def score_model(model, n_tests, render=render): 30 | scores = [] 31 | for _ in range(n_tests): 32 | environment.reset() 33 | observation = environment.reset() 34 | reward_sum = 0 35 | while True: 36 | if render: 37 | environment.render() 38 | 39 | state = np.reshape(observation, [1, environment_dimension]) 40 | predict = model.predict([state])[0] 41 | action = np.argmax(predict) 42 | observation, reward, done, _ = environment.step(action) 43 | reward_sum += reward 44 | if done: 45 | break 46 | scores.append(reward_sum) 47 | 48 | environment.close() 49 | return np.mean(scores) 50 | 51 | def cart_pole_game(environment, policy_model, model_predictions): 52 | loss = [] 53 | n_episode, reward_sum, score, episode_done = 0, 0, 0, False 54 | n_actions = environment.action_space.n 55 | observation = environment.reset() 56 | 57 | states = np.empty(0).reshape(0, environment_dimension) 58 | actions = np.empty(0).reshape(0, 1) 59 | rewards = np.empty(0).reshape(0, 1) 60 | discounted_rewards = np.empty(0).reshape(0, 1) 61 | 62 | while n_episode < n_episodes: 63 | 64 | state = np.reshape(observation, [1, environment_dimension]) 65 | prediction = model_predictions.predict([state])[0] 66 | action = np.random.choice(range(environment.action_space.n), p=prediction) 67 | states = np.vstack([states, state]) 68 | actions = np.vstack([actions, action]) 69 | 70 | observation, reward, episode_done, info = environment.step(action) 71 | reward_sum += reward 72 | rewards = np.vstack([rewards, reward]) 73 | 74 | if episode_done == True: 75 | 76 | discounted_reward = calculate_discounted_reward(rewards) 77 | 
discounted_rewards = np.vstack([discounted_rewards, discounted_reward]) 78 | rewards = np.empty(0).reshape(0, 1) 79 | 80 | if (n_episode + 1) % batch_size == 0: 81 | 82 | discounted_rewards -= discounted_rewards.mean() 83 | discounted_rewards /= discounted_rewards.std() 84 | discounted_rewards = discounted_rewards.squeeze() 85 | actions = actions.squeeze().astype(int) 86 | 87 | train_actions = np.zeros([len(actions), n_actions]) 88 | train_actions[np.arange(len(actions)), actions] = 1 89 | 90 | error = policy_model.train_on_batch([states, discounted_rewards], train_actions) 91 | loss.append(error) 92 | 93 | states = np.empty(0).reshape(0, environment_dimension) 94 | actions = np.empty(0).reshape(0, 1) 95 | discounted_rewards = np.empty(0).reshape(0, 1) 96 | 97 | score = score_model(model=model_predictions, n_tests=10) 98 | 99 | print('''\nEpisode: %s \nAverage Reward: %s \nScore: %s \nError: %s''' 100 | )%(n_episode+1, reward_sum/float(batch_size), score, np.mean(loss[-batch_size:])) 101 | 102 | if score >= goal: 103 | break 104 | 105 | reward_sum = 0 106 | 107 | n_episode += 1 108 | observation = environment.reset() 109 | 110 | plt.title('Policy Gradient Error plot over %s Episodes'%(n_episode+1)) 111 | plt.xlabel('N batches') 112 | plt.ylabel('Error Rate') 113 | plt.plot(loss) 114 | plt.show() 115 | 116 | if __name__ == '__main__': 117 | 118 | 119 | mlp_model = PolicyGradient(n_units=n_units, 120 | n_layers=n_layers, 121 | n_columns=environment_dimension, 122 | n_outputs=n_classes, 123 | learning_rate=learning_rate, 124 | hidden_activation='selu', 125 | output_activation='softmax', 126 | loss_function='log_likelihood') 127 | 128 | policy_model, model_predictions = mlp_model.create_policy_model(input_shape=(environment_dimension, )) 129 | 130 | policy_model.summary() 131 | 132 | cart_pole_game(environment=environment, 133 | policy_model=policy_model, 134 | model_predictions=model_predictions) 135 | -------------------------------------------------------------------------------- /chapter2/cart_pole_example.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter2/cart_pole_example.pyc -------------------------------------------------------------------------------- /chapter2/super_mario_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Mar 10 21:00:57 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | import numpy as np 10 | from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv 11 | import gym_super_mario_bros 12 | from gym_super_mario_bros.actions import SIMPLE_MOVEMENT 13 | from algorithms.actor_critic_utilities import train_model 14 | from neural_networks.models import ActorCriticModel 15 | 16 | #Parameters 17 | environment = gym_super_mario_bros.make('SuperMarioBros-v0') 18 | environment = BinarySpaceToDiscreteSpaceEnv(environment, SIMPLE_MOVEMENT) 19 | observation = environment.reset() 20 | learning_rate = 1e-4 21 | gamma = 0.96 22 | epsilon = 0.9 23 | n_episodes = 10000 24 | n_steps = 2048 25 | max_steps = int(1e7) 26 | _lambda = 0.95 27 | value_coefficient = 0.5 28 | entropy_coefficient = 0.01 29 | max_grad_norm = 0.5 30 | log_interval = 10 31 | 32 | def play_super_mario(model, environment=environment): 33 | 34 | observations = environment.reset() 35 | score, n_step, done = 0, 0, False 36 | scores = 
[] 37 | 38 | for _ in range(100): 39 | 40 | while done: 41 | 42 | actions, values = model.step(observations) 43 | observations, rewards, done, info = environment.step(actions) 44 | score += rewards 45 | environment.render() 46 | n_step += 1 47 | scores.append(score) 48 | 49 | print('Step: %s \nScore: %s '%(n_step, score)) 50 | environment.reset() 51 | 52 | print(np.mean(scores)) 53 | 54 | if __name__ == '__main__': 55 | 56 | model = train_model(policy_model=ActorCriticModel, 57 | environment=environment, 58 | n_steps=n_steps, 59 | max_steps=max_steps, 60 | gamma=gamma, 61 | _lambda=_lambda, 62 | value_coefficient=value_coefficient, 63 | entropy_coefficient=entropy_coefficient, 64 | learning_rate=learning_rate, 65 | max_grad_norm=max_grad_norm, 66 | log_interval=log_interval) 67 | 68 | play_super_mario(model=model, 69 | environment=environment) -------------------------------------------------------------------------------- /chapter3/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter3/__init__.py -------------------------------------------------------------------------------- /chapter3/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter3/__init__.pyc -------------------------------------------------------------------------------- /chapter3/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter3/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /chapter3/__pycache__/doom_example.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter3/__pycache__/doom_example.cpython-36.pyc -------------------------------------------------------------------------------- /chapter3/__pycache__/frozen_lake_example.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter3/__pycache__/frozen_lake_example.cpython-36.pyc -------------------------------------------------------------------------------- /chapter3/basic.cfg: -------------------------------------------------------------------------------- 1 | doom_scenario_path = basic.wad 2 | doom_map = map01 3 | 4 | # Rewards 5 | living_reward = -1 6 | 7 | # Rendering options 8 | screen_resolution = RES_160X120 9 | screen_format = GRAY8 10 | render_hud = True 11 | render_crosshair = false 12 | render_weapon = true 13 | render_decals = true 14 | render_particles = true 15 | window_visible = true 16 | 17 | # make episodes start after 20 tics (after unholstering the gun) 18 | episode_start_time = 14 19 | 20 | # make episodes finish after 300 actions (tics) 21 | episode_timeout = 300 22 | 23 | # Available buttons 24 | available_buttons = 25 | { 26 | MOVE_LEFT 27 | MOVE_RIGHT 28 | ATTACK 29 | } 30 | 31 | # Game variables that will be in the 
state 32 | available_game_variables = { AMMO2} 33 | 34 | mode = PLAYER 35 | doom_skill = 5 36 | -------------------------------------------------------------------------------- /chapter3/basic.wad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter3/basic.wad -------------------------------------------------------------------------------- /chapter3/doom_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Mar 18 10:50:31 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | import warnings, random, time, tensorflow as tf, numpy as np, matplotlib.pyplot as plt 10 | from neural_networks.models import DeepQNetwork 11 | from algorithms.dql_utilities import create_environment, stack_frames, Memory 12 | from chapter3.frozen_lake_example import exploit_explore 13 | from collections import deque 14 | 15 | #Parameters 16 | stack_size = 4 17 | gamma = 0.95 18 | memory_size = int(1e7) 19 | train = True 20 | episode_render = False 21 | n_units = 500 22 | n_classes = 3 23 | learning_rate = 2e-4 24 | stride = 4 25 | kernel = 8 26 | n_filters = 3 27 | n_episodes = 1 28 | max_steps = 100 29 | batch_size = 64 30 | environment, possible_actions = create_environment() 31 | state_size = [84, 84, 4] 32 | action_size = 3 #environment.get_avaiable_buttons_size() 33 | explore_start = 1.0 34 | explore_stop = 0.01 35 | decay_rate = 1e-4 36 | pretrain_length = batch_size 37 | warnings.filterwarnings('ignore') 38 | #writer = tf.summary.FileWriter("/tensorboard/dqn/1") 39 | write_op = tf.summary.merge_all() 40 | 41 | def exploit_explore(session, model, explore_start, explore_stop, decay_rate, decay_step, state, actions): 42 | exp_exp_tradeoff = np.random.rand() 43 | explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step) 44 | 45 | if (explore_probability > exp_exp_tradeoff): 46 | action = random.choice(possible_actions) 47 | 48 | else: 49 | Qs = session.run(model.output, feed_dict = {model.input_matrix: state.reshape((1, *state.shape))}) 50 | choice = np.argmax(Qs) 51 | action = possible_actions[int(choice)] 52 | 53 | return action, explore_probability 54 | 55 | def train_model(model, environment): 56 | tf.summary.scalar('Loss', model.error_rate) 57 | saver = tf.train.Saver() 58 | stacked_frames = deque([np.zeros((84,84), dtype=np.int) for i in range(stack_size)], maxlen=4) 59 | memory = Memory(max_size=memory_size) 60 | scores = [] 61 | 62 | with tf.Session() as sess: 63 | sess.run(tf.global_variables_initializer()) 64 | decay_step = 0 65 | environment.init() 66 | 67 | for episode in range(n_episodes): 68 | step, reward_sum = 0, [] 69 | environment.new_episode() 70 | state = environment.get_state().screen_buffer 71 | state, stacked_frames = stack_frames(stacked_frames, state, True) 72 | 73 | while step < max_steps: 74 | step += 1; decay_step += 1 75 | 76 | action, explore_probability = exploit_explore(session=sess, 77 | model=model, 78 | explore_start=explore_start, 79 | explore_stop=explore_stop, 80 | decay_rate=decay_rate, 81 | decay_step=decay_step, 82 | state=state, 83 | actions=possible_actions) 84 | 85 | reward = environment.make_action(action) 86 | done = environment.is_episode_finished() 87 | reward_sum.append(reward) 88 | 89 | if done: 90 | 91 | next_state = np.zeros((84,84), 
dtype=np.int) 92 | 93 | next_state, stacked_frames = stack_frames(stacked_frames=stacked_frames, 94 | state=next_state, 95 | new_episode=False) 96 | step = max_steps 97 | 98 | total_reward = np.sum(reward_sum) 99 | 100 | scores.append(total_reward) 101 | 102 | 103 | print('Episode: {}'.format(episode), 104 | 'Total reward: {}'.format(total_reward), 105 | 'Explore P: {:.4f}'.format(explore_probability)) 106 | 107 | memory.add((state, action, reward, next_state, done)) 108 | 109 | else: 110 | next_state = environment.get_state().screen_buffer 111 | next_state, stacked_frames = stack_frames(stacked_frames, next_state, False) 112 | memory.add((state, action, reward, next_state, done)) 113 | state = next_state 114 | 115 | 116 | batch = memory.sample(batch_size) 117 | states = np.array([each[0] for each in batch], ndmin=3) 118 | actions = np.array([each[1] for each in batch]) 119 | rewards = np.array([each[2] for each in batch]) 120 | next_states = np.array([each[3] for each in batch], ndmin=3) 121 | dones = np.array([each[4] for each in batch]) 122 | 123 | target_Qs_batch = [] 124 | 125 | Qs_next_state = sess.run(model.predicted_Q, feed_dict={model.input_matrix: next_states, model.actions: actions}) 126 | 127 | for i in range(0, len(batch)): 128 | terminal = dones[i] 129 | 130 | if terminal: 131 | target_Qs_batch.append(rewards[i]) 132 | 133 | else: 134 | target = rewards[i] + gamma * np.max(Qs_next_state[i]) 135 | target_Qs_batch.append(target) 136 | 137 | 138 | targets = np.array([each for each in target_Qs_batch]) 139 | 140 | error_rate, _ = sess.run([model.error_rate, model.optimizer], 141 | feed_dict={model.input_matrix: states, 142 | model.target_Q: targets, 143 | model.actions: actions}) 144 | ''' 145 | # Write TF Summaries 146 | summary = sess.run(write_op, feed_dict={model.inputs_: states, 147 | model.target_Q: targets, 148 | model.actions_: actions}) 149 | 150 | writer.add_summary(summary, episode) 151 | writer.flush() 152 | 153 | 154 | if episode % 5 == 0: 155 | #saver.save(sess, filepath+'/models/model.ckpt') 156 | #print("Model Saved") 157 | ''' 158 | 159 | plt.plot(scores) 160 | plt.title('DQN Performance During Training') 161 | plt.xlabel('N Episodes') 162 | plt.ylabel('Score Value') 163 | plt.show() 164 | plt.waitforbuttonpress() 165 | plt.close() 166 | return model 167 | 168 | 169 | def play_doom(model, environment): 170 | 171 | stacked_frames = deque([np.zeros((84,84), dtype=np.int) for i in range(stack_size)], maxlen=4) 172 | scores = [] 173 | 174 | with tf.Session() as sess: 175 | 176 | sess.run(tf.global_variables_initializer()) 177 | totalScore = 0 178 | 179 | for _ in range(100): 180 | 181 | done = False 182 | environment.new_episode() 183 | 184 | state = environment.get_state().screen_buffer 185 | state, stacked_frames = stack_frames(stacked_frames, state, True) 186 | 187 | while not environment.is_episode_finished(): 188 | 189 | Q_matrix = sess.run(model.output, feed_dict = {model.input_matrix: state.reshape((1, *state.shape))}) 190 | choice = np.argmax(Q_matrix) 191 | action = possible_actions[int(choice)] 192 | 193 | environment.make_action(action) 194 | done = environment.is_episode_finished() 195 | score = environment.get_total_reward() 196 | scores.append(score) 197 | time.sleep(0.01) 198 | 199 | if done: 200 | break 201 | 202 | score = environment.get_total_reward() 203 | print("Score: ", score) 204 | 205 | environment.close() 206 | 207 | plt.plot(scores) 208 | plt.title('DQN Performance After Training') 209 | plt.xlabel('N Episodes') 210 | plt.ylabel('Score 
Value') 211 | plt.show() 212 | plt.waitforbuttonpress() 213 | plt.close() 214 | 215 | if __name__ == '__main__': 216 | 217 | 218 | model = DeepQNetwork(n_units=n_units, 219 | n_classes=n_classes, 220 | n_filters=n_filters, 221 | stride=stride, 222 | kernel=kernel, 223 | state_size=state_size, 224 | action_size=action_size, 225 | learning_rate=learning_rate) 226 | 227 | trained_model = train_model(model=model, 228 | environment=environment) 229 | 230 | play_doom(model=trained_model, 231 | environment=environment) -------------------------------------------------------------------------------- /chapter3/frozen_lake_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Mar 13 00:58:25 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | import os, time, gym, numpy as np 10 | 11 | #Parameters 12 | learning_rate = 1e-2 13 | gamma = 0.96 14 | epsilon = 0.9 15 | n_episodes = 10000 16 | max_steps = 100 17 | environment = gym.make('FrozenLake-v0') 18 | Q_matrix = np.zeros((environment.observation_space.n, environment.action_space.n)) 19 | 20 | def choose_action(state): 21 | ''' 22 | To be used after Q table has been updated, returns an action 23 | 24 | Parameters: 25 | 26 | state - int - the current state of the agent 27 | 28 | :return: int 29 | ''' 30 | return np.argmax(Q_matrix[state, :]) 31 | 32 | def exploit_explore(prior_state, epsilon=epsilon, Q_matrix=Q_matrix): 33 | ''' 34 | One half of the exploit-explore paradigm that we will utilize 35 | 36 | Parameters 37 | 38 | prior_state - int - the prior state of the environment at a given iteration 39 | epsilon - float - parameter that we use to determine whether we will try a new or current best action 40 | 41 | :return: int 42 | ''' 43 | 44 | if np.random.uniform(0, 1) < epsilon: 45 | return environment.action_space.sample() 46 | else: 47 | return np.argmax(Q_matrix[prior_state, :]) 48 | 49 | 50 | def update_q_matrix(prior_state, observation , reward, action): 51 | ''' 52 | Algorithm that updates the values in the Q table to reflect knowledge acquired by the agent 53 | 54 | Parameters 55 | 56 | prior_state - int - the prior state of the environment before the current timestemp 57 | observation - int - the current state of the environment 58 | reward - int - the reward yielded from the environment after an action 59 | action - int - the action suggested by the epsilon greedy algorithm 60 | 61 | :return: None 62 | ''' 63 | 64 | prediction = Q_matrix[prior_state, action] 65 | actual_label = reward + gamma * np.max(Q_matrix[observation, :]) 66 | Q_matrix[prior_state, action] = Q_matrix[prior_state, action] + learning_rate*(actual_label - prediction) 67 | 68 | 69 | def populate_q_matrix(render=False, n_episodes=n_episodes): 70 | ''' 71 | Directly implementing Q Learning (Greedy Epsilon) on the Frozen Lake Game 72 | This function populations the empty Q matrix 73 | Parameters 74 | 75 | prior_state - int - the prior state of the environment before the current timestemp 76 | observation - int - the current state of the environment 77 | reward - int - the reward yielded from the environment after an action 78 | action - int - the action suggested by the epsilon greedy algorithm 79 | 80 | :return: None 81 | ''' 82 | 83 | for episode in range(n_episodes): 84 | prior_state = environment.reset() 85 | _ = 0 86 | 87 | while _ < max_steps: 88 | 89 | if render == True: environment.render() 90 | action = exploit_explore(prior_state) 91 | 
observation, reward, done, info = environment.step(action) 92 | 93 | update_q_matrix(prior_state=prior_state, 94 | observation=observation, 95 | reward=reward, 96 | action=action) 97 | 98 | prior_state = observation 99 | _ += 1 100 | 101 | if done: 102 | break 103 | 104 | 105 | def play_frozen_lake(n_episodes): 106 | 107 | ''' 108 | Directly implementing Q Learning (Greedy Epsilon) on the Frozen Lake Game 109 | This function uses the already populated Q Matrix and displays the game being used 110 | 111 | Parameters 112 | 113 | prior_state - int - the prior state of the environment before the current timestemp 114 | observation - int - the current state of the environment 115 | reward - int - the reward yielded from the environment after an action 116 | action - int - the action suggested by the epsilon greedy algorithm 117 | 118 | :return: None 119 | ''' 120 | 121 | for episode in range(n_episodes): 122 | print('Episode: %s'%episode+1) 123 | prior_state = environment.reset() 124 | done = False 125 | 126 | while not done: 127 | environment.render() 128 | action = choose_action(prior_state) 129 | observation, reward, done, info = environment.step(action) 130 | prior_state = observation 131 | if reward == 0: 132 | time.sleep(0.5) 133 | else: 134 | print('You have won on episode %s!'%(episode+1)) 135 | time.sleep(5) 136 | os.system('clear') 137 | 138 | if done and reward == -1: 139 | print('You have lost this episode... :-/') 140 | time.sleep(5) 141 | os.system('clear') 142 | break 143 | 144 | 145 | 146 | if __name__ == '__main__': 147 | 148 | 149 | populate_q_matrix(render=False) 150 | play_frozen_lake(n_episodes=10) 151 | -------------------------------------------------------------------------------- /chapter4/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter4/.DS_Store -------------------------------------------------------------------------------- /chapter4/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter4/__init__.py -------------------------------------------------------------------------------- /chapter4/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter4/__init__.pyc -------------------------------------------------------------------------------- /chapter4/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter4/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /chapter4/__pycache__/market_making_example.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter4/__pycache__/market_making_example.cpython-36.pyc -------------------------------------------------------------------------------- /chapter4/market_making_example.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Mar 25 15:00:05 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | import random, tensorflow as tf, numpy as np, matplotlib.pyplot as plt 10 | from tgym.envs import SpreadTrading 11 | from tgym.gens.deterministic import WavySignal 12 | from neural_networks.market_making_models import DeepQNetworkMM, Memory 13 | from chapter2.cart_pole_example import calculate_discounted_reward 14 | from neural_networks.policy_gradient_utilities import PolicyGradient 15 | from tgym.gens.csvstream import CSVStreamer 16 | 17 | #Parameters 18 | np.random.seed(2018) 19 | n_episodes = 1 20 | trading_fee = .2 21 | time_fee = 0 22 | history_length = 2 23 | memory_size = 2000 24 | gamma = 0.96 25 | epsilon_min = 0.01 26 | batch_size = 64 27 | action_size = len(SpreadTrading._actions) 28 | learning_rate = 1e-2 29 | n_layers = 4 30 | n_units = 500 31 | n_classes = 3 32 | goal = 190 33 | max_steps = 1000 34 | explore_start = 1.0 35 | explore_stop = 0.01 36 | decay_rate = 1e-4 37 | _lambda = 0.95 38 | value_coefficient = 0.5 39 | entropy_coefficient = 0.01 40 | max_grad_norm = 0.5 41 | log_interval = 10 42 | hold = np.array([1, 0, 0]) 43 | buy = np.array([0, 1, 0]) 44 | sell = np.array([0, 0, 1]) 45 | possible_actions = [hold, buy, sell] 46 | 47 | #Classes and variables 48 | generator = CSVStreamer(filename='/Users/tawehbeysolow/Downloads/amazon_order_book_data2.csv') 49 | #generator = WavySignal(period_1=25, period_2=50, epsilon=-0.5) 50 | 51 | memory = Memory(max_size=memory_size) 52 | 53 | environment = SpreadTrading(spread_coefficients=[1], 54 | data_generator=generator, 55 | trading_fee=trading_fee, 56 | time_fee=time_fee, 57 | history_length=history_length) 58 | 59 | state_size = len(environment.reset()) 60 | 61 | 62 | def baseline_model(n_actions, info, random=False): 63 | 64 | if random == True: 65 | action = np.random.choice(range(n_actions), p=np.repeat(1/float(n_actions), 3)) 66 | action = possible_actions[action] 67 | 68 | else: 69 | 70 | if len(info) == 0: 71 | action = np.random.choice(range(n_actions), p=np.repeat(1/float(n_actions), 3)) 72 | action = possible_actions[action] 73 | 74 | elif info['action'] == 'sell': 75 | action = buy 76 | 77 | else: 78 | action = sell 79 | 80 | return action 81 | 82 | 83 | def score_model(model, n_tests): 84 | scores = [] 85 | for _ in range(n_tests): 86 | environment.reset() 87 | observation = environment.reset() 88 | reward_sum = 0 89 | while True: 90 | '' 91 | #environment.render() 92 | 93 | predict = model.predict([observation.reshape(1, 8)])[0] 94 | action = possible_actions[np.argmax(predict)] 95 | observation, reward, done, _ = environment.step(action) 96 | reward_sum += reward 97 | if done: 98 | break 99 | scores.append(reward_sum) 100 | 101 | return np.mean(scores) 102 | 103 | 104 | def exploit_explore(session, model, explore_start, explore_stop, decay_rate, decay_step, state, actions): 105 | exp_exp_tradeoff = np.random.rand() 106 | explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step) 107 | 108 | if (explore_probability > exp_exp_tradeoff): 109 | action = random.choice(possible_actions) 110 | 111 | else: 112 | Qs = session.run(model.output_layer, feed_dict = {model.input_matrix: state.reshape((1, 8))}) 113 | choice = np.argmax(Qs) 114 | action = possible_actions[int(choice)] 115 | 116 | return action, explore_probability 117 | 118 | 119 | def 
train_model(environment, dql=None, pg=None, baseline=None): 120 | scores = [] 121 | done = False 122 | error_rate, step = 0, 0 123 | info = {} 124 | n_episode, reward_sum, score, episode_done = 0, 0, 0, False 125 | n_actions = len(SpreadTrading._actions) 126 | observation = environment.reset() 127 | states = np.empty(0).reshape(0, state_size) 128 | actions = np.empty(0).reshape(0, len(SpreadTrading._actions)) 129 | rewards = np.empty(0).reshape(0, 1) 130 | discounted_rewards = np.empty(0).reshape(0, 1) 131 | observation = environment.reset() 132 | 133 | if baseline == True: 134 | 135 | 136 | for episode in range(n_episodes): 137 | 138 | for _ in range(100): 139 | action = baseline_model(n_actions=n_actions, 140 | info=info) 141 | 142 | state, reward, done, info = environment.step(action) 143 | reward_sum += reward 144 | 145 | next_state = np.zeros((state_size,), dtype=np.int) 146 | step = max_steps 147 | scores.append(reward_sum) 148 | memory.add((state, action, reward, next_state, done)) 149 | 150 | print('Episode: {}'.format(episode), 151 | 'Total reward: {}'.format(reward_sum)) 152 | 153 | reward_sum = 0 154 | 155 | environment.reset() 156 | 157 | print(np.mean(scores)) 158 | plt.hist(scores) 159 | plt.xlabel('Distribution of Scores') 160 | plt.ylabel('Relative Frequency') 161 | plt.show() 162 | plt.waitforbuttonpress() 163 | plt.close() 164 | 165 | 166 | elif dql == True: 167 | 168 | loss = [] 169 | 170 | model = DeepQNetworkMM(n_units=n_units, 171 | n_classes=n_classes, 172 | state_size=state_size, 173 | action_size=action_size, 174 | learning_rate=learning_rate) 175 | 176 | #tf.summary.scalar('Loss', model.error_rate) 177 | 178 | 179 | with tf.Session() as sess: 180 | 181 | sess.run(tf.global_variables_initializer()) 182 | decay_step = 0 183 | 184 | for episode in range(n_episodes): 185 | 186 | current_step, reward_sum = 0, [] 187 | state = np.reshape(observation, [1, state_size]) 188 | 189 | while current_step < max_steps: 190 | 191 | current_step += 1; decay_step += 1 192 | 193 | action, explore_probability = exploit_explore(session=sess, 194 | model=model, 195 | explore_start=explore_start, 196 | explore_stop=explore_stop, 197 | decay_rate=decay_rate, 198 | decay_step=decay_step, 199 | state=state, 200 | actions=possible_actions) 201 | 202 | state, reward, done, info = environment.step(action) 203 | reward_sum.append(reward) 204 | 205 | if current_step >= max_steps: 206 | done = True 207 | 208 | if done == True: 209 | 210 | next_state = np.zeros((state_size,), dtype=np.int) 211 | step = max_steps 212 | total_reward = np.sum(reward_sum) 213 | scores.append(total_reward) 214 | memory.add((state, action, reward, next_state, done)) 215 | 216 | print('Episode: {}'.format(episode), 217 | 'Total reward: {}'.format(total_reward), 218 | 'Loss: {}'.format(error_rate), 219 | 'Explore P: {:.4f}'.format(explore_probability)) 220 | 221 | loss.append(error_rate) 222 | 223 | elif done != True: 224 | 225 | next_state = environment.reset() 226 | state = next_state 227 | memory.add((state, action, reward, next_state, done)) 228 | 229 | batch = memory.sample(batch_size) 230 | states = np.array([each[0] for each in batch]) 231 | actions = np.array([each[1] for each in batch]) 232 | rewards = np.array([each[2] for each in batch]) 233 | next_states = np.array([each[3] for each in batch]) 234 | dones = np.array([each[4] for each in batch]) 235 | 236 | target_Qs_batch = [] 237 | 238 | Qs_next_state = sess.run(model.predicted_Q, feed_dict={model.input_matrix: next_states, model.actions: actions}) 239 | 240 
| for i in range(0, len(batch)): 241 | terminal = dones[i] 242 | 243 | if terminal: 244 | target_Qs_batch.append(rewards[i]) 245 | 246 | else: 247 | target = rewards[i] + gamma * np.max(Qs_next_state[i]) 248 | target_Qs_batch.append(target) 249 | 250 | 251 | targets = np.array([each for each in target_Qs_batch]) 252 | 253 | error_rate, _ = sess.run([model.error_rate, model.optimizer], 254 | feed_dict={model.input_matrix: states, 255 | model.target_Q: targets, 256 | model.actions: actions}) 257 | if episode == n_episodes - 1: 258 | 259 | market_making(model=model, 260 | environment=environment, 261 | sess=sess, 262 | state=state, 263 | dpl=True) 264 | 265 | elif pg == True: 266 | 267 | loss = [] 268 | 269 | mlp_model = PolicyGradient(n_units=n_units, 270 | n_layers=n_layers, 271 | n_columns=8, 272 | n_outputs=n_classes, 273 | learning_rate=learning_rate, 274 | hidden_activation='selu', 275 | output_activation='softmax', 276 | loss_function='categorical_crossentropy') 277 | 278 | policy_model, model_predictions = mlp_model.create_policy_model(input_shape=(len(observation), )) 279 | 280 | policy_model.summary() 281 | 282 | while n_episode < n_episodes: 283 | 284 | state = observation.reshape(1, 8) 285 | prediction = model_predictions.predict([state])[0] 286 | action = np.random.choice(range(len(SpreadTrading._actions)), p=prediction) 287 | action = possible_actions[action] 288 | states = np.vstack([states, state]) 289 | actions = np.vstack([actions, action]) 290 | 291 | observation, reward, episode_done, info = environment.step(action) 292 | reward_sum += reward 293 | rewards = np.vstack([rewards, reward]) 294 | step += 1 295 | 296 | if step == max_steps: 297 | episode_done = True 298 | 299 | if episode_done == True: 300 | 301 | discounted_reward = calculate_discounted_reward(rewards, gamma=gamma) 302 | discounted_rewards = np.vstack([discounted_rewards, discounted_reward]) 303 | 304 | discounted_rewards -= discounted_rewards.mean() 305 | discounted_rewards /= discounted_rewards.std() 306 | discounted_rewards = discounted_rewards.squeeze() 307 | actions = actions.squeeze().astype(int) 308 | 309 | #train_actions = np.zeros([len(actions), n_actions]) 310 | #train_actions[np.arange(len(actions)), actions] = 1 311 | 312 | error = policy_model.train_on_batch([states, discounted_rewards], actions) 313 | loss.append(error) 314 | 315 | states = np.empty(0).reshape(0, 8) 316 | actions = np.empty(0).reshape(0, 3) 317 | rewards = np.empty(0).reshape(0, 1) 318 | discounted_rewards = np.empty(0).reshape(0, 1) 319 | 320 | score = score_model(model=model_predictions, n_tests=10) 321 | 322 | print('''\nEpisode: %s \nAverage Reward: %s \nScore: %s \nError: %s''' 323 | )%(n_episode+1, reward_sum/float(batch_size), score, np.mean(loss[-batch_size:])) 324 | 325 | if score >= goal: 326 | break 327 | 328 | reward_sum = 0 329 | 330 | n_episode += 1 331 | observation = environment.reset() 332 | 333 | if n_episode == n_episodes - 1: 334 | 335 | market_making(model=model_predictions, 336 | environment=environment, 337 | sess=None, 338 | state=state, 339 | pg=True) 340 | 341 | if baseline != True: 342 | 343 | plt.title('Policy Gradient Error plot over %s Episodes'%(n_episode+1)) 344 | plt.xlabel('N batches') 345 | plt.ylabel('Error Rate') 346 | plt.plot(loss) 347 | plt.show() 348 | plt.waitforbuttonpress() 349 | return model 350 | 351 | def market_making(model, environment, sess, state, dpl=None, pg=None): 352 | 353 | scores = [] 354 | total_reward = 0 355 | environment.reset() 356 | 357 | for _ in range(1000): 358 | 
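        # The evaluation below acts greedily (argmax over the network's Q-values),
        # with no epsilon-greedy exploration. For reference, the targets fitted in
        # train_model above follow the Q-learning backup
        #     target = r + gamma * max_a' Q(s', a')
        # e.g. with r = 0.3, gamma = 0.96 and max_a' Q(s', a') = 1.5
        # (illustrative numbers) the target is 0.3 + 0.96 * 1.5 = 1.74.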
359 | for __ in range(100): 360 | 361 | state = np.reshape(state, [1, state_size]) 362 | 363 | if dpl == True: 364 | Q_matrix = sess.run(model.output_layer, feed_dict = {model.input_matrix: state.reshape((1, 8))}) 365 | choice = np.argmax(Q_matrix) 366 | action = possible_actions[int(choice)] 367 | 368 | elif pg == True: 369 | state = np.reshape(state, [1, 8]) 370 | predict = model.predict([state])[0] 371 | action = np.argmax(predict) 372 | action = possible_actions[int(action)] 373 | 374 | state, reward, done, info = environment.step(action) 375 | total_reward += reward 376 | 377 | 378 | print('Episode: {}'.format(_), 379 | 'Total reward: {}'.format(total_reward)) 380 | scores.append(total_reward) 381 | state = environment.reset() 382 | 383 | print(np.mean(scores)) 384 | plt.hist(scores) 385 | plt.xlabel('Distribution of Scores') 386 | plt.ylabel('Relative Frequency') 387 | plt.show() 388 | plt.waitforbuttonpress() 389 | plt.close() 390 | 391 | 392 | if __name__ == '__main__': 393 | 394 | 395 | train_model(environment=environment, dql=True) 396 | 397 | 398 | 399 | 400 | 401 | 402 | -------------------------------------------------------------------------------- /chapter5/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter5/.DS_Store -------------------------------------------------------------------------------- /chapter5/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/chapter5/__init__.py -------------------------------------------------------------------------------- /chapter5/create_environment.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri May 10 10:44:23 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | import cv2, gym, numpy as np 10 | from retro_contest.local import make 11 | from retro import make as make_retro 12 | from baselines.common.atari_wrappers import FrameStack 13 | 14 | cv2.ocl.setUseOpenCL(False) 15 | 16 | class PreprocessFrame(gym.ObservationWrapper): 17 | """ 18 | Grayscaling image from three dimensional RGB pixelated images 19 | - Set frame to gray 20 | - Resize the frame to 96x96x1 21 | """ 22 | def __init__(self, environment, width, height): 23 | gym.ObservationWrapper.__init__(self, environment) 24 | self.width = width 25 | self.height = height 26 | self.observation_space = gym.spaces.Box(low=0, 27 | high=255, 28 | shape=(self.height, self.width, 1), 29 | dtype=np.uint8) 30 | 31 | def observation(self, image): 32 | image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) 33 | image = cv2.resize(image, (self.width, self.height), interpolation=cv2.INTER_AREA) 34 | image = image[:, :, None] 35 | return image 36 | 37 | 38 | class ActionsDiscretizer(gym.ActionWrapper): 39 | """ 40 | Wrap a gym-retro environment and make it use discrete 41 | actions for the Sonic game. 
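    Example (illustrative): the combo ['LEFT', 'DOWN'] from the `actions` list
    below becomes a 12-element boolean array with True only at the indices of
    'DOWN' (5) and 'LEFT' (6) in `buttons`, so each discrete action maps to one
    fixed button mask.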
42 | """ 43 | def __init__(self, env): 44 | super(ActionsDiscretizer, self).__init__(env) 45 | buttons = ["B", "A", "MODE", "START", "UP", "DOWN", "LEFT", "RIGHT", "C", "Y", "X", "Z"] 46 | actions = [['LEFT'], ['RIGHT'], ['LEFT', 'DOWN'], ['RIGHT', 'DOWN'], ['DOWN'], 47 | ['DOWN', 'B'], ['B']] 48 | self._actions = [] 49 | 50 | """ 51 | What we do in this loop: 52 | For each action in actions 53 | - Create an array of 12 False (12 = nb of buttons) 54 | For each button in action: (for instance ['LEFT']) we need to make that left button index = True 55 | - Then the button index = LEFT = True 56 | In fact at the end we will have an array where each array is an action and each elements True of this array 57 | are the buttons clicked. 58 | """ 59 | for action in actions: 60 | _actions = np.array([False] * len(buttons)) 61 | for button in action: 62 | _actions[buttons.index(button)] = True 63 | self._actions.append(_actions) 64 | self.action_space = gym.spaces.Discrete(len(self._actions)) 65 | 66 | def action(self, a): 67 | return self._actions[a].copy() 68 | 69 | class RewardScaler(gym.RewardWrapper): 70 | """ 71 | Bring rewards to a reasonable scale for PPO. 72 | This is incredibly important and effects performance 73 | drastically. 74 | """ 75 | def reward(self, reward): 76 | 77 | return reward * 0.01 78 | 79 | class AllowBacktracking(gym.Wrapper): 80 | """ 81 | Use deltas in max(X) as the reward, rather than deltas 82 | in X. This way, agents are not discouraged too heavily 83 | from exploring backwards if there is no way to advance 84 | head-on in the level. 85 | """ 86 | def __init__(self, environment): 87 | super(AllowBacktracking, self).__init__(environment) 88 | self.curent_reward = 0 89 | self.max_reward = 0 90 | 91 | def reset(self, **kwargs): 92 | self.current_reward = 0 93 | self.max_reward = 0 94 | return self.env.reset(**kwargs) 95 | 96 | def step(self, action): 97 | observation, reward, done, info = self.environment.step(action) 98 | self.current_reward += reward 99 | reward = max(0, self.current_reward - self.max_reward) 100 | self.max_reward = max(self.max_reward, self.current_reward) 101 | return observation, reward, done, info 102 | 103 | def wrap_environment(environment, n_frames=4): 104 | environment = ActionsDiscretizer(environment) 105 | environment = RewardScaler(environment) 106 | environment = PreprocessFrame(environment) 107 | environment = FrameStack(environment, n_frames) 108 | environment = AllowBacktracking(environment) 109 | return environment 110 | 111 | def create_new_environment(environment_index, n_frames=4): 112 | """ 113 | Create an environment with some standard wrappers. 
114 | """ 115 | 116 | dictionary = [ 117 | {'game': 'SonicTheHedgehog-Genesis', 'state': 'SpringYardZone.Act3'}, 118 | {'game': 'SonicTheHedgehog-Genesis', 'state': 'SpringYardZone.Act2'}, 119 | {'game': 'SonicTheHedgehog-Genesis', 'state': 'GreenHillZone.Act3'}, 120 | {'game': 'SonicTheHedgehog-Genesis', 'state': 'GreenHillZone.Act1'}, 121 | {'game': 'SonicTheHedgehog-Genesis', 'state': 'StarLightZone.Act2'}, 122 | {'game': 'SonicTheHedgehog-Genesis', 'state': 'StarLightZone.Act1'}, 123 | {'game': 'SonicTheHedgehog-Genesis', 'state': 'MarbleZone.Act2'}, 124 | {'game': 'SonicTheHedgehog-Genesis', 'state': 'MarbleZone.Act1'}, 125 | {'game': 'SonicTheHedgehog-Genesis', 'state': 'MarbleZone.Act3'}, 126 | {'game': 'SonicTheHedgehog-Genesis', 'state': 'ScrapBrainZone.Act2'}, 127 | {'game': 'SonicTheHedgehog-Genesis', 'state': 'LabyrinthZone.Act2'}, 128 | {'game': 'SonicTheHedgehog-Genesis', 'state': 'LabyrinthZone.Act1'}, 129 | {'game': 'SonicTheHedgehog-Genesis', 'state': 'LabyrinthZone.Act3'}] 130 | 131 | print(dictionary[environment_index]['game']) 132 | print(dictionary[environment_index]['state']) 133 | 134 | environment = make(game=dictionary[environment_index]['game'], 135 | state=dictionary[environment_index]['state'], 136 | bk2dir="./records") 137 | 138 | environment = wrap_environment(environment=environment, 139 | n_frames=n_frames) 140 | 141 | return environment 142 | 143 | 144 | def make_test_level_Green(): 145 | return make_test() 146 | 147 | 148 | def make_test(n_frames=4): 149 | """ 150 | Create an environment with some standard wrappers. 151 | """ 152 | 153 | environment = make_retro(game='SonicTheHedgehog-Genesis', 154 | state='GreenHillZone.Act2', 155 | record="./records") 156 | 157 | environment = wrap_environment(environment=environment, 158 | n_frames=n_frames) 159 | 160 | return environment 161 | 162 | -------------------------------------------------------------------------------- /chapter5/sonic_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun May 12 06:18:09 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | from algorithms.actor_critic_utilities import Model 10 | from chapter5.create_environment import create_new_environment 11 | 12 | class Worker(): 13 | def __init__(self,game,name,s_size,a_size,trainer,model_path,global_episodes): 14 | self.name = "worker_" + str(name) 15 | self.number = name 16 | self.model_path = model_path 17 | self.trainer = trainer 18 | self.global_episodes = global_episodes 19 | self.increment = self.global_episodes.assign_add(1) 20 | self.episode_rewards = [] 21 | self.episode_lengths = [] 22 | self.episode_mean_values = [] 23 | self.summary_writer = tf.summary.FileWriter("train_"+str(self.number)) 24 | 25 | #Create the local copy of the network and the tensorflow op to copy global paramters to local network 26 | self.local_AC = AC_Network(s_size,a_size,self.name,trainer) 27 | self.update_local_ops = update_target_graph('global',self.name) 28 | 29 | def train(self,rollout,sess,gamma,bootstrap_value): 30 | rollout = np.array(rollout) 31 | observations = rollout[:,0] 32 | actions = rollout[:,1] 33 | rewards = rollout[:,2] 34 | next_observations = rollout[:,3] 35 | values = rollout[:,5] 36 | 37 | # Here we take the rewards and values from the rollout, and use them to 38 | # generate the advantage and discounted returns. 
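        # Note: the discount() helper called below is not defined in this file;
        # a minimal sketch of the discounted cumulative sum it is assumed to
        # compute (the name and exact implementation are assumptions):
        #
        #     def discount(x, gamma):
        #         out, running = [], 0.0
        #         for value in reversed(list(x)):
        #             running = value + gamma * running
        #             out.append(running)
        #         return np.array(out[::-1])
        #
        #     e.g. discount([1.0, 1.0, 1.0], 0.99) -> [2.9701, 1.99, 1.0]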
39 | # The advantage function uses "Generalized Advantage Estimation" 40 | self.rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value]) 41 | discounted_rewards = discount(self.rewards_plus,gamma)[:-1] 42 | self.value_plus = np.asarray(values.tolist() + [bootstrap_value]) 43 | advantages = rewards + gamma * self.value_plus[1:] - self.value_plus[:-1] 44 | advantages = discount(advantages,gamma) 45 | 46 | # Update the global network using gradients from loss 47 | # Generate network statistics to periodically save 48 | feed_dict = {self.local_AC.target_v:discounted_rewards, 49 | self.local_AC.inputs:np.vstack(observations), 50 | self.local_AC.actions:actions, 51 | self.local_AC.advantages:advantages, 52 | self.local_AC.state_in[0]:self.batch_rnn_state[0], 53 | self.local_AC.state_in[1]:self.batch_rnn_state[1]} 54 | 55 | v_l,p_l,e_l,g_n,v_n, self.batch_rnn_state,_ = sess.run([self.local_AC.value_loss, 56 | self.local_AC.policy_loss, 57 | self.local_AC.entropy, 58 | self.local_AC.grad_norms, 59 | self.local_AC.var_norms, 60 | self.local_AC.state_out, 61 | self.local_AC.apply_grads], 62 | feed_dict=feed_dict) 63 | 64 | return v_l / len(rollout),p_l / len(rollout),e_l / len(rollout), g_n,v_n 65 | 66 | def work(self,max_episode_length,gamma,sess,coord,saver): 67 | episode_count = sess.run(self.global_episodes) 68 | total_steps = 0 69 | print ("Starting worker " + str(self.number)) 70 | with sess.as_default(), sess.graph.as_default(): 71 | while not coord.should_stop(): 72 | sess.run(self.update_local_ops) 73 | episode_buffer = [] 74 | episode_values = [] 75 | episode_frames = [] 76 | episode_reward = 0 77 | episode_step_count = 0 78 | d = False 79 | 80 | self.env.new_episode() 81 | prior_state = self.env.get_state().screen_buffer 82 | episode_frames.append(prior_state) 83 | prior_state = process_frame(prior_state) 84 | rnn_state = self.local_AC.state_init 85 | self.batch_rnn_state = rnn_state 86 | while self.env.is_episode_finished() == False: 87 | #Take an action using probabilities from policy network output. 88 | action_dist, value_function, rnn_state = sess.run([self.local_AC.policy, self.local_AC.value,self.local_AC.state_out], 89 | feed_dict={self.local_AC.inputs:[prior_state], 90 | self.local_AC.state_in[0]:rnn_state[0], 91 | self.local_AC.state_in[1]:rnn_state[1]}) 92 | 93 | action = np.random.choice(action_dist[0], p=action_dist[0]) 94 | action = np.argmax(action_dist == action) 95 | 96 | reward = self.env.make_action(self.actions[action]) / 100.0 97 | done = self.env.is_episode_finished() 98 | if done == False: 99 | current_state = self.env.get_state().screen_buffer 100 | episode_frames.append(current_state) 101 | prior_state = process_frame(current_state) 102 | else: 103 | current_state = prior_state 104 | 105 | episode_buffer.append([prior_state, action, reward, current_state, done, value[0,0]]) 106 | episode_values.append(value[0,0]) 107 | 108 | episode_reward += r 109 | s = s1 110 | total_steps += 1 111 | episode_step_count += 1 112 | 113 | # If the episode hasn't ended, but the experience buffer is full, then we 114 | # make an update step using that experience rollout. 115 | if len(episode_buffer) == 30 and d != True and episode_step_count != max_episode_length - 1: 116 | # Since we don't know what the true final return is, we "bootstrap" from our current 117 | # value estimation. 
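                        # Concretely, the n-step return for each entry in the buffer is
                        # approximated as
                        #     R_t = r_t + gamma * r_{t+1} + ... + gamma^{n-1} * r_{t+n-1}
                        #           + gamma^n * V(s_{t+n}),
                        # where V(s_{t+n}) is the critic's estimate v1 computed below, so the
                        # unfinished tail of the episode is replaced by the value prediction
                        # rather than being treated as zero.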
118 | v1 = sess.run(self.local_AC.value, 119 | feed_dict={self.local_AC.inputs:[s], 120 | self.local_AC.state_in[0]:rnn_state[0], 121 | self.local_AC.state_in[1]:rnn_state[1]})[0,0] 122 | v_l,p_l,e_l,g_n,v_n = self.train(episode_buffer,sess,gamma,v1) 123 | episode_buffer = [] 124 | sess.run(self.update_local_ops) 125 | if d == True: 126 | break 127 | 128 | self.episode_rewards.append(episode_reward) 129 | self.episode_lengths.append(episode_step_count) 130 | self.episode_mean_values.append(np.mean(episode_values)) 131 | 132 | # Update the network using the episode buffer at the end of the episode. 133 | if len(episode_buffer) != 0: 134 | v_l,p_l,e_l,g_n,v_n = self.train(episode_buffer,sess,gamma,0.0) 135 | 136 | 137 | # Periodically save gifs of episodes, model parameters, and summary statistics. 138 | if episode_count % 5 == 0 and episode_count != 0: 139 | if self.name == 'worker_0' and episode_count % 25 == 0: 140 | time_per_step = 0.05 141 | images = np.array(episode_frames) 142 | make_gif(images,'./frames/image'+str(episode_count)+'.gif', 143 | duration=len(images)*time_per_step,true_image=True,salience=False) 144 | if episode_count % 250 == 0 and self.name == 'worker_0': 145 | saver.save(sess,self.model_path+'/model-'+str(episode_count)+'.cptk') 146 | print ("Saved Model") 147 | 148 | mean_reward = np.mean(self.episode_rewards[-5:]) 149 | mean_length = np.mean(self.episode_lengths[-5:]) 150 | mean_value = np.mean(self.episode_mean_values[-5:]) 151 | summary = tf.Summary() 152 | summary.value.add(tag='Perf/Reward', simple_value=float(mean_reward)) 153 | summary.value.add(tag='Perf/Length', simple_value=float(mean_length)) 154 | summary.value.add(tag='Perf/Value', simple_value=float(mean_value)) 155 | summary.value.add(tag='Losses/Value Loss', simple_value=float(v_l)) 156 | summary.value.add(tag='Losses/Policy Loss', simple_value=float(p_l)) 157 | summary.value.add(tag='Losses/Entropy', simple_value=float(e_l)) 158 | summary.value.add(tag='Losses/Grad Norm', simple_value=float(g_n)) 159 | summary.value.add(tag='Losses/Var Norm', simple_value=float(v_n)) 160 | self.summary_writer.add_summary(summary, episode_count) 161 | 162 | self.summary_writer.flush() 163 | if self.name == 'worker_0': 164 | sess.run(self.increment) 165 | episode_count += 1 166 | 167 | def play_sonic(policy, environment_index): 168 | 169 | 170 | environment = create_new_environment(environment_index=environment_index) 171 | observation = environment.observation_space 172 | actions = environment.action_space 173 | 174 | 175 | model = Model(policy=policy, 176 | ob_space=observation, 177 | action_space=actions, 178 | n_environments=1, 179 | n_steps=1, 180 | entropy_coefficient=0, 181 | value_coefficient=0, 182 | max_grad_norm=0) 183 | 184 | observation = environment.reset() 185 | score = 0 186 | boom = 0 187 | done = False 188 | 189 | with tf.device("/cpu:0"): 190 | master_network = AC_Network(s_size,a_size,'global',None) # Generate global network 191 | num_workers = multiprocessing.cpu_count() # Set workers ot number of available CPU threads 192 | workers = [] 193 | # Create worker classes 194 | for i in range(num_workers): 195 | 196 | workers.append(Worker(environment=environment, 197 | name=i, 198 | s_size=s_size, 199 | a_sizse=a_size, 200 | trainer=trainer, 201 | saver=saver, 202 | model_path)) 203 | 204 | with tf.Session() as sess: 205 | 206 | coord = tf.train.Coordinator() 207 | if load_model == True: 208 | print 'Loading Model...' 
209 | ckpt = tf.train.get_checkpoint_state(model_path) 210 | saver.restore(sess,ckpt.model_checkpoint_path) 211 | else: 212 | sess.run(tf.global_variables_initializer()) 213 | 214 | # This is where the asynchronous magic happens. 215 | # Start the "work" process for each worker in a separate threat. 216 | worker_threads = [] 217 | for worker in workers: 218 | worker_work = lambda: worker.work(max_episode_length=max_episode_length, 219 | gamma=gamma, 220 | master_network=master_network, 221 | sess=sess, 222 | coord=coord) 223 | 224 | 225 | t = threading.Thread(target=(worker_work)) 226 | t.start() 227 | worker_threads.append(t) 228 | coord.join(worker_threads) 229 | 230 | while done == False: 231 | 232 | actions, values = model.step(observation) 233 | observation, rewards, done, _ = environment.step(actions) 234 | score += rewards 235 | environment.render() 236 | boom +=1 237 | 238 | 239 | print("Score ", score) 240 | environment.close() 241 | -------------------------------------------------------------------------------- /errata.md: -------------------------------------------------------------------------------- 1 | # Errata for *Book Title* 2 | 3 | On **page xx** [Summary of error]: 4 | 5 | Details of error here. Highlight key pieces in **bold**. 6 | 7 | *** 8 | 9 | On **page xx** [Summary of error]: 10 | 11 | Details of error here. Highlight key pieces in **bold**. 12 | 13 | *** -------------------------------------------------------------------------------- /neural_networks/Figure_1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/neural_networks/Figure_1-1.png -------------------------------------------------------------------------------- /neural_networks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/neural_networks/__init__.py -------------------------------------------------------------------------------- /neural_networks/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/neural_networks/__init__.pyc -------------------------------------------------------------------------------- /neural_networks/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/neural_networks/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /neural_networks/__pycache__/models.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/neural_networks/__pycache__/models.cpython-36.pyc -------------------------------------------------------------------------------- /neural_networks/gym_utilities.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Apr 1 00:30:39 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 
| 9 | import tensorflow as tf 10 | import numpy as np 11 | import baselines.common.tf_util as U 12 | from baselines.a2c.utils import fc 13 | from tensorflow.python.ops import math_ops 14 | 15 | class Pd(object): 16 | """ 17 | A particular probability distribution 18 | """ 19 | def flatparam(self): 20 | raise NotImplementedError 21 | def mode(self): 22 | raise NotImplementedError 23 | def neglogp(self, x): 24 | # Usually it's easier to define the negative logprob 25 | raise NotImplementedError 26 | def kl(self, other): 27 | raise NotImplementedError 28 | def entropy(self): 29 | raise NotImplementedError 30 | def sample(self): 31 | raise NotImplementedError 32 | def logp(self, x): 33 | return - self.neglogp(x) 34 | def get_shape(self): 35 | return self.flatparam().shape 36 | @property 37 | def shape(self): 38 | return self.get_shape() 39 | def __getitem__(self, idx): 40 | return self.__class__(self.flatparam()[idx]) 41 | 42 | class PdType(object): 43 | """ 44 | Parametrized family of probability distributions 45 | """ 46 | def pdclass(self): 47 | raise NotImplementedError 48 | def pdfromflat(self, flat): 49 | return self.pdclass()(flat) 50 | def pdfromlatent(self, latent_vector, init_scale, init_bias): 51 | raise NotImplementedError 52 | def param_shape(self): 53 | raise NotImplementedError 54 | def sample_shape(self): 55 | raise NotImplementedError 56 | def sample_dtype(self): 57 | raise NotImplementedError 58 | 59 | def param_placeholder(self, prepend_shape, name=None): 60 | return tf.placeholder(dtype=tf.float32, shape=prepend_shape+self.param_shape(), name=name) 61 | def sample_placeholder(self, prepend_shape, name=None): 62 | return tf.placeholder(dtype=self.sample_dtype(), shape=prepend_shape+self.sample_shape(), name=name) 63 | 64 | def __eq__(self, other): 65 | return (type(self) == type(other)) and (self.__dict__ == other.__dict__) 66 | 67 | class CategoricalPdType(PdType): 68 | def __init__(self, ncat): 69 | self.ncat = ncat 70 | def pdclass(self): 71 | return CategoricalPd 72 | def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0): 73 | pdparam = _matching_fc(latent_vector, 'pi', self.ncat, init_scale=init_scale, init_bias=init_bias) 74 | return self.pdfromflat(pdparam), pdparam 75 | 76 | def param_shape(self): 77 | return [self.ncat] 78 | def sample_shape(self): 79 | return [] 80 | def sample_dtype(self): 81 | return tf.int32 82 | 83 | 84 | class MultiCategoricalPdType(PdType): 85 | def __init__(self, nvec): 86 | self.ncats = nvec.astype('int32') 87 | assert (self.ncats > 0).all() 88 | def pdclass(self): 89 | return MultiCategoricalPd 90 | def pdfromflat(self, flat): 91 | return MultiCategoricalPd(self.ncats, flat) 92 | 93 | def pdfromlatent(self, latent, init_scale=1.0, init_bias=0.0): 94 | pdparam = _matching_fc(latent, 'pi', self.ncats.sum(), init_scale=init_scale, init_bias=init_bias) 95 | return self.pdfromflat(pdparam), pdparam 96 | 97 | def param_shape(self): 98 | return [sum(self.ncats)] 99 | def sample_shape(self): 100 | return [len(self.ncats)] 101 | def sample_dtype(self): 102 | return tf.int32 103 | 104 | class DiagGaussianPdType(PdType): 105 | def __init__(self, size): 106 | self.size = size 107 | def pdclass(self): 108 | return DiagGaussianPd 109 | 110 | def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0): 111 | mean = _matching_fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias) 112 | logstd = tf.get_variable(name='pi/logstd', shape=[1, self.size], initializer=tf.zeros_initializer()) 113 | pdparam = 
tf.concat([mean, mean * 0.0 + logstd], axis=1) 114 | return self.pdfromflat(pdparam), mean 115 | 116 | def param_shape(self): 117 | return [2*self.size] 118 | def sample_shape(self): 119 | return [self.size] 120 | def sample_dtype(self): 121 | return tf.float32 122 | 123 | class BernoulliPdType(PdType): 124 | def __init__(self, size): 125 | self.size = size 126 | def pdclass(self): 127 | return BernoulliPd 128 | def param_shape(self): 129 | return [self.size] 130 | def sample_shape(self): 131 | return [self.size] 132 | def sample_dtype(self): 133 | return tf.int32 134 | def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0): 135 | pdparam = _matching_fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias) 136 | return self.pdfromflat(pdparam), pdparam 137 | 138 | # WRONG SECOND DERIVATIVES 139 | # class CategoricalPd(Pd): 140 | # def __init__(self, logits): 141 | # self.logits = logits 142 | # self.ps = tf.nn.softmax(logits) 143 | # @classmethod 144 | # def fromflat(cls, flat): 145 | # return cls(flat) 146 | # def flatparam(self): 147 | # return self.logits 148 | # def mode(self): 149 | # return U.argmax(self.logits, axis=-1) 150 | # def logp(self, x): 151 | # return -tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, x) 152 | # def kl(self, other): 153 | # return tf.nn.softmax_cross_entropy_with_logits(other.logits, self.ps) \ 154 | # - tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 155 | # def entropy(self): 156 | # return tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 157 | # def sample(self): 158 | # u = tf.random_uniform(tf.shape(self.logits)) 159 | # return U.argmax(self.logits - tf.log(-tf.log(u)), axis=-1) 160 | 161 | class CategoricalPd(Pd): 162 | def __init__(self, logits): 163 | self.logits = logits 164 | def flatparam(self): 165 | return self.logits 166 | def mode(self): 167 | return tf.argmax(self.logits, axis=-1) 168 | 169 | @property 170 | def mean(self): 171 | return tf.nn.softmax(self.logits) 172 | def neglogp(self, x): 173 | # return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x) 174 | # Note: we can't use sparse_softmax_cross_entropy_with_logits because 175 | # the implementation does not allow second-order derivatives... 
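        # For a single integer label a this reduces to the usual expression
        #     neglogp(a) = -log(softmax(logits)[a]) = logsumexp(logits) - logits[a];
        # routing it through a one-hot encoding and softmax_cross_entropy_with_logits_v2
        # keeps the graph twice-differentiable, which the sparse variant does not allow.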
176 | if x.dtype in {tf.uint8, tf.int32, tf.int64}: 177 | # one-hot encoding 178 | x_shape_list = x.shape.as_list() 179 | logits_shape_list = self.logits.get_shape().as_list()[:-1] 180 | for xs, ls in zip(x_shape_list, logits_shape_list): 181 | if xs is not None and ls is not None: 182 | assert xs == ls, 'shape mismatch: {} in x vs {} in logits'.format(xs, ls) 183 | 184 | x = tf.one_hot(x, self.logits.get_shape().as_list()[-1]) 185 | else: 186 | # already encoded 187 | assert x.shape.as_list() == self.logits.shape.as_list() 188 | 189 | return tf.nn.softmax_cross_entropy_with_logits_v2( 190 | logits=self.logits, 191 | labels=x) 192 | def kl(self, other): 193 | a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keepdims=True) 194 | a1 = other.logits - tf.reduce_max(other.logits, axis=-1, keepdims=True) 195 | ea0 = tf.exp(a0) 196 | ea1 = tf.exp(a1) 197 | z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True) 198 | z1 = tf.reduce_sum(ea1, axis=-1, keepdims=True) 199 | p0 = ea0 / z0 200 | return tf.reduce_sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=-1) 201 | def entropy(self): 202 | a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keepdims=True) 203 | ea0 = tf.exp(a0) 204 | z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True) 205 | p0 = ea0 / z0 206 | return tf.reduce_sum(p0 * (tf.log(z0) - a0), axis=-1) 207 | def sample(self): 208 | u = tf.random_uniform(tf.shape(self.logits), dtype=self.logits.dtype) 209 | return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=-1) 210 | @classmethod 211 | def fromflat(cls, flat): 212 | return cls(flat) 213 | 214 | class MultiCategoricalPd(Pd): 215 | def __init__(self, nvec, flat): 216 | self.flat = flat 217 | self.categoricals = list(map(CategoricalPd, 218 | tf.split(flat, np.array(nvec, dtype=np.int32), axis=-1))) 219 | def flatparam(self): 220 | return self.flat 221 | def mode(self): 222 | return tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1), tf.int32) 223 | def neglogp(self, x): 224 | return tf.add_n([p.neglogp(px) for p, px in zip(self.categoricals, tf.unstack(x, axis=-1))]) 225 | def kl(self, other): 226 | return tf.add_n([p.kl(q) for p, q in zip(self.categoricals, other.categoricals)]) 227 | def entropy(self): 228 | return tf.add_n([p.entropy() for p in self.categoricals]) 229 | def sample(self): 230 | return tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1), tf.int32) 231 | @classmethod 232 | def fromflat(cls, flat): 233 | raise NotImplementedError 234 | 235 | class DiagGaussianPd(Pd): 236 | def __init__(self, flat): 237 | self.flat = flat 238 | mean, logstd = tf.split(axis=len(flat.shape)-1, num_or_size_splits=2, value=flat) 239 | self.mean = mean 240 | self.logstd = logstd 241 | self.std = tf.exp(logstd) 242 | def flatparam(self): 243 | return self.flat 244 | def mode(self): 245 | return self.mean 246 | def neglogp(self, x): 247 | return 0.5 * tf.reduce_sum(tf.square((x - self.mean) / self.std), axis=-1) \ 248 | + 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[-1]) \ 249 | + tf.reduce_sum(self.logstd, axis=-1) 250 | def kl(self, other): 251 | assert isinstance(other, DiagGaussianPd) 252 | return tf.reduce_sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=-1) 253 | def entropy(self): 254 | return tf.reduce_sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), axis=-1) 255 | def sample(self): 256 | return self.mean + self.std * tf.random_normal(tf.shape(self.mean)) 257 | @classmethod 258 | def fromflat(cls, flat): 259 | return 
cls(flat) 260 | 261 | 262 | class BernoulliPd(Pd): 263 | def __init__(self, logits): 264 | self.logits = logits 265 | self.ps = tf.sigmoid(logits) 266 | def flatparam(self): 267 | return self.logits 268 | @property 269 | def mean(self): 270 | return self.ps 271 | def mode(self): 272 | return tf.round(self.ps) 273 | def neglogp(self, x): 274 | return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=-1) 275 | def kl(self, other): 276 | return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=-1) - tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1) 277 | def entropy(self): 278 | return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1) 279 | def sample(self): 280 | u = tf.random_uniform(tf.shape(self.ps)) 281 | return tf.to_float(math_ops.less(u, self.ps)) 282 | @classmethod 283 | def fromflat(cls, flat): 284 | return cls(flat) 285 | 286 | def make_pdtype(ac_space): 287 | from gym import spaces 288 | if isinstance(ac_space, spaces.Box): 289 | assert len(ac_space.shape) == 1 290 | return DiagGaussianPdType(ac_space.shape[0]) 291 | elif isinstance(ac_space, spaces.Discrete): 292 | return CategoricalPdType(ac_space.n) 293 | elif isinstance(ac_space, spaces.MultiDiscrete): 294 | return MultiCategoricalPdType(ac_space.nvec) 295 | elif isinstance(ac_space, spaces.MultiBinary): 296 | return BernoulliPdType(ac_space.n) 297 | else: 298 | raise NotImplementedError 299 | 300 | def shape_el(v, i): 301 | maybe = v.get_shape()[i] 302 | if maybe is not None: 303 | return maybe 304 | else: 305 | return tf.shape(v)[i] 306 | 307 | @U.in_session 308 | def test_probtypes(): 309 | np.random.seed(0) 310 | 311 | pdparam_diag_gauss = np.array([-.2, .3, .4, -.5, .1, -.5, .1, 0.8]) 312 | diag_gauss = DiagGaussianPdType(pdparam_diag_gauss.size // 2) #pylint: disable=E1101 313 | validate_probtype(diag_gauss, pdparam_diag_gauss) 314 | 315 | pdparam_categorical = np.array([-.2, .3, .5]) 316 | categorical = CategoricalPdType(pdparam_categorical.size) #pylint: disable=E1101 317 | validate_probtype(categorical, pdparam_categorical) 318 | 319 | nvec = [1,2,3] 320 | pdparam_multicategorical = np.array([-.2, .3, .5, .1, 1, -.1]) 321 | multicategorical = MultiCategoricalPdType(nvec) #pylint: disable=E1101 322 | validate_probtype(multicategorical, pdparam_multicategorical) 323 | 324 | pdparam_bernoulli = np.array([-.2, .3, .5]) 325 | bernoulli = BernoulliPdType(pdparam_bernoulli.size) #pylint: disable=E1101 326 | validate_probtype(bernoulli, pdparam_bernoulli) 327 | 328 | 329 | def validate_probtype(probtype, pdparam): 330 | N = 100000 331 | # Check to see if mean negative log likelihood == differential entropy 332 | Mval = np.repeat(pdparam[None, :], N, axis=0) 333 | M = probtype.param_placeholder([N]) 334 | X = probtype.sample_placeholder([N]) 335 | pd = probtype.pdfromflat(M) 336 | calcloglik = U.function([X, M], pd.logp(X)) 337 | calcent = U.function([M], pd.entropy()) 338 | Xval = tf.get_default_session().run(pd.sample(), feed_dict={M:Mval}) 339 | logliks = calcloglik(Xval, Mval) 340 | entval_ll = - logliks.mean() #pylint: disable=E1101 341 | entval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101 342 | entval = calcent(Mval).mean() #pylint: disable=E1101 343 | assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr # within 3 sigmas 344 | 345 | # Check to see if kldiv[p,q] = - ent[p] - E_p[log q] 346 | M2 = 
probtype.param_placeholder([N]) 347 | pd2 = probtype.pdfromflat(M2) 348 | q = pdparam + np.random.randn(pdparam.size) * 0.1 349 | Mval2 = np.repeat(q[None, :], N, axis=0) 350 | calckl = U.function([M, M2], pd.kl(pd2)) 351 | klval = calckl(Mval, Mval2).mean() #pylint: disable=E1101 352 | logliks = calcloglik(Xval, Mval2) 353 | klval_ll = - entval - logliks.mean() #pylint: disable=E1101 354 | klval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101 355 | assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr # within 3 sigmas 356 | print('ok on', probtype, pdparam) 357 | 358 | 359 | def _matching_fc(tensor, name, size, init_scale, init_bias): 360 | if tensor.shape[-1] == size: 361 | return tensor 362 | else: 363 | return fc(tensor, name, size, init_scale=init_scale, init_bias=init_bias) 364 | -------------------------------------------------------------------------------- /neural_networks/gym_utilities.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/neural_networks/gym_utilities.pyc -------------------------------------------------------------------------------- /neural_networks/market_making_models.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Mar 25 21:42:23 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | import tensorflow as tf, numpy as np 10 | from collections import deque 11 | 12 | 13 | activation_dictionary = {'elu': tf.nn.elu, 14 | 'relu': tf.nn.relu, 15 | 'selu': tf.nn.selu, 16 | 'sigmoid': tf.nn.sigmoid, 17 | 'softmax': tf.nn.softmax, 18 | None: None} 19 | 20 | def fully_connected_layer(inputs, units, activation, gain=np.sqrt(2)): 21 | 22 | return tf.layers.dense(inputs=inputs, 23 | units=units, 24 | activation=activation_dictionary[activation], 25 | kernel_initializer=tf.orthogonal_initializer(gain)) 26 | 27 | class Memory(): 28 | 29 | def __init__(self, max_size): 30 | self.buffer = deque(maxlen = max_size) 31 | 32 | def add(self, experience): 33 | self.buffer.append(experience) 34 | 35 | def sample(self, batch_size): 36 | buffer_size = len(self.buffer) 37 | index = np.random.choice(np.arange(buffer_size), 38 | size=batch_size, 39 | replace=True) 40 | 41 | return [self.buffer[i] for i in index] 42 | 43 | 44 | class DeepQNetworkMM(): 45 | 46 | def __init__(self, n_units, n_classes, state_size, action_size, learning_rate): 47 | self.state_size = state_size 48 | self.action_size = action_size 49 | self.learning_rate = learning_rate 50 | self.n_units = n_units 51 | self.n_classes = n_classes 52 | 53 | self.input_matrix = tf.placeholder(tf.float32, [None, state_size]) 54 | self.actions = tf.placeholder(tf.float32, [None, n_classes]) 55 | self.target_Q = tf.placeholder(tf.float32, [None]) 56 | 57 | 58 | self.layer1 = fully_connected_layer(inputs=self.input_matrix, 59 | units=self.n_units, 60 | activation='selu') 61 | 62 | self.hidden_layer = fully_connected_layer(inputs=self.layer1, 63 | units=self.n_units, 64 | activation='selu') 65 | 66 | self.output_layer = fully_connected_layer(inputs=self.hidden_layer, 67 | units=n_classes, 68 | activation=None) 69 | 70 | self.predicted_Q = tf.reduce_sum(tf.multiply(self.output_layer, self.actions), axis=1) 71 | 72 | self.error_rate = tf.reduce_mean(tf.square(self.target_Q - self.predicted_Q)) 73 | 74 | self.optimizer = 
tf.train.RMSPropOptimizer(self.learning_rate).minimize(self.error_rate) 75 | 76 | -------------------------------------------------------------------------------- /neural_networks/market_making_models.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/neural_networks/market_making_models.pyc -------------------------------------------------------------------------------- /neural_networks/models.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Feb 20 21:49:13 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | import tensorflow as tf, numpy as np 10 | from baselines.common.distributions import make_pdtype 11 | 12 | activation_dictionary = {'elu': tf.nn.elu, 13 | 'relu': tf.nn.relu, 14 | 'selu': tf.nn.selu, 15 | 'sigmoid': tf.nn.sigmoid, 16 | 'softmax': tf.nn.softmax, 17 | None: None} 18 | 19 | def normalized_columns_initializer(standard_deviation=1.0): 20 | def initializer(shape, dtype=None, partition_info=None): 21 | output = np.random.randn(*shape).astype(np.float32) 22 | output *= standard_deviation/float(np.sqrt(np.square(output).sum(axis=0, keepdims=True))) 23 | return tf.constant(output) 24 | return initializer 25 | 26 | def linear_operation(x, size, name, initializer=None, bias_init=0): 27 | with tf.variable_scope(name): 28 | weights = tf.get_variable("w", [x.get_shape()[1], size], initializer=initializer) 29 | biases = tf.get_variable("b", [size], initializer=tf.constant_initializer(bias_init)) 30 | return tf.matmul(x, weights) + biases 31 | 32 | def convolution_layer(inputs, dimensions, filters, kernel_size, strides, gain=np.sqrt(2), activation='relu'): 33 | 34 | if dimensions == 3: 35 | 36 | return tf.layers.conv1d(inputs=inputs, 37 | filters=filters, 38 | kernel_size=kernel_size, 39 | kernel_initializer=tf.orthogonal_initializer(gain), 40 | strides=(strides), 41 | activation=activation_dictionary[activation]) 42 | elif dimensions == 4: 43 | 44 | return tf.layers.conv2d(inputs=inputs, 45 | filters=filters, 46 | kernel_size=kernel_size, 47 | kernel_initializer=tf.orthogonal_initializer(gain), 48 | strides=(strides), 49 | activation=activation_dictionary[activation]) 50 | 51 | 52 | def fully_connected_layer(inputs, units, activation, gain=np.sqrt(2)): 53 | return tf.layers.dense(inputs=inputs, 54 | units=units, 55 | activation=activation_dictionary[activation], 56 | kernel_initializer=tf.orthogonal_initializer(gain)) 57 | 58 | def lstm_layer(input, size, actions, apply_softmax=False): 59 | input = tf.expand_dims(input, [0]) 60 | lstm = tf.contrib.rnn.BasicLSTMCell(size, state_is_tuple=True) 61 | state_size = lstm.state_size 62 | step_size = tf.shape(input)[:1] 63 | cell_init = np.zeros((1, state_size.c), np.float32) 64 | hidden_init = np.zeros((1, state_size.h), np.float32) 65 | initial_state = [cell_init, hidden_init] 66 | cell_state = tf.placeholder(tf.float32, [1, state_size.c]) 67 | hidden_state = tf.placeholder(tf.float32, [1, state_size.h]) 68 | input_state = tf.contrib.rnn.LSTMStateTuple(cell_state, hidden_state) 69 | 70 | _outputs, states = tf.nn.dynamic_rnn(cell=lstm, 71 | inupts=input, 72 | initial_state=input_state, 73 | sequence_length=step_size, 74 | time_major=False) 75 | _cell_state, _hidden_state = states 76 | output = tf.reshape(_outputs, [-1, size]) 77 | output_state = [_cell_state[:1, :], 
_hidden_state[:1, :]] 78 | output = linear_operation(output, actions, "logits", normalized_columns_initializer(0.01)) 79 | output = tf.nn.softmax(output, dim=-1) 80 | return output, _cell_state, _hidden_state 81 | 82 | def create_weights_biases(n_layers, n_units, n_columns, n_outputs): 83 | ''' 84 | Creates dictionaries of variable length for differing neural network models 85 | 86 | Arguments 87 | 88 | n_layers - int - number of layers 89 | n_units - int - number of neurons within each individual layer 90 | n_columns - int - number of columns within dataset 91 | 92 | :return: dict (int), dict (int) 93 | ''' 94 | weights, biases = {}, {} 95 | for i in range(n_layers): 96 | if i == 0: 97 | weights['layer'+str(i)] = tf.Variable(tf.random_normal([n_columns, n_units])) 98 | biases['layer'+str(i)] = tf.Variable(tf.random_normal([n_columns])) 99 | elif i != 0 and i != n_layers-1: 100 | weights['layer'+str(i)] = tf.Variable(tf.random_normal([n_units, n_units])) 101 | biases['layer'+str(i)] = tf.Variable(tf.random_normal([n_units])) 102 | elif i != 0 and i == n_layers-1: 103 | weights['output_layer'] = tf.Variable(tf.random_normal([n_units, n_outputs])) 104 | biases['output_layer'] = tf.Variable(tf.random_normal([n_outputs])) 105 | 106 | return weights, biases 107 | 108 | def create_input_output(input_dtype, output_dtype, n_columns, n_outputs): 109 | ''' 110 | Create placeholder variables for tensorflow graph 111 | 112 | ''' 113 | X = tf.placeholder(shape=(None, n_columns), dtype=input_dtype) 114 | Y = tf.placeholder(shape=(None, n_outputs), dtype=output_dtype) 115 | return X, Y 116 | 117 | 118 | class DeepQNetwork(): 119 | 120 | def __init__(self, n_units, n_classes, n_filters, stride, kernel, state_size, action_size, learning_rate): 121 | self.state_size = state_size 122 | self.action_size = action_size 123 | self.learning_rate = learning_rate 124 | self.n_units = n_units 125 | self.n_classes = n_classes 126 | self.n_filters = n_filters 127 | self.stride = stride 128 | self.kernel = kernel 129 | 130 | self.input_matrix = tf.placeholder(tf.float32, [None, state_size]) 131 | self.actions = tf.placeholder(tf.float32, [None, n_classes]) 132 | self.target_Q = tf.placeholder(tf.float32, [None]) 133 | 134 | 135 | self.network1 = convolution_layer(inputs=self.input_matrix, 136 | filters=self.n_filters, 137 | kernel_size=self.kernel, 138 | strides=self.stride, 139 | dimensions=4, 140 | activation='elu') 141 | 142 | self.network1 = tf.layers.batch_normalization(self.network1, 143 | training=True, 144 | epsilon=1e-5) 145 | 146 | self.network2 = convolution_layer(inputs=self.network1, 147 | filters=self.n_filters*2, 148 | kernel_size=int(self.kernel/2), 149 | strides=int(self.stride/2), 150 | dimensions=4, 151 | activation='elu') 152 | 153 | self.network2 = tf.layers.batch_normalization(inputs=self.network2, 154 | training=True, 155 | epsilon=1e-5) 156 | 157 | self.network3 = convolution_layer(inputs=self.network2, 158 | filters=self.n_filters*4, 159 | kernel_size=int(self.kernel/2), 160 | strides=int(self.stride/2), 161 | dimensions=4, 162 | activation='elu') 163 | 164 | self.network3 = tf.layers.batch_normalization(inputs=self.network3, 165 | training=True, 166 | epsilon=1e-5) 167 | 168 | self.network3 = tf.layers.flatten(inputs=self.network3) 169 | 170 | self.output = fully_connected_layer(inputs=self.network3, 171 | units=self.n_units, 172 | activation='elu') 173 | 174 | self.output = fully_connected_layer(inputs=self.output, 175 | units=n_classes, 176 | activation=None) 177 | 178 | self.predicted_Q = 
tf.reduce_sum(tf.multiply(self.output, self.actions), axis=1) 179 | 180 | self.error_rate = tf.reduce_mean(tf.square(self.target_Q - self.predicted_Q)) 181 | 182 | self.optimizer = tf.train.RMSPropOptimizer(self.learning_rate).minimize(self.error_rate) 183 | 184 | 185 | class ActorCriticModel(): 186 | 187 | def __init__(self, session, environment, action_space, n_batches, n_steps, reuse=False): 188 | 189 | session.run(tf.global_variables_initializer()) 190 | self.distribution_type = make_pdtype(action_space) 191 | height, weight, channel = environment.shape 192 | inputs_ = tf.placeholder(tf.float32, [height, weight, channel], name='inputs') 193 | scaled_images = tf.cast(inputs_, tf.float32)/float(255) 194 | 195 | with tf.variable_scope('model', reuse=reuse): 196 | 197 | layer1 = tf.layers.batch_normalization(convolution_layer(inputs=scaled_images, 198 | filters=32, 199 | kernel_size=8, 200 | strides=4, 201 | dimensions=3)) 202 | 203 | layer2 = tf.layers.batch_normalization(convolution_layer(inputs=tf.nn.relu(layer1), 204 | filters=64, 205 | kernel_size=4, 206 | strides=2, 207 | dimensions=3)) 208 | 209 | layer3 = tf.layers.batch_normalization(convolution_layer(inputs=tf.nn.relu(layer2), 210 | filters=64, 211 | kernel_size=3, 212 | strides=1, 213 | dimensions=3)) 214 | 215 | layer3 = tf.layers.flatten(inputs=layer3) 216 | output_layer = fully_connected_layer(inputs=layer3, units=512, activation='softmax') 217 | self.distribution, self.logits = self.distribution_type.pdfromlatent(output_layer, init_scale=0.01) 218 | value_function = fully_connected_layer(output_layer, units=1, activation=None)[:, 0] 219 | 220 | self.initial_state = None 221 | sampled_action = self.distribution.sample() 222 | 223 | def step(current_state, *_args, **_kwargs): 224 | action, value = session.run([sampled_action, value_function], {inputs_: current_state}) 225 | return action, value 226 | 227 | def value(current_state, *_args, **_kwargs): 228 | return session.run(value_function, {inputs_: current_state}) 229 | 230 | def select_action(current_state, *_args, **_kwargs): 231 | return session.run(sampled_action, {inputs_: current_state}) 232 | 233 | self.inputs_ = inputs_ 234 | self.value_function = value_function 235 | self.step = step 236 | self.value = value 237 | self.select_action = select_action 238 | 239 | 240 | class A3CModel(): 241 | 242 | def __init__(self, s_size, a_size, scope, trainer): 243 | 244 | with tf.variable_scope(scope): 245 | 246 | self.input_layer = tf.placeholder(shape=[None, s_size], 247 | dtype=tf.float32) 248 | 249 | self.input_layer = tf.reshape(self.input_layer, 250 | shape=[-1,84,84,1]) 251 | 252 | self.layer1 = tf.layers.batch_normalization(convolution_layer(inputs=input_layer, 253 | filters=32, 254 | kernel_size=8, 255 | strides=4, 256 | dimensions=3)) 257 | 258 | self.layer2 = tf.layers.batch_normalization(convolution_layer(inputs=tf.nn.relu(layer1), 259 | filters=64, 260 | kernel_size=4, 261 | strides=2, 262 | dimensions=3)) 263 | 264 | layer3 = tf.layers.flatten(inputs=layer3) 265 | 266 | output_layer = fully_connected_layer(inputs=layer3, 267 | units=512, 268 | activation='softmax') 269 | 270 | outputs, cell_state, hidden_state = lstm_layer(input=hidden, 271 | size=s_size, 272 | actions=a_size, 273 | apply_softmax=False) 274 | 275 | self.state_out = (cell_state[:1, :], hidden_state[:1, :]) 276 | ouptut_layer = tf.reshape(outputs, [-1, 256]) 277 | 278 | self.policy = slim.fully_connected(input=output_layer, 279 | n_units=a_size, 280 | activation_fn=tf.nn.softmax, 281 | 
weights_initializer=normalized_columns_initializer(0.01), 282 | biases_initializer=None) 283 | 284 | self.value = slim.fully_connected(input=rnn_out, 285 | n_units=1, 286 | activation_fn=None, 287 | weights_initializer=normalized_columns_initializer(1.0), 288 | biases_initializer=None) 289 | 290 | if scope != 'global': 291 | self.actions = tf.placeholder(shape=[None],dtype=tf.int32) 292 | self.actions_onehot = tf.one_hot(self.actions,a_size,dtype=tf.float32) 293 | self.target_v = tf.placeholder(shape=[None],dtype=tf.float32) 294 | self.advantages = tf.placeholder(shape=[None],dtype=tf.float32) 295 | 296 | self.responsible_outputs = tf.reduce_sum(self.policy * self.actions_onehot, [1]) 297 | 298 | self.value_loss = 0.5 * tf.reduce_sum(tf.square(self.target_v - tf.reshape(self.value,[-1]))) 299 | self.entropy = - tf.reduce_sum(self.policy * tf.log(self.policy)) 300 | self.policy_loss = -tf.reduce_sum(tf.log(self.responsible_outputs)*self.advantages) 301 | self.loss = 0.5 * self.value_loss + self.policy_loss - self.entropy * 0.01 302 | 303 | local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope) 304 | self.gradients = tf.gradients(self.loss, local_vars) 305 | self.var_norms = tf.global_norm(local_vars) 306 | grads,self.grad_norms = tf.clip_by_global_norm(self.gradients,40.0) 307 | 308 | global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global') 309 | self.apply_grads = trainer.apply_gradients(zip(grads,global_vars)) 310 | 311 | 312 | 313 | -------------------------------------------------------------------------------- /neural_networks/models.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/neural_networks/models.pyc -------------------------------------------------------------------------------- /neural_networks/policy_gradient_utilities.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Mar 25 15:22:27 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | import keras.layers as layers 10 | from keras import backend 11 | from keras.models import Model 12 | from keras.optimizers import Adam 13 | from keras.initializers import glorot_uniform 14 | 15 | class PolicyGradient(): 16 | 17 | def __init__(self, n_units, n_layers, n_columns, n_outputs, learning_rate, hidden_activation, output_activation, loss_function): 18 | self.n_units = n_units 19 | self.n_layers = n_layers 20 | self.n_columns = n_columns 21 | self.n_outputs = n_outputs 22 | self.hidden_activation = hidden_activation 23 | self.output_activation = output_activation 24 | self.learning_rate = learning_rate 25 | self.loss_function = loss_function 26 | 27 | def create_policy_model(self, input_shape): 28 | input_layer = layers.Input(shape=input_shape) 29 | advantages = layers.Input(shape=[1]) 30 | 31 | hidden_layer = layers.Dense(units=self.n_units, 32 | activation=self.hidden_activation, 33 | use_bias=False, 34 | kernel_initializer=glorot_uniform(seed=42))(input_layer) 35 | 36 | output_layer = layers.Dense(units=self.n_outputs, 37 | activation=self.output_activation, 38 | use_bias=False, 39 | kernel_initializer=glorot_uniform(seed=42))(hidden_layer) 40 | 41 | def log_likelihood_loss(actual_labels, predicted_labels): 42 | log_likelihood = backend.log(actual_labels * (actual_labels - predicted_labels) + 43 | (1 - actual_labels) * 
(actual_labels + predicted_labels)) 44 | return backend.mean(log_likelihood * advantages, keepdims=True) 45 | 46 | if self.loss_function == 'log_likelihood': 47 | self.loss_function = log_likelihood_loss 48 | else: 49 | self.loss_function = 'categorical_crossentropy' 50 | 51 | policy_model = Model(inputs=[input_layer, advantages], outputs=output_layer) 52 | policy_model.compile(loss=self.loss_function, optimizer=Adam(self.learning_rate)) 53 | model_prediction = Model(input=[input_layer], outputs=output_layer) 54 | return policy_model, model_prediction 55 | -------------------------------------------------------------------------------- /neural_networks/policy_gradient_utilities.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/applied-reinforcement-learning-w-python/c10fe7ae9b6c629b18af761f15067e6b8ddaa5d6/neural_networks/policy_gradient_utilities.pyc -------------------------------------------------------------------------------- /neural_networks/untitled4.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Feb 25 12:16:24 2019 5 | 6 | @author: tawehbeysolow 7 | """ 8 | 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | 12 | import gym 13 | env = gym.make("CartPole-v0") 14 | 15 | # Constants defining our neural network 16 | hidden_layer_neurons = 8 17 | gamma = .99 18 | dimen = len(env.reset()) 19 | print_every = 100 20 | batch_size = 50 21 | num_episodes = 10000 22 | render = False 23 | lr = 1e-2 24 | goal = 190 25 | 26 | import keras.layers as layers 27 | from keras.models import Model 28 | from keras.optimizers import Adam 29 | import keras.backend as K 30 | from keras.initializers import glorot_uniform 31 | 32 | def get_policy_model(env, hidden_layer_neurons, lr): 33 | dimen = env.reset().shape 34 | num_actions = env.action_space.n 35 | inp = layers.Input(shape=dimen,name="input_x") 36 | adv = layers.Input(shape=[1], name="advantages") 37 | x = layers.Dense(hidden_layer_neurons, 38 | activation="relu", 39 | use_bias=False, 40 | kernel_initializer=glorot_uniform(seed=42), 41 | name="dense_1")(inp) 42 | out = layers.Dense(num_actions, 43 | activation="softmax", 44 | kernel_initializer=glorot_uniform(seed=42), 45 | use_bias=False, 46 | name="out")(x) 47 | 48 | def custom_loss(y_true, y_pred): 49 | # actual: 0 predict: 0 -> log(0 * (0 - 0) + (1 - 0) * (0 + 0)) = -inf 50 | # actual: 1 predict: 1 -> log(1 * (1 - 1) + (1 - 1) * (1 + 1)) = -inf 51 | # actual: 1 predict: 0 -> log(1 * (1 - 0) + (1 - 1) * (1 + 0)) = 0 52 | # actual: 0 predict: 1 -> log(0 * (0 - 1) + (1 - 0) * (0 + 1)) = 0 53 | log_lik = K.log(y_true * (y_true - y_pred) + (1 - y_true) * (y_true + y_pred)) 54 | return K.mean(log_lik * adv, keepdims=True) 55 | 56 | model_train = Model(inputs=[inp, adv], outputs=out) 57 | model_train.compile(loss=custom_loss, optimizer=Adam(lr)) 58 | model_predict = Model(inputs=[inp], outputs=out) 59 | return model_train, model_predict 60 | 61 | def discount_rewards(r, gamma=0.99): 62 | """Takes 1d float array of rewards and computes discounted reward 63 | e.g. 
f([1, 1, 1], 0.99) -> [2.9701, 1.99, 1] 64 | """ 65 | prior = 0 66 | out = [] 67 | for val in r: 68 | new_val = val + prior * gamma 69 | out.append(new_val) 70 | prior = new_val 71 | return np.array(out[::-1]) 72 | 73 | # See our trained bot in action 74 | def score_model(model, num_tests, render=False): 75 | scores = [] 76 | for num_test in range(num_tests): 77 | observation = env.reset() 78 | reward_sum = 0 79 | while True: 80 | if render: 81 | env.render() 82 | 83 | state = np.reshape(observation, [1, dimen]) 84 | predict = model.predict([state])[0] 85 | action = np.argmax(predict) 86 | observation, reward, done, _ = env.step(action) 87 | reward_sum += reward 88 | if done: 89 | break 90 | scores.append(reward_sum) 91 | env.close() 92 | return np.mean(scores) 93 | 94 | model_train, model_predict = get_policy_model(env, hidden_layer_neurons, lr) 95 | model_predict.summary() 96 | 97 | reward_sum = 0 98 | 99 | num_actions = env.action_space.n 100 | 101 | # Placeholders for our observations, outputs and rewards 102 | states = np.empty(0).reshape(0,dimen) 103 | actions = np.empty(0).reshape(0,1) 104 | rewards = np.empty(0).reshape(0,1) 105 | discounted_rewards = np.empty(0).reshape(0,1) 106 | 107 | # Setting up our environment 108 | observation = env.reset() 109 | 110 | num_episode = 0 111 | 112 | losses = [] 113 | 114 | while num_episode < num_episodes: 115 | # Append the observations to our batch 116 | state = np.reshape(observation, [1, dimen]) 117 | 118 | predict = model_predict.predict([state])[0] 119 | action = np.random.choice(range(num_actions),p=predict) 120 | 121 | # Append the observations and outputs for learning 122 | states = np.vstack([states, state]) 123 | actions = np.vstack([actions, action]) 124 | 125 | # Determine the oucome of our action 126 | observation, reward, done, _ = env.step(action) 127 | reward_sum += reward 128 | rewards = np.vstack([rewards, reward]) 129 | 130 | if done: 131 | # Determine standardized rewards 132 | discounted_rewards_episode = discount_rewards(rewards, gamma) 133 | discounted_rewards = np.vstack([discounted_rewards, discounted_rewards_episode]) 134 | 135 | rewards = np.empty(0).reshape(0,1) 136 | 137 | if (num_episode + 1) % batch_size == 0: 138 | discounted_rewards -= discounted_rewards.mean() 139 | discounted_rewards /= discounted_rewards.std() 140 | discounted_rewards = discounted_rewards.squeeze() 141 | actions = actions.squeeze().astype(int) 142 | 143 | actions_train = np.zeros([len(actions), num_actions]) 144 | actions_train[np.arange(len(actions)), actions] = 1 145 | 146 | loss = model_train.train_on_batch([states, discounted_rewards], actions_train) 147 | losses.append(loss) 148 | 149 | # Clear out game variables 150 | states = np.empty(0).reshape(0,dimen) 151 | actions = np.empty(0).reshape(0,1) 152 | discounted_rewards = np.empty(0).reshape(0,1) 153 | 154 | 155 | # Print periodically 156 | if (num_episode + 1) % print_every == 0: 157 | # Print status 158 | score = score_model(model_predict,10) 159 | print("Average reward for training episode {}: {:0.2f} Test Score: {:0.2f} Loss: {:0.6f} ".format( 160 | (num_episode + 1), reward_sum/print_every, 161 | score, 162 | np.mean(losses[-print_every:]))) 163 | 164 | if score >= goal: 165 | print("Solved in {} episodes!".format(num_episode)) 166 | break 167 | reward_sum = 0 168 | 169 | num_episode += 1 170 | observation = env.reset() 171 | 172 | 173 | 174 | 175 | 176 | 177 | -------------------------------------------------------------------------------- /requirements.txt: 
-------------------------------------------------------------------------------- 1 | gym 2 | box2d-py 3 | vizdoom 4 | tensorflow-gpu 5 | baselines 6 | # collections is part of the Python standard library and does not need to be installed via pip 7 | keras 8 | --------------------------------------------------------------------------------
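A quick, optional sanity check (not part of the repository) that the dependencies listed above resolve in the active interpreter; the import names, such as Box2D for box2d-py and tensorflow for tensorflow-gpu, are assumptions about how each package is imported rather than entries from requirements.txt:

    import importlib

    # requirement name -> module name it is typically imported as
    modules = {'gym': 'gym', 'box2d-py': 'Box2D', 'vizdoom': 'vizdoom',
               'tensorflow-gpu': 'tensorflow', 'baselines': 'baselines', 'keras': 'keras'}

    for requirement, module in modules.items():
        try:
            importlib.import_module(module)
            print('ok: {} ({})'.format(requirement, module))
        except ImportError as error:
            print('missing: {} -> {}'.format(requirement, error))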