├── .gitignore ├── LICENSE ├── README.md ├── code ├── bandits │ ├── __init__.py │ ├── actors.py │ ├── agents.py │ ├── estimators.py │ └── utils.py ├── constants.py ├── exercises │ ├── __init__.py │ ├── ex_11_3 │ │ ├── __init__.py │ │ └── q_learning_bairds_counter_example.py │ ├── ex_2_11 │ │ ├── __init__.py │ │ ├── analysis.py │ │ └── run.py │ ├── ex_2_5 │ │ ├── __init__.py │ │ ├── analysis.py │ │ └── run.py │ ├── ex_4_7 │ │ ├── __init__.py │ │ ├── analysis.py │ │ ├── jacks_car_rental │ │ │ ├── __init__.py │ │ │ └── model.py │ │ └── run.py │ ├── ex_4_9 │ │ ├── __init__.py │ │ └── gamblers_problem.py │ ├── ex_5_10 │ │ ├── __init__.py │ │ ├── analyse.py │ │ ├── model.py │ │ ├── run.py │ │ └── utils.py │ ├── ex_6_10 │ │ ├── __init__.py │ │ └── stochastic_windy_gridworld.py │ ├── ex_6_9 │ │ ├── __init__.py │ │ └── windy_gridworld.py │ ├── ex_7_2 │ │ ├── __init__.py │ │ └── comparison.py │ ├── ex_8_4 │ │ ├── __init__.py │ │ └── dynaq_gridworld_comparison.py │ ├── ex_8_8 │ │ ├── __init__.py │ │ └── update_distribution_example.py │ ├── tests │ │ ├── __init__.py │ │ └── ex_2_5 │ │ │ └── __init__.py │ └── utils.py ├── generic │ ├── __init__.py │ ├── agents │ │ ├── __init__.py │ │ └── dyna_q.py │ ├── environments.py │ ├── policies.py │ ├── updates.py │ └── utils.py ├── plotting.py ├── requirements.txt └── tests │ └── bandits │ └── crappy_tests.py ├── exercises ├── chapters │ ├── chapter1 │ │ └── chapter1_content.tex │ ├── chapter10 │ │ ├── chapter10.pdf │ │ ├── chapter10.tex │ │ └── chapter10_content.tex │ ├── chapter11 │ │ ├── chapter11.pdf │ │ ├── chapter11.tex │ │ └── chapter11_content.tex │ ├── chapter12 │ │ ├── chapter12.tex │ │ └── chapter12_content.tex │ ├── chapter13 │ │ ├── chapter13.pdf │ │ ├── chapter13.tex │ │ └── chapter13_content.tex │ ├── chapter14 │ │ └── chapter14_content.tex │ ├── chapter15 │ │ └── chapter15_content.tex │ ├── chapter16 │ │ └── chapter16_content.tex │ ├── chapter17 │ │ └── chapter17_content.tex │ ├── chapter2 │ │ └── chapter2_content.tex │ ├── chapter3 │ │ └── chapter3_content.tex │ ├── chapter4 │ │ └── chapter4_content.tex │ ├── chapter5 │ │ └── chapter5_content.tex │ ├── chapter6 │ │ └── chapter6_content.tex │ ├── chapter7 │ │ ├── chapter7.pdf │ │ ├── chapter7.tex │ │ └── chapter7_content.tex │ ├── chapter8 │ │ ├── chapter8.pdf │ │ ├── chapter8.tex │ │ └── chapter8_content.tex │ └── chapter9 │ │ ├── chapter9.pdf │ │ ├── chapter9.tex │ │ └── chapter9_content.tex ├── exercises.pdf └── exercises.tex ├── header.tex ├── notes ├── chapters │ ├── chapter1 │ │ └── chapter1_content.tex │ ├── chapter10 │ │ ├── chapter10.pdf │ │ ├── chapter10.tex │ │ └── chapter10_content.tex │ ├── chapter11 │ │ ├── chapter11.pdf │ │ ├── chapter11.tex │ │ └── chapter11_content.tex │ ├── chapter12 │ │ └── chapter12_content.tex │ ├── chapter13 │ │ ├── chapter13.pdf │ │ ├── chapter13.tex │ │ └── chapter13_content.tex │ ├── chapter14 │ │ └── chapter14_content.tex │ ├── chapter15 │ │ └── chapter15_content.tex │ ├── chapter16 │ │ └── chapter16_content.tex │ ├── chapter17 │ │ └── chapter17_content.tex │ ├── chapter2 │ │ └── chapter2_content.tex │ ├── chapter3 │ │ └── chapter3_content.tex │ ├── chapter4 │ │ └── chapter4_content.tex │ ├── chapter5 │ │ └── chapter5_content.tex │ ├── chapter6 │ │ ├── chapter6.pdf │ │ ├── chapter6.tex │ │ └── chapter6_content.tex │ ├── chapter7 │ │ ├── chapter7.pdf │ │ ├── chapter7.tex │ │ └── chapter7_content.tex │ ├── chapter8 │ │ ├── .chapter.tex.swp │ │ ├── chapter8.pdf │ │ ├── chapter8.tex │ │ └── chapter8_content.tex │ └── chapter9 │ │ ├── chapter9.pdf │ │ ├── 
chapter9.tex │ │ └── chapter9_content.tex ├── notes.pdf └── notes.tex └── todo.md /.gitignore: -------------------------------------------------------------------------------- 1 | data/**/* 2 | /**/__pycache__ 3 | 4 | ## Core latex/pdflatex auxiliary files: 5 | *.aux 6 | *.lof 7 | *.log 8 | *.lot 9 | *.fls 10 | *.out 11 | *.toc 12 | *.fmt 13 | *.fot 14 | *.cb 15 | *.cb2 16 | .*.lb 17 | 18 | ## Intermediate documents: 19 | *.dvi 20 | *.xdv 21 | *-converted-to.* 22 | # these rules might exclude image files for figures etc. 23 | # *.ps 24 | # *.eps 25 | # *.pdf 26 | 27 | ## Generated if empty string is given at "Please type another file name for output:" 28 | .pdf 29 | 30 | ## Bibliography auxiliary files (bibtex/biblatex/biber): 31 | *.bbl 32 | *.bcf 33 | *.blg 34 | *-blx.aux 35 | *-blx.bib 36 | *.run.xml 37 | 38 | ## Build tool auxiliary files: 39 | *.fdb_latexmk 40 | *.synctex 41 | *.synctex(busy) 42 | *.synctex.gz 43 | *.synctex.gz(busy) 44 | *.pdfsync 45 | 46 | ## Auxiliary and intermediate files from other packages: 47 | # algorithms 48 | *.alg 49 | *.loa 50 | 51 | # achemso 52 | acs-*.bib 53 | 54 | # amsthm 55 | *.thm 56 | 57 | # beamer 58 | *.nav 59 | *.pre 60 | *.snm 61 | *.vrb 62 | 63 | # changes 64 | *.soc 65 | 66 | # cprotect 67 | *.cpt 68 | 69 | # elsarticle (documentclass of Elsevier journals) 70 | *.spl 71 | 72 | # endnotes 73 | *.ent 74 | 75 | # fixme 76 | *.lox 77 | 78 | # feynmf/feynmp 79 | *.mf 80 | *.mp 81 | *.t[1-9] 82 | *.t[1-9][0-9] 83 | *.tfm 84 | 85 | #(r)(e)ledmac/(r)(e)ledpar 86 | *.end 87 | *.?end 88 | *.[1-9] 89 | *.[1-9][0-9] 90 | *.[1-9][0-9][0-9] 91 | *.[1-9]R 92 | *.[1-9][0-9]R 93 | *.[1-9][0-9][0-9]R 94 | *.eledsec[1-9] 95 | *.eledsec[1-9]R 96 | *.eledsec[1-9][0-9] 97 | *.eledsec[1-9][0-9]R 98 | *.eledsec[1-9][0-9][0-9] 99 | *.eledsec[1-9][0-9][0-9]R 100 | 101 | # glossaries 102 | *.acn 103 | *.acr 104 | *.glg 105 | *.glo 106 | *.gls 107 | *.glsdefs 108 | 109 | # gnuplottex 110 | *-gnuplottex-* 111 | 112 | # gregoriotex 113 | *.gaux 114 | *.gtex 115 | 116 | # htlatex 117 | *.4ct 118 | *.4tc 119 | *.idv 120 | *.lg 121 | *.trc 122 | *.xref 123 | 124 | # hyperref 125 | *.brf 126 | 127 | # knitr 128 | *-concordance.tex 129 | # TODO Comment the next line if you want to keep your tikz graphics files 130 | *.tikz 131 | *-tikzDictionary 132 | 133 | # listings 134 | *.lol 135 | 136 | # makeidx 137 | *.idx 138 | *.ilg 139 | *.ind 140 | *.ist 141 | 142 | # minitoc 143 | *.maf 144 | *.mlf 145 | *.mlt 146 | *.mtc[0-9]* 147 | *.slf[0-9]* 148 | *.slt[0-9]* 149 | *.stc[0-9]* 150 | 151 | # minted 152 | _minted* 153 | *.pyg 154 | 155 | # morewrites 156 | *.mw 157 | 158 | # nomencl 159 | *.nlo 160 | 161 | # pax 162 | *.pax 163 | 164 | # pdfpcnotes 165 | *.pdfpc 166 | 167 | # sagetex 168 | *.sagetex.sage 169 | *.sagetex.py 170 | *.sagetex.scmd 171 | 172 | # scrwfile 173 | *.wrt 174 | 175 | # sympy 176 | *.sout 177 | *.sympy 178 | sympy-plots-for-*.tex/ 179 | 180 | # pdfcomment 181 | *.upa 182 | *.upb 183 | 184 | # pythontex 185 | *.pytxcode 186 | pythontex-files-*/ 187 | 188 | # thmtools 189 | *.loe 190 | 191 | # TikZ & PGF 192 | *.dpth 193 | *.md5 194 | *.auxlock 195 | 196 | # todonotes 197 | *.tdo 198 | 199 | # easy-todo 200 | *.lod 201 | 202 | # xindy 203 | *.xdy 204 | 205 | # xypic precompiled matrices 206 | *.xyc 207 | 208 | # endfloat 209 | *.ttt 210 | *.fff 211 | 212 | # Latexian 213 | TSWLatexianTemp* 214 | 215 | ## Editors: 216 | # WinEdt 217 | *.bak 218 | *.sav 219 | 220 | # Texpad 221 | .texpadtmp 222 | 223 | # Kile 224 | *.backup 225 | 226 | # KBibTeX 227 | *~[0-9]* 
228 | 229 | # auto folder when using emacs and auctex 230 | ./auto/* 231 | *.el 232 | 233 | # expex forward references with \gathertags 234 | *-tags.tex 235 | 236 | # standalone packages 237 | *.sta 238 | 239 | 240 | ### JETBRAINS 241 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 242 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 243 | 244 | code/.idea/* 245 | code.idea/**/* 246 | 247 | # User-specific stuff 248 | .idea/**/workspace.xml 249 | .idea/**/tasks.xml 250 | .idea/dictionaries 251 | 252 | # Sensitive or high-churn files 253 | .idea/**/dataSources/ 254 | .idea/**/dataSources.ids 255 | .idea/**/dataSources.local.xml 256 | .idea/**/sqlDataSources.xml 257 | .idea/**/dynamic.xml 258 | .idea/**/uiDesigner.xml 259 | 260 | # Gradle 261 | .idea/**/gradle.xml 262 | .idea/**/libraries 263 | 264 | # CMake 265 | cmake-build-debug/ 266 | cmake-build-release/ 267 | 268 | # Mongo Explorer plugin 269 | .idea/**/mongoSettings.xml 270 | 271 | # File-based project format 272 | *.iws 273 | 274 | # IntelliJ 275 | out/ 276 | 277 | # mpeltonen/sbt-idea plugin 278 | .idea_modules/ 279 | 280 | # JIRA plugin 281 | atlassian-ide-plugin.xml 282 | 283 | # Cursive Clojure plugin 284 | .idea/replstate.xml 285 | 286 | # Crashlytics plugin (for Android Studio and IntelliJ) 287 | com_crashlytics_export_strings.xml 288 | crashlytics.properties 289 | crashlytics-build.properties 290 | fabric.properties 291 | 292 | # Editor-based Rest Client 293 | .idea/httpRequests -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 brynhayder 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reinforcement Learning: An Introduction 2 | Notes and exercise solutions to the second edition of Sutton & Barto's book. 3 | 4 | **Some of the solutions have mistakes, so be mindful! If you spot a mistake, please raise an issue and (at some point...) 
I'll fix it.** 5 | 6 | ----- 7 | 8 | Notes: 9 | - The code has been refactored as I've gone along, so some of the earlier exercises might break/have code duplicated elsewhere 10 | - I used the online draft, so the numbering of sections, equations and exercises might not be consistent with the published version of the book. This is unfortunate, but I don't have time to correct it. 11 | - If there is a typo/mistake somewhere then let me know 12 | -------------------------------------------------------------------------------- /code/bandits/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 11/04/2018 18:29 6 | --------------------------------- 7 | 8 | """ 9 | from .actors import * 10 | from .agents import * 11 | from .estimators import * 12 | -------------------------------------------------------------------------------- /code/bandits/actors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 11/04/2018 18:24 6 | --------------------------------- 7 | 8 | """ 9 | import abc 10 | 11 | import numpy as np 12 | 13 | 14 | __all__ = ['EpsilonGreedyActor'] 15 | 16 | 17 | class BaseActor(object, metaclass=abc.ABCMeta): 18 | @abc.abstractmethod 19 | def action(self, optimal_actions): 20 | pass 21 | 22 | 23 | # is it worth maintaining more than one random state? one for actions and one for exploration? 24 | class EpsilonGreedyActor(BaseActor): 25 | def __init__(self, n_actions, epsilon=0.01, random_state=None): 26 | self.n_actions = n_actions 27 | self.epsilon = epsilon 28 | self.random_state = random_state or np.random.RandomState(seed=0) 29 | 30 | self.possible_actions = np.arange(self.n_actions) 31 | self.explore = None 32 | 33 | def update(self): 34 | self.explore = self.random_state.binomial(n=1, p=self.epsilon) 35 | return None 36 | 37 | def explorative_action(self): 38 | return self.random_state.choice(self.possible_actions, 1)[0] 39 | 40 | def exploitative_action(self, optimal_actions): 41 | if len(optimal_actions) == 1: 42 | return optimal_actions[0] 43 | else: 44 | return self.random_state.choice(optimal_actions, 1)[0] 45 | 46 | def action(self, optimal_actions): 47 | self.update() 48 | if self.explore: 49 | return self.explorative_action() 50 | else: 51 | return self.exploitative_action(optimal_actions) 52 | 53 | -------------------------------------------------------------------------------- /code/bandits/agents.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 11/04/2018 18:27 6 | --------------------------------- 7 | 8 | """ 9 | import numpy as np 10 | 11 | __all__ = ['ActionValueBanditAgent'] 12 | 13 | 14 | class ActionValueBanditAgent(object): 15 | def __init__(self, estimators, actor): 16 | """ 17 | Agent for bandit problems. 18 | 19 | Args: 20 | estimators np.array of BaseEstimators: The value estimators. 21 | actor BaseActor: The thing to choose the actions. 22 | possible_actions np.array: The actions you can take. 23 | 24 | The order of the arguments is essential. Everything is done on the indices. 
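        In other words, estimator i holds the value estimate for action i:
        actions are just integer indices into `estimators`, and `update(action, reward)`
        routes the reward to the matching estimator.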
25 | """ 26 | self.estimators = estimators 27 | self.actor = actor 28 | 29 | def was_exploring(self): 30 | return self.actor.explore 31 | 32 | def update(self, action, reward): 33 | self.estimators[action].update(reward) 34 | return None 35 | 36 | def get_estimates(self): 37 | return np.array([x.value for x in self.estimators]) 38 | 39 | def get_optimal_actions(self): 40 | values = self.get_estimates() 41 | return np.where(values == max(values))[0] 42 | 43 | def action(self): 44 | optimal_actions = self.get_optimal_actions() 45 | return self.actor.action(optimal_actions) 46 | -------------------------------------------------------------------------------- /code/bandits/estimators.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 11/04/2018 18:24 6 | --------------------------------- 7 | 8 | """ 9 | import abc 10 | 11 | 12 | __all__ = ['SampleAverageEstimator', 'ExponentialRecencyWeightedEstimator'] 13 | 14 | 15 | class BaseEstimator(object, metaclass=abc.ABCMeta): 16 | @abc.abstractmethod 17 | def update(self, reward): 18 | return None 19 | 20 | 21 | class SampleAverageEstimator(BaseEstimator): 22 | def __init__(self, default_value): 23 | self.initial_value = default_value 24 | self.value = default_value 25 | 26 | self.n_updates = 0 27 | 28 | def update(self, reward): 29 | if self.n_updates == 0: 30 | self.value = reward 31 | else: 32 | self.value += (reward - self.value) / self.n_updates 33 | 34 | self.n_updates += 1 35 | return None 36 | 37 | 38 | class ExponentialRecencyWeightedEstimator(BaseEstimator): 39 | def __init__(self, step_size, initial_value): 40 | self.step_size = step_size 41 | self.value = initial_value 42 | 43 | self.n_updates = 0 44 | 45 | def update(self, reward): 46 | self.value += self.step_size * (reward - self.value) 47 | self.n_updates += 1 48 | return None 49 | 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /code/bandits/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 24/05/2018 11:08 6 | --------------------------------- 7 | 8 | """ 9 | from types import SimpleNamespace 10 | 11 | import numpy as np 12 | 13 | 14 | class RandomWalkingValueSampler(object): 15 | def __init__(self, n_steps, n_bandits, loc=0., scale=0.01, random_state=None): 16 | self.n_steps = n_steps 17 | self.n_bandits = n_bandits 18 | self.loc = loc 19 | self.scale = scale 20 | self.random_state = random_state or np.random.RandomState(seed=0) 21 | 22 | def get_innovations_starting_with_zero(self): 23 | innovations = self.random_state.normal( 24 | loc=self.loc, 25 | scale=self.scale, 26 | size=(self.n_steps, self.n_bandits) 27 | ) 28 | innovations[0, :] = 0 29 | return innovations 30 | 31 | def sample(self, initial_values): 32 | return np.atleast_2d(initial_values) + np.cumsum(self.get_innovations_starting_with_zero(), axis=0) 33 | 34 | __call__ = sample 35 | 36 | 37 | def run_single(agent, samples): 38 | choices = list() 39 | explore = list() 40 | for row in samples: 41 | choice = agent.action() 42 | explore.append(agent.was_exploring()) 43 | choices.append(choice) 44 | reward = row[choice] 45 | agent.update(choice, reward) 46 | return SimpleNamespace( 47 | choices=np.array(choices), 48 | explore=np.array(explore), 49 | optimal=np.argmax(samples, axis=1) 50 | ) 51 | 
-------------------------------------------------------------------------------- /code/constants.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 11/04/2018 18:02 6 | --------------------------------- 7 | 8 | """ 9 | import os 10 | 11 | 12 | class Paths(object): 13 | data = os.path.abspath( 14 | os.path.join( 15 | os.path.dirname(os.path.realpath(__file__)), 16 | os.pardir, 17 | 'data' 18 | ) 19 | ) 20 | 21 | output = os.path.join(data, 'exercise_output') 22 | input = os.path.join(data, 'exercise_input') 23 | -------------------------------------------------------------------------------- /code/exercises/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 11/04/2018 15:31 6 | --------------------------------- 7 | 8 | """ 9 | -------------------------------------------------------------------------------- /code/exercises/ex_11_3/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 24/07/2018 11:46 6 | --------------------------------- 7 | 8 | """ 9 | -------------------------------------------------------------------------------- /code/exercises/ex_11_3/q_learning_bairds_counter_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 24/07/2018 11:47 6 | --------------------------------- 7 | 8 | """ 9 | import os 10 | 11 | import numpy as np 12 | import matplotlib; matplotlib.use("TkAgg") 13 | import matplotlib.pyplot as plt 14 | 15 | import plotting 16 | import constants as c 17 | 18 | 19 | def q(state, w): 20 | if state < 6: 21 | return 2 * w[state] + w[-1] 22 | else: 23 | return w[state] + 2 * w[-1] 24 | 25 | 26 | def feature(state): 27 | out = np.zeros(8) 28 | if state < 6: 29 | out[state] = 2 30 | out[-1] = 1 31 | return out 32 | else: 33 | out[state] = 1 34 | out[-1] = 2 35 | return out 36 | 37 | 38 | if __name__ == "__main__": 39 | n_steps = 250 40 | random_state = np.random.RandomState(seed=0) 41 | 42 | states = np.arange(7) 43 | weights = np.array([1, 1, 1, 1, 1, 1, 10, 1]) 44 | alpha = 0.01 45 | gamma = 0.99 46 | 47 | weights_list = [weights] 48 | for i in range(n_steps): 49 | s = random_state.choice(states) 50 | weights = weights + 7 * alpha * (gamma * q(states[-1], weights) - q(s, weights)) * feature(s) 51 | weights_list.append(weights) 52 | 53 | output = np.c_[weights_list] 54 | 55 | with plt.rc_context(plotting.rc()): 56 | fig, ax = plt.subplots(1) 57 | lines = ax.plot(output) 58 | ax.legend(lines, [f"w{i+1}" for i in range(output.shape[1])]) 59 | ax.grid(alpha=0.1) 60 | ax.set_xlabel("Steps") 61 | ax.set_ylabel("Weight") 62 | ax.set_title("Q-learning on Baird's Counterexample") 63 | plt.tight_layout() 64 | 65 | plotting.savefig( 66 | fig, 67 | path=os.path.join( 68 | c.Paths.output, 69 | "ex_11_3", 70 | "bairds_counter_example_q_learning.png" 71 | ) 72 | ) 73 | 74 | 75 | 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /code/exercises/ex_2_11/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | 
-------------------------------- 4 | project: code 5 | created: 11/04/2018 17:46 6 | --------------------------------- 7 | 8 | """ 9 | -------------------------------------------------------------------------------- /code/exercises/ex_2_11/analysis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 13/04/2018 14:43 6 | --------------------------------- 7 | 8 | """ 9 | import os 10 | 11 | import matplotlib; matplotlib.use('svg') 12 | import matplotlib.pyplot as plt 13 | import pandas as pd 14 | 15 | import constants as c 16 | import plotting 17 | 18 | if __name__ == '__main__': 19 | samples = pd.read_pickle( 20 | os.path.join( 21 | c.Paths.output, 22 | 'ex_2_11', 23 | 'samples.pkl' 24 | ) 25 | ) 26 | 27 | results = pd.read_pickle( 28 | os.path.join( 29 | c.Paths.output, 30 | 'ex_2_11', 31 | 'results.pkl' 32 | ) 33 | ) 34 | 35 | with plt.rc_context(plotting.rc()): 36 | fig, ax = plt.subplots(1) 37 | samples.plot(ax=ax) 38 | ax.legend( 39 | title='Actions', 40 | bbox_to_anchor=(1, 1), 41 | loc='upper left' 42 | ) 43 | ax.grid(alpha=0.25) 44 | ax.set_xlabel('$t$') 45 | ax.set_ylabel('Action Values') 46 | ax.set_title('True Action Values on 10-Armed Bandit') 47 | # plt.tight_layout() 48 | fig.savefig( 49 | os.path.join( 50 | c.Paths.output, 51 | 'ex_2_11', 52 | 'action_values.png' 53 | ), 54 | bbox_inches='tight' 55 | ) 56 | 57 | fig, ax = plt.subplots(1) 58 | results.plot(ax=ax) 59 | ax.set_xscale('log', basex=2) 60 | ax.set_xlabel(r'$\varepsilon$') 61 | ax.set_ylabel('Proportion Optimal Choice') 62 | ax.set_title(r'Parameter Study of $\varepsilon$-greedy Action Value Agent on 10-Armed Test Bed') 63 | ax.grid(alpha=0.25) 64 | # plt.tight_layout() 65 | fig.savefig( 66 | os.path.join( 67 | c.Paths.output, 68 | 'ex_2_11', 69 | 'parameter_study.png' 70 | ), 71 | bbox_inches='tight' 72 | ) 73 | 74 | -------------------------------------------------------------------------------- /code/exercises/ex_2_11/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 13/04/2018 14:01 6 | --------------------------------- 7 | 8 | """ 9 | from concurrent.futures import ProcessPoolExecutor 10 | import os 11 | 12 | import matplotlib; matplotlib.use('TKAgg') 13 | import numpy as np 14 | import pandas as pd 15 | 16 | from bandits import ( 17 | ExponentialRecencyWeightedEstimator, 18 | SampleAverageEstimator, 19 | EpsilonGreedyActor, 20 | ActionValueBanditAgent, 21 | utils 22 | ) 23 | import constants as c 24 | 25 | 26 | N_STEPS = 200000 27 | N_BANDITS = 10 28 | PARAMS = 2. ** np.arange(-7, -1) 29 | INITIAL_VALUE = 0. 
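# PARAMS above sweeps epsilon over 2^-7, ..., 2^-2 for the parameter study;
# STEP_SIZE below is the constant step size used by the exponential
# recency-weighted agent.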
30 | STEP_SIZE = 0.1 31 | 32 | 33 | def process_outputs(output): 34 | grades = output.choices == output.optimal 35 | _, second_half = np.array_split(grades, 2) 36 | return np.mean(second_half) 37 | 38 | 39 | def evaluate_single_agent(agent, samples): 40 | return process_outputs( 41 | utils.run_single( 42 | agent, 43 | samples 44 | ) 45 | ) 46 | 47 | 48 | if __name__ == "__main__": 49 | sampler = utils.RandomWalkingValueSampler( 50 | n_steps=N_STEPS, 51 | n_bandits=N_BANDITS, 52 | loc=0., 53 | scale=0.01, 54 | random_state=np.random.RandomState(seed=0) 55 | ) 56 | 57 | samples = sampler.sample(initial_values=np.zeros(N_BANDITS)) 58 | 59 | sample_average_outputs = dict() 60 | constant_step_outputs = dict() 61 | with ProcessPoolExecutor(4) as executor: 62 | for param in PARAMS: 63 | print('Sumbitting', param) 64 | 65 | sample_average_agent = ActionValueBanditAgent( 66 | estimators=[ 67 | SampleAverageEstimator(INITIAL_VALUE) 68 | for _ in range(N_BANDITS) 69 | ], 70 | actor=EpsilonGreedyActor( 71 | epsilon=param, 72 | n_actions=N_BANDITS, 73 | random_state=np.random.RandomState(seed=1) 74 | ) 75 | ) 76 | 77 | sample_average_outputs[param] = executor.submit(evaluate_single_agent, sample_average_agent, samples) 78 | 79 | constant_step_agent = ActionValueBanditAgent( 80 | estimators=[ 81 | ExponentialRecencyWeightedEstimator(step_size=STEP_SIZE, initial_value=INITIAL_VALUE) 82 | for _ in range(N_BANDITS) 83 | ], 84 | actor=EpsilonGreedyActor( 85 | epsilon=param, 86 | n_actions=N_BANDITS, 87 | random_state=np.random.RandomState(seed=1) 88 | ) 89 | ) 90 | 91 | constant_step_outputs[param] = executor.submit(evaluate_single_agent, constant_step_agent, samples) 92 | 93 | print('Waiting on results') 94 | sample_average_outputs = {k: v.result() for k, v in sample_average_outputs.items()} 95 | constant_step_outputs = {k: v.result() for k, v in constant_step_outputs.items()} 96 | 97 | results = pd.concat( 98 | [ 99 | pd.Series(sample_average_outputs, name='Sample Average'), 100 | pd.Series(constant_step_outputs, name='Constant Step') 101 | ], 102 | axis=1 103 | ) 104 | 105 | pd.DataFrame(samples).to_pickle( 106 | os.path.join( 107 | c.Paths.output, 108 | 'ex_2_11', 109 | 'samples.pkl' 110 | ) 111 | ) 112 | 113 | results.to_pickle( 114 | os.path.join( 115 | c.Paths.output, 116 | 'ex_2_11', 117 | 'results.pkl' 118 | ) 119 | ) 120 | -------------------------------------------------------------------------------- /code/exercises/ex_2_5/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 11/04/2018 17:46 6 | --------------------------------- 7 | 8 | """ 9 | -------------------------------------------------------------------------------- /code/exercises/ex_2_5/analysis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 11/04/2018 18:15 6 | --------------------------------- 7 | 8 | """ 9 | import os 10 | 11 | import matplotlib; matplotlib.use('TkAgg') 12 | import matplotlib.pyplot as plt 13 | import pandas as pd 14 | 15 | import constants as c 16 | import plotting 17 | 18 | 19 | # Make the charts asked for in the thing 20 | # also make some charts of how the values converge as the real ones move 21 | # but for this you'll need the samples! 
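# (This script plots the cumulative proportion of optimal choices for the
# sample-average and exponential-recency-weighted agents produced by run.py.)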
22 | 23 | 24 | def load_file(name): 25 | return pd.read_pickle( 26 | os.path.join(c.Paths.output, 'ex_2_5', name), 27 | ).rename(columns=int) 28 | 29 | 30 | if __name__ == '__main__': 31 | epsilon = 0.1 32 | estimator_type = 'ExponentialRecencyWeightedEstimator'.lower() 33 | 34 | all_exponential_choices = load_file( 35 | r'choices_{}_eps{}.pkl'.format( 36 | 'ExponentialRecencyWeightedEstimator'.lower(), 37 | epsilon 38 | ) 39 | ) 40 | 41 | all_average_choices = load_file( 42 | r'choices_{}_eps{}.pkl'.format('sampleaverageestimator', epsilon) 43 | ) 44 | 45 | all_optimal = load_file(r'optimal.pkl') 46 | 47 | perc_average_optimal = all_average_choices.eq(all_optimal).expanding().mean() 48 | perc_exponential_optimal = all_exponential_choices.eq(all_optimal).expanding().mean() 49 | 50 | with plt.rc_context(plotting.rc()): 51 | fig, ax = plt.subplots(1) 52 | ax.plot(perc_average_optimal.mean(1), label='Sample Average Method') 53 | ax.plot(perc_exponential_optimal.mean(1), label='Exponential Recency Weighted Method') 54 | print('ready') 55 | 56 | ax.grid(alpha=0.25) 57 | ax.legend(loc='lower right') 58 | ax.set_title('Comparison of Estimation Methods on 10-Bandit Test Bed') 59 | ax.set_xlabel(r'Number of Iterations') 60 | ax.set_ylabel(r'% Optimal Choices (Cumulative)') 61 | plt.tight_layout() 62 | fig.savefig( 63 | os.path.join( 64 | c.Paths.output, 65 | 'ex_2_5', 66 | 'learning_curve.png' 67 | ) 68 | ) 69 | -------------------------------------------------------------------------------- /code/exercises/ex_2_5/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 10/04/2018 17:39 6 | --------------------------------- 7 | 8 | """ 9 | import os 10 | 11 | import numpy as np 12 | import pandas as pd 13 | 14 | import constants as c 15 | from bandits import SampleAverageEstimator, ExponentialRecencyWeightedEstimator, ActionValueBanditAgent, EpsilonGreedyActor 16 | from bandits import utils 17 | 18 | from concurrent.futures import ProcessPoolExecutor 19 | 20 | # apparently these are thread safe so we are okay 21 | AGENT_RANDOM_STATE = np.random.RandomState(seed=1) 22 | 23 | N_STEPS = int(1e5) 24 | N_BANDITS = 10 25 | N_ITERS = 200 26 | EPSILON = 0.1 27 | ALPHA = 0.1 28 | INITIAL_VALUE = 0. 
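# 200 independent runs (N_ITERS) of 1e5 steps each on a 10-armed nonstationary
# test bed: the true action values follow independent random walks, sampled by
# RandomWalkingValueSampler below.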
29 | 30 | 31 | def save_frame(frame, filename): 32 | return frame.to_pickle( 33 | os.path.join( 34 | c.Paths.output, 35 | 'ex_2_5', 36 | filename 37 | ) 38 | ) 39 | 40 | 41 | def new_agent(): 42 | return ActionValueBanditAgent( 43 | estimators=[ 44 | SampleAverageEstimator(INITIAL_VALUE) 45 | # ExponentialRecencyWeightedEstimator( 46 | # alpha=ALPHA, 47 | # initial_value=INITIAL_VALUE 48 | # ) 49 | for _ in range(N_BANDITS) 50 | ], 51 | actor=EpsilonGreedyActor( 52 | epsilon=EPSILON, 53 | n_actions=N_BANDITS, 54 | random_state=AGENT_RANDOM_STATE 55 | ) 56 | ) 57 | 58 | 59 | if __name__ == '__main__': 60 | EstimatorType = SampleAverageEstimator 61 | 62 | sampler = utils.RandomWalkingValueSampler( 63 | n_steps=N_STEPS, 64 | n_bandits=N_BANDITS, 65 | loc=0., 66 | scale=0.01, 67 | random_state=np.random.RandomState(seed=0) 68 | ) 69 | 70 | all_choices = list() 71 | all_explore = list() 72 | all_optimal = list() 73 | results = list() 74 | 75 | with ProcessPoolExecutor(4) as executor: 76 | for i in range(N_ITERS): 77 | print('Submitting', i) 78 | 79 | agent = new_agent() 80 | samples = sampler.sample(initial_values=np.zeros(N_BANDITS)) 81 | results.append(executor.submit(utils.run_single, agent, samples)) 82 | 83 | print('waiting on results') 84 | for future in results: 85 | output = future.result() 86 | all_choices.append(output.choices) 87 | all_explore.append(output.explore) 88 | all_optimal.append(output.optimal) 89 | 90 | all_choices = pd.DataFrame(np.c_[all_choices].T) 91 | all_explore = pd.DataFrame(np.c_[all_explore].T) 92 | all_optimal = pd.DataFrame(np.c_[all_optimal].T) 93 | 94 | save_frame( 95 | all_choices, 96 | r'choices_{}_eps{}.pkl'.format( 97 | EstimatorType.__name__.lower(), 98 | EPSILON 99 | ) 100 | ) 101 | 102 | save_frame( 103 | all_explore, 104 | r'explore_{}_eps{}.pkl'.format( 105 | EstimatorType.__name__.lower(), 106 | EPSILON 107 | ) 108 | ) 109 | 110 | save_frame(all_optimal, r'optimal.pkl') 111 | -------------------------------------------------------------------------------- /code/exercises/ex_4_7/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 16/05/2018 16:19 6 | --------------------------------- 7 | 8 | """ 9 | import os 10 | 11 | import constants as c 12 | 13 | output_folder = os.path.join(c.Paths.output, 'ex_4_7') 14 | -------------------------------------------------------------------------------- /code/exercises/ex_4_7/analysis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 24/05/2018 20:50 6 | --------------------------------- 7 | 8 | """ 9 | from itertools import product 10 | import os 11 | 12 | import matplotlib; matplotlib.use('TkAgg') 13 | import matplotlib.pyplot as plt 14 | import numpy as np 15 | 16 | from plotting import rc 17 | from exercises.utils import read_pickle 18 | from exercises.ex_4_7 import output_folder 19 | 20 | if __name__ == "__main__": 21 | plt.rcParams.update(rc()) 22 | plt.rcParams.update({'figure.figsize': (15, 8)}) 23 | policy = read_pickle(os.path.join(output_folder, 'policy.pkl')) 24 | values = read_pickle(os.path.join(output_folder, 'values.pkl')) 25 | 26 | max_cars = values.shape[0] 27 | 28 | fig = plt.figure() 29 | ax = fig.add_subplot(121) 30 | lim = np.max(np.abs(policy)) 31 | ax.matshow(policy.T, cmap=plt.cm.bwr, vmin=-lim, vmax=lim) 32 | 
ax.set_xticks(range(max_cars)) 33 | ax.set_yticks(range(max_cars)) 34 | ax.xaxis.set_ticks_position('none') 35 | ax.yaxis.set_ticks_position('none') 36 | ax.set_xlabel("Cars at location x") 37 | ax.set_ylabel("Cars at location y") 38 | ax.set_xticks([x - 0.5 for x in range(1, max_cars)], minor=True) 39 | ax.set_yticks([y - 0.5 for y in range(1, max_cars)], minor=True) 40 | for x, y in product(range(max_cars), range(max_cars)): 41 | ax.text(x=x, y=y, s=int(policy[x, y]), va='center', ha='center', fontsize=8) 42 | ax.set_title(r'$\pi_*$', fontsize=20) 43 | 44 | x, y = zip(*product(range(max_cars), repeat=2)) 45 | surface = [values[i, j] for i, j in zip(x, y)] 46 | ax = fig.add_subplot(122, projection='3d') 47 | ax.scatter3D(x, y, surface) 48 | ax.set_xlim3d(0, max_cars) 49 | ax.set_ylim3d(0, max_cars) 50 | ax.set_xlabel("Cars at location x") 51 | ax.set_ylabel("Cars at location y") 52 | ax.set_title('$v_*$', fontsize=20) 53 | 54 | plt.savefig( 55 | os.path.join(output_folder, 'altered_car_rental.png'), 56 | bbox_inches='tight' 57 | ) 58 | # plt.show() 59 | -------------------------------------------------------------------------------- /code/exercises/ex_4_7/jacks_car_rental/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 24/05/2018 20:51 6 | --------------------------------- 7 | 8 | """ 9 | import os 10 | 11 | import constants as c 12 | 13 | folder = os.path.join( 14 | c.Paths.data, 15 | "exercise_output", 16 | "ex_4_7", 17 | "jacks_car_rental" 18 | ) 19 | -------------------------------------------------------------------------------- /code/exercises/ex_4_7/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 25/05/2018 23:12 6 | --------------------------------- 7 | 8 | """ 9 | 10 | from exercises.ex_4_7.jacks_car_rental.model import ( 11 | Environment, 12 | Distributions, 13 | policy_iteration, 14 | initial_policy_values 15 | ) 16 | 17 | CARS_MOVED_FREE = 1 18 | STORAGE_COST = 4 19 | STORAGE_THRESHOLD = 10 20 | 21 | 22 | class AlteredEnvironment(Environment): 23 | def expected_reward(self, state, action): 24 | x, y = state 25 | rental_reward = self.rental_reward * ( 26 | self.expected_rentals(x - action, self.distributions.x_rental) 27 | + self.expected_rentals(y + action, self.distributions.y_rental) 28 | ) 29 | 30 | overnight_charge = STORAGE_COST if x - action > STORAGE_THRESHOLD or y + action > STORAGE_THRESHOLD else 0. 
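        # Exercise 4.7 modifications: the first car moved overnight is free
        # (CARS_MOVED_FREE), and a flat STORAGE_COST is charged whenever either
        # location keeps more than STORAGE_THRESHOLD cars overnight.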
31 | 32 | return rental_reward - self.movement_cost * max(abs(action) - CARS_MOVED_FREE, 0) - overnight_charge 33 | 34 | 35 | if __name__ == "__main__": 36 | import os 37 | import pickle 38 | import time 39 | 40 | from exercises.ex_4_7 import output_folder 41 | 42 | MAX_CARS = 20 43 | MAX_CAR_MOVES = 5 44 | DISCOUNT_FACTOR = 0.9 45 | THRESHOLD = 1e-2 46 | MAXITER = 10**4 47 | 48 | RENTAL_REWARD = 10 49 | MOVEMENT_COST = 2 50 | 51 | print("Building environment") 52 | start = time.time() 53 | environment = AlteredEnvironment( 54 | distributions=Distributions(), 55 | max_cars=MAX_CARS, 56 | max_car_moves=MAX_CAR_MOVES, 57 | rental_reward=RENTAL_REWARD, 58 | movement_cost=MOVEMENT_COST, 59 | populate_dynamics=True 60 | ) 61 | 62 | print("Building environment took {:.2f}".format(time.time() - start)) 63 | 64 | # with open(os.path.join(output_folder, f'environment_max_cars_{MAX_CARS}_max_moves_{MAX_CAR_MOVES}.pkl'), 'wb') as f: 65 | # pickle.dump(environment, f) 66 | 67 | initial_policy, initial_values = initial_policy_values(environment) 68 | 69 | policy, values = policy_iteration( 70 | initial_policy, 71 | initial_values, 72 | environment, 73 | maxiter=MAXITER, 74 | tolerance=THRESHOLD, 75 | discount_factor=DISCOUNT_FACTOR 76 | ) 77 | 78 | print(policy) 79 | print(values) 80 | 81 | with open(os.path.join(output_folder, 'policy.pkl'), 'wb') as f: 82 | pickle.dump(policy, f) 83 | 84 | with open(os.path.join(output_folder, 'values.pkl'), 'wb') as f: 85 | pickle.dump(values, f) 86 | -------------------------------------------------------------------------------- /code/exercises/ex_4_9/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 31/05/2018 16:52 6 | --------------------------------- 7 | 8 | """ 9 | -------------------------------------------------------------------------------- /code/exercises/ex_4_9/gamblers_problem.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 31/05/2018 16:52 6 | --------------------------------- 7 | 8 | """ 9 | import numpy as np 10 | import matplotlib; 11 | 12 | matplotlib.use('TkAgg') 13 | import matplotlib.pyplot as plt 14 | 15 | 16 | # Made the greedy policy not able to select 0 as an action to get a non-trivial policy 17 | 18 | 19 | class Environment: 20 | def __init__(self, p_win, winning_capital=100): 21 | self.p_win = p_win 22 | self.p_lose = 1 - p_win 23 | self.winning_capital = winning_capital 24 | self.possible_states = range(1, winning_capital) 25 | 26 | def possible_actions(self, state): 27 | return range(min(state, self.winning_capital - state + 1)) 28 | 29 | def single_value_update(self, action, state, values): 30 | out = self.p_win * values[state + action] + self.p_lose * values[state - action] 31 | # if state + action >= self.winning_capital: 32 | # out += self.p_win 33 | return out 34 | 35 | 36 | def value_update(state, values, environment): 37 | return np.max( 38 | [environment.single_value_update(a, state, values) for a in environment.possible_actions(state)] 39 | ) 40 | 41 | 42 | def value_iteration(values, environment, tolerance, verbose=False): 43 | delta = tolerance 44 | sweeps = 0 45 | vals = [values] 46 | while delta >= tolerance: 47 | old_values = values 48 | values = [value_update(s, old_values, environment) for s in environment.possible_states] 49 | 
values.append(1) 50 | values.insert(0, 0) 51 | values = np.array(values) 52 | delta = np.max(np.abs(old_values - values)) 53 | sweeps += 1 54 | vals.append(values) 55 | if verbose: 56 | print(f"End of sweep {sweeps} delta = {delta}") 57 | return vals, greedy_policy(values, environment) 58 | 59 | 60 | def action_values(state, values, environment): 61 | return [environment.single_value_update(a, state, values) for a in environment.possible_actions(state)] 62 | 63 | 64 | def greedy_action(state, values, environment): 65 | avs = action_values(state, values, environment) 66 | avs[0] = 0 67 | return np.argmax(avs) 68 | 69 | 70 | def greedy_policy(values, environment): 71 | return np.array([ 72 | greedy_action(state, values, environment) for state in environment.possible_states 73 | ]) 74 | 75 | 76 | def initial_values(environment): 77 | values = np.zeros(len(environment.possible_states) + 2) 78 | values[0] = 0 79 | values[-1] = 1 80 | return values 81 | 82 | 83 | if __name__ == "__main__": 84 | import os 85 | 86 | import constants as c 87 | import plotting 88 | 89 | 90 | def plot(values_list, policy, i, p_win, name=None, legend=True): 91 | with plt.rc_context(plotting.rc()): 92 | fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True) 93 | for sweep, v in enumerate(values_list, start=1): 94 | ax1.plot(v, label=sweep, lw=0.5) 95 | 96 | for ax in ax1, ax2: 97 | ax.grid(alpha=0.1, ls=':') 98 | 99 | if legend: 100 | ax1.legend(title="Sweep", bbox_to_anchor=(1, 1)) 101 | ax1.set_title( 102 | fr"Optimal Values: $\theta=10^{{{-i}}}$", 103 | x=0.05, 104 | y=0.95, 105 | ha='left', 106 | va='top', 107 | fontsize=10 108 | ) 109 | ax2.plot(policy) 110 | ax2.set_title( 111 | fr"Optimal Policy: $\theta=10^{{{-i}}}$", 112 | x=0.05, 113 | y=0.95, 114 | ha='left', 115 | va='top', 116 | fontsize=10 117 | ) 118 | 119 | plt.suptitle(fr"$\mathbb{{P}}(\mathtt{{win}})={p_win}$") 120 | if name is not None: 121 | plt.savefig( 122 | os.path.join( 123 | c.Paths.output, 124 | 'ex_4_9', 125 | name + '.eps' 126 | ), 127 | format='eps', 128 | dpi=1000, 129 | bbox_inches='tight' 130 | ) 131 | 132 | return fig, (ax1, ax2) 133 | 134 | 135 | i = 3 136 | 137 | for p_win in [0.25, 0.55]: 138 | environment = Environment( 139 | p_win=p_win, 140 | winning_capital=100 141 | ) 142 | 143 | values, policy = value_iteration( 144 | values=initial_values(environment), 145 | environment=environment, 146 | tolerance=10 ** -i, 147 | verbose=True 148 | ) 149 | 150 | plot( 151 | values, 152 | policy, 153 | name='values_and_policy_pwin_{:2.0f}'.format(100 * p_win), 154 | i=i, 155 | p_win=p_win, 156 | legend=p_win < 0.5 157 | ) 158 | 159 | # plt.show() 160 | -------------------------------------------------------------------------------- /code/exercises/ex_5_10/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 06/06/2018 15:59 6 | --------------------------------- 7 | 8 | """ 9 | -------------------------------------------------------------------------------- /code/exercises/ex_5_10/analyse.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 12/06/2018 13:41 6 | --------------------------------- 7 | 8 | """ 9 | import pickle 10 | import os 11 | 12 | from matplotlib import colors, use; use("TkAgg") 13 | import matplotlib.pyplot as plt 14 | import numpy as np 15 | 16 | import 
constants as c 17 | import plotting 18 | 19 | from exercises.ex_5_10.model import RaceTrack 20 | from exercises.ex_5_10.utils import load_track 21 | 22 | 23 | def plot_learning_curve(returns, epsilons, track_name): 24 | fig, ax = plt.subplots(1) 25 | ax.plot(-np.array(returns), label="$-G$") 26 | ax.set_yscale('log') 27 | ax.set_ylabel(r"$-1 \times$ Returns (log scale)") 28 | ax.set_xlabel("Episode") 29 | 30 | ax1 = ax.twinx() 31 | ax1.plot(epsilons, label="$\epsilon$ (right)", color='C1') 32 | 33 | ax.set_title(f"Learning Curve: {track_name}") 34 | 35 | plotting.multi_ax_legend(ax, ax1) 36 | ax1.grid(alpha=0.1, which="both", ls=':') 37 | return fig, (ax, ax1) 38 | 39 | 40 | def grid(racetrack, episode, format_dict): 41 | def replace(row, dct): 42 | return [dct[r] if r in dct else int(r) for r in row] 43 | 44 | s = racetrack.episode_string(episode) 45 | 46 | rows = [ 47 | replace([i for i in k.split(racetrack.format_dict['sep']) if i], format_dict) 48 | for k in s.split('\n') 49 | ] 50 | return np.array(list(filter(bool, rows))) 51 | 52 | 53 | def plot_trajectory(ax, racetrack, episode): 54 | racetrack.format_dict['track'] = "aa" # because track and sep are the same 55 | 56 | format_dict = { 57 | racetrack.format_dict['track']: -1, 58 | racetrack.format_dict['start']: -2, 59 | racetrack.format_dict['finish']: -3, 60 | racetrack.format_dict['border']: -4 61 | } 62 | 63 | e_grid = grid(episode=episode, racetrack=racetrack, format_dict=format_dict) 64 | 65 | cs = ['gray', 'C2', 'C3', 'white'] 66 | bounds = list(range(-4, 0)) 67 | for i in range(len(episode.states)): 68 | cs.append('C0') 69 | bounds.append(i) 70 | 71 | cmap = colors.ListedColormap(cs) 72 | norm = colors.BoundaryNorm(bounds, cmap.N) 73 | 74 | ax.imshow(e_grid, cmap=cmap, norm=norm) 75 | 76 | def make_ticks(rng): 77 | return [k - 0.5 for k in rng] 78 | 79 | ax.set_xticks(make_ticks(list(range(e_grid.shape[1]))[::-1])) 80 | ax.set_yticks(make_ticks(range(e_grid.shape[0]))) 81 | 82 | ax.grid(alpha=0.05, which='both', lw=0.1) 83 | 84 | ax.set_xticklabels([]) 85 | ax.set_yticklabels([]) 86 | ax.tick_params(length=0, width=0) 87 | return None 88 | 89 | 90 | if __name__ == "__main__": 91 | TRACK_NAME = "track_2" 92 | 93 | with open(os.path.join(c.Paths.output, 'ex_5_10', f'{TRACK_NAME}.pkl'), 'rb') as f: 94 | info = pickle.load(f) 95 | 96 | fig, axarr = plot_learning_curve( 97 | info['returns'], 98 | info['training_epsilons'], 99 | track_name=TRACK_NAME.replace('_', ' ').title() 100 | ) 101 | 102 | fig.savefig( 103 | os.path.join( 104 | c.Paths.output, 105 | "ex_5_10", 106 | f"{TRACK_NAME}_learning_curve.eps" 107 | ), 108 | format='eps', 109 | dpi=1000 110 | ) 111 | 112 | track_points = load_track( 113 | os.path.join( 114 | c.Paths.input, 115 | 'ex_5_10', 116 | f"{TRACK_NAME}.csv" 117 | ) 118 | ) 119 | 120 | racetrack = RaceTrack( 121 | **track_points 122 | ) 123 | 124 | # with plt.rc_context(plotting.rc()): 125 | # fig, axarr = plt.subplots(2, 3) 126 | # for (p, e), ax in zip(info['greedy_episodes'].items(), axarr.flatten()): 127 | # plot_trajectory(ax, racetrack, e) 128 | # ax.set_title(f"Start: {p}. 
Return {sum(e.rewards)}", fontsize=10) 129 | # fig.suptitle(f"{TRACK_NAME.replace('_', ' ').title()} Trajectories", fontsize=16) 130 | # fig.savefig( 131 | # os.path.join( 132 | # c.Paths.output, 133 | # "ex_5_10", 134 | # f"{TRACK_NAME}_trajectories.eps" 135 | # ), 136 | # format='eps', 137 | # dpi=1000 138 | # ) 139 | 140 | with plt.rc_context(plotting.rc()): 141 | fig, ax = plt.subplots(1) 142 | p = (0, 3) 143 | e = info['greedy_episodes'][p] 144 | plot_trajectory(ax, racetrack, e) 145 | ax.set_title(f"Start: {p}. Return {sum(e.rewards)}", fontsize=10) 146 | fig.suptitle(f"{TRACK_NAME.replace('_', ' ').title()} Sample Trajectory", fontsize=16) 147 | fig.savefig( 148 | os.path.join( 149 | c.Paths.output, 150 | "ex_5_10", 151 | f"{TRACK_NAME}_sample_trajectory.eps" 152 | ), 153 | format='eps', 154 | dpi=1000 155 | ) 156 | 157 | # plt.show() 158 | -------------------------------------------------------------------------------- /code/exercises/ex_5_10/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 11/06/2018 18:00 6 | --------------------------------- 7 | 8 | I added a crash penalty of -100 to deter the agent from 9 | running into walls to end up at favourable starting positions. 10 | 11 | Also, the algorithm doesn't seem to converge well with the noise on. 12 | 13 | """ 14 | import pickle 15 | import os 16 | 17 | import numpy as np 18 | 19 | import constants as c 20 | 21 | from exercises.ex_5_10.model import Brain, Car, RaceTrack, run_episode, train 22 | from exercises.ex_5_10.utils import load_track 23 | 24 | 25 | if __name__ == "__main__": 26 | 27 | TRACK_NAME = "track_2" 28 | eps_start = 0.1 29 | 30 | folder = os.path.join( 31 | c.Paths.input, 32 | 'ex_5_10' 33 | ) 34 | 35 | track_indices = load_track( 36 | path=os.path.join(folder, f"{TRACK_NAME}.csv"), 37 | track_flag=0, 38 | start_flag=2, 39 | finish_flag=3 40 | ) 41 | 42 | racetrack = RaceTrack( 43 | noise_level=None, #0.1, 44 | crash_penalty=-1000, 45 | **track_indices 46 | ) 47 | car = Car(None, 5, 1) 48 | 49 | brain = Brain( 50 | car, 51 | racetrack, 52 | epsilon=eps_start, 53 | random_state=np.random.RandomState(seed=123) 54 | ) 55 | 56 | print(racetrack) 57 | 58 | # initialise the policy with random runs 59 | brain.epsilon = 1. 
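    # With epsilon = 1 the behaviour policy is uniformly random, so the warm-up
    # loop below initialises the policy with random episodes; epsilon is reset
    # to eps_start before the main training loop.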
60 | for i in range(3): 61 | car.set_policy( 62 | brain.epsilon_greedy_policy() 63 | ) 64 | g = train(brain, car, racetrack) 65 | print("------------------------------------------------------") 66 | print(f"Finished random policy episode set {i}") 67 | print(f"Epsilon = {brain.epsilon}") 68 | print(f"Average Return: {g}") 69 | print("------------------------------------------------------") 70 | print("\n") 71 | 72 | brain.epsilon = eps_start 73 | returns = list() 74 | training_epsilons = list() 75 | n_runs = 20 76 | for i in range(n_runs): 77 | car.set_policy( 78 | brain.epsilon_greedy_policy() 79 | ) 80 | g = train(brain, car, racetrack) 81 | returns.append(g) 82 | training_epsilons.append(brain.epsilon) 83 | print("------------------------------------------------------") 84 | print(f"Finished episode set {i}") 85 | print(f"Epsilon = {brain.epsilon}") 86 | print(f"Average Return: {g}") 87 | print("------------------------------------------------------") 88 | print("\n") 89 | # brain.epsilon -= eps_start / n_runs 90 | 91 | greedy_episodes = dict() 92 | print("\n") 93 | racetrack.set_noise_level(None) 94 | car.set_policy(brain.greedy_policy()) 95 | for pos in racetrack.start_positions: 96 | greedy_episode = run_episode( 97 | car, 98 | racetrack, 99 | start_position=pos 100 | ) 101 | print(f"Greedy Episode: starting at {pos}") 102 | print(f"Return: {sum(greedy_episode.rewards)}") 103 | racetrack.print_episode(greedy_episode) 104 | greedy_episodes[pos] = greedy_episode 105 | 106 | info = dict( 107 | track_name=TRACK_NAME, 108 | returns=returns, 109 | training_epsilons=training_epsilons, 110 | greedy_episodes=greedy_episodes 111 | ) 112 | 113 | with open(os.path.join(c.Paths.output, 'ex_5_10', f'{TRACK_NAME}.pkl'), 'wb') as f: 114 | pickle.dump(info, f) 115 | 116 | -------------------------------------------------------------------------------- /code/exercises/ex_5_10/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 12/06/2018 13:38 6 | --------------------------------- 7 | 8 | """ 9 | 10 | import numpy as np 11 | 12 | 13 | def load_track(path, track_flag=0, start_flag=2, finish_flag=3, delimiter=',', dtype=int): 14 | arr = np.genfromtxt( 15 | path, 16 | delimiter=delimiter, 17 | dtype=dtype 18 | ) 19 | return parse_track( 20 | arr, 21 | track_flag=track_flag, 22 | start_flag=start_flag, 23 | finish_flag=finish_flag 24 | ) 25 | 26 | 27 | def parse_track(arr, track_flag, start_flag, finish_flag): 28 | track_positions = get_locs(arr, track_flag) 29 | start_positions = sorted(get_locs(arr, start_flag)) 30 | finish_positions = sorted(get_locs(arr, finish_flag)) 31 | 32 | track_positions.extend(start_positions) 33 | track_positions.extend(finish_positions) 34 | return dict( 35 | track_positions=sorted(track_positions), 36 | start_positions=start_positions, 37 | finish_positions=finish_positions, 38 | ) 39 | 40 | 41 | def get_locs(arr, value): 42 | x, y = np.where(arr == value) 43 | x = arr.shape[0] - 1 - x 44 | return list(zip(x, y)) -------------------------------------------------------------------------------- /code/exercises/ex_6_10/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 27/06/2018 12:53 6 | --------------------------------- 7 | 8 | """ 9 | 
-------------------------------------------------------------------------------- /code/exercises/ex_6_10/stochastic_windy_gridworld.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 27/06/2018 12:53 6 | --------------------------------- 7 | 8 | """ 9 | import os 10 | 11 | import matplotlib; matplotlib.use("TkAgg") 12 | import matplotlib.pyplot as plt 13 | import numpy as np 14 | 15 | import constants as c 16 | from generic.policies import EpsilonGreedyPolicy, GreedyPolicy 17 | 18 | from exercises.ex_6_9.windy_gridworld import WindyGridWorld, initial_action_values 19 | 20 | 21 | class StochasticWindyGridWorld(WindyGridWorld): 22 | def __init__(self, wind_grid, start_position, goal_position, random_state=None): 23 | super().__init__( 24 | wind_grid=wind_grid, 25 | start_position=start_position, 26 | goal_position=goal_position 27 | ) 28 | self.random_state = random_state or np.random.RandomState(seed=0) 29 | 30 | def get_wind(self, position): 31 | return super().get_wind(position) + self.random_state.choice(np.arange(3)) - 1 32 | 33 | 34 | def plot_learning_curve(episodes, ax, **kwargs): 35 | training_steps = 0 36 | y = [] 37 | x = [] 38 | for i, episode in enumerate(episodes): 39 | training_steps += len(episode.rewards) 40 | y.append(i) 41 | x.append(training_steps) 42 | return ax.plot(x, y, **kwargs) 43 | 44 | 45 | def learning_curve_chart(episodes): 46 | """make chart like in the book""" 47 | fig, ax = plt.subplots(1) 48 | plot_learning_curve(episodes, ax=ax) 49 | ax.grid(alpha=0.1) 50 | 51 | ax.set_ylabel("Episode") 52 | ax.set_xlabel("Total Training Steps") 53 | ax.set_title("Learning Curve", fontsize=14) 54 | return fig, ax 55 | 56 | 57 | if __name__ == "__main__": 58 | n_episodes = 5000 59 | alphas = np.linspace(0.5, 0., n_episodes) 60 | gamma = 1. 
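    # The step size alpha is annealed linearly from 0.5 to 0 over the 5000
    # training episodes; the action set below gives king's moves, with the
    # "stay still" action (0, 0) left commented out.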
61 | 62 | possible_actions = [ 63 | (-1, -1), 64 | (-1, 0), 65 | (-1, 1), 66 | (0, -1), 67 | # (0, 0), 68 | (0, 1), 69 | (1, -1), 70 | (1, 0), 71 | (1, 1) 72 | ] 73 | 74 | gridworld = StochasticWindyGridWorld( 75 | wind_grid=np.loadtxt( 76 | os.path.join( 77 | c.Paths.input, 78 | 'ex_6_9', 79 | 'gridworld.csv' 80 | ), 81 | dtype=int 82 | ), 83 | start_position=(3, 0), 84 | goal_position=(3, 7) 85 | ) 86 | 87 | av = initial_action_values(.5, gridworld, possible_actions) 88 | epsilon_greedy_policy = EpsilonGreedyPolicy( 89 | action_values=av, 90 | epsilon=0.1 91 | ) 92 | 93 | training_episodes = list() 94 | total_training_steps = 0 95 | for i, a in enumerate(alphas): 96 | e = gridworld.run_episode( 97 | epsilon_greedy_policy, 98 | alpha=a, 99 | gamma=gamma, 100 | train_policy=True 101 | ) 102 | training_episodes.append(e) 103 | 104 | steps = len(e.rewards) 105 | total_training_steps += steps 106 | print(f"Episode {i}: {steps} steps") 107 | print(f"Total Training Steps: {total_training_steps}") 108 | 109 | greedy_policy = GreedyPolicy(epsilon_greedy_policy.action_values) 110 | greedy_episode = gridworld.run_episode( 111 | greedy_policy, 112 | alpha=0., 113 | gamma=gamma, 114 | train_policy=False 115 | ) 116 | 117 | print("\n\n\n\n\n") 118 | print(f"Greedy Episode: {len(greedy_episode.rewards)} steps") 119 | gridworld.print_episode(greedy_episode) 120 | 121 | fig, ax = learning_curve_chart(training_episodes) 122 | 123 | fig.savefig( 124 | os.path.join( 125 | c.Paths.output, 126 | 'ex_6_10', 127 | 'learning_curve.eps' 128 | ), 129 | dpi=1000, 130 | format="eps" 131 | ) 132 | 133 | -------------------------------------------------------------------------------- /code/exercises/ex_6_9/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 26/06/2018 13:02 6 | --------------------------------- 7 | 8 | """ 9 | -------------------------------------------------------------------------------- /code/exercises/ex_6_9/windy_gridworld.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 26/06/2018 13:03 6 | --------------------------------- 7 | 8 | """ 9 | from collections import namedtuple 10 | import os 11 | 12 | import numpy as np 13 | 14 | import constants as c 15 | from generic.policies import EpsilonGreedyPolicy, GreedyPolicy 16 | from generic import updates 17 | 18 | Episode = namedtuple('Episode', ['states', 'actions', 'rewards']) 19 | 20 | 21 | def initial_action_values(v, gridworld, possible_actions): 22 | return { 23 | s: {a: v if s != gridworld.goal_position else 0. 
for a in possible_actions} for s in gridworld.possible_states() 24 | } 25 | 26 | 27 | class WindyGridWorld: 28 | def __init__(self, wind_grid, start_position, goal_position): 29 | self.wind_grid = wind_grid 30 | self.start_position = start_position 31 | self.goal_position = goal_position 32 | 33 | def possible_states(self): 34 | return np.ndindex(*self.wind_grid.shape) 35 | 36 | def run_episode(self, policy, alpha, gamma=1., train_policy=False): 37 | position = self.start_position 38 | action = policy(position) 39 | 40 | states = [] 41 | actions = [] 42 | rewards = [] 43 | 44 | while position != self.goal_position: 45 | new_position = self.move(position, action) 46 | new_action = policy(new_position) 47 | reward = self.reward(new_position) 48 | 49 | if train_policy: 50 | updates.sarsa( 51 | action_values=policy.action_values, 52 | old_state=position, 53 | action=action, 54 | reward=reward, 55 | new_state=new_position, 56 | new_action=new_action, 57 | alpha=alpha, 58 | gamma=gamma 59 | ) 60 | 61 | states.append(position) 62 | actions.append(action) 63 | rewards.append(reward) 64 | 65 | position = new_position 66 | action = new_action 67 | 68 | return Episode( 69 | states=states, 70 | actions=actions, 71 | rewards=rewards 72 | ) 73 | 74 | def reward(self, new_position): 75 | return 0 if new_position == self.goal_position else -1 76 | 77 | def _clip_to_grid(self, position): 78 | x, y = position 79 | return ( 80 | min(max(x, 0), self.wind_grid.shape[0] - 1), 81 | min(max(y, 0), self.wind_grid.shape[1] - 1) 82 | ) 83 | 84 | def get_wind(self, position): 85 | return self.wind_grid[position] 86 | 87 | def move(self, from_position, action): 88 | x, y = from_position 89 | dx, dy = action 90 | new_position = (x + dx - self.get_wind(from_position), y + dy) 91 | # from the diagram in the book, you can see that if you go off the grid then you 92 | # get your position rounded to the edge of the grid. 93 | return self._clip_to_grid(new_position) 94 | 95 | def _episode_string(self, episode): 96 | output = "" 97 | for i in range(self.wind_grid.shape[0]): 98 | for j in range(self.wind_grid.shape[1]): 99 | pos = i, j 100 | if pos == self.start_position: 101 | output += "s" 102 | elif pos == self.goal_position: 103 | output += "f" 104 | elif pos in episode.states: 105 | output += "*" 106 | else: 107 | output += str(self.wind_grid[pos]) 108 | output += " " 109 | output += "\n" 110 | return output 111 | 112 | def print_episode(self, episode): 113 | return print(self._episode_string(episode)) 114 | 115 | 116 | if __name__ == "__main__": 117 | n_episodes = 500 118 | alphas = np.linspace(0.5, 0., n_episodes) 119 | gamma = 1. 120 | 121 | # this reproduces the textbook example. 
122 | # possible_actions = [ 123 | # (-1, 0), 124 | # (0, -1), 125 | # (0, 1), 126 | # (1, 0), 127 | # ] 128 | 129 | possible_actions = [ 130 | (-1, -1), 131 | (-1, 0), 132 | (-1, 1), 133 | (0, -1), 134 | # (0, 0), 135 | (0, 1), 136 | (1, -1), 137 | (1, 0), 138 | (1, 1) 139 | ] 140 | 141 | gridworld = WindyGridWorld( 142 | wind_grid=np.loadtxt( 143 | os.path.join( 144 | c.Paths.input, 145 | 'ex_6_9', 146 | 'gridworld.csv' 147 | ), 148 | dtype=int 149 | ), 150 | start_position=(3, 0), 151 | goal_position=(3, 7) 152 | ) 153 | 154 | av = initial_action_values(.5, gridworld, possible_actions) 155 | epsilon_greedy_policy = EpsilonGreedyPolicy( 156 | action_values=av, 157 | epsilon=0.1 158 | ) 159 | 160 | episodes = list() 161 | total_training_steps = 0 162 | for i, a in enumerate(alphas): 163 | e = gridworld.run_episode( 164 | epsilon_greedy_policy, 165 | alpha=a, 166 | gamma=gamma, 167 | train_policy=True 168 | ) 169 | episodes.append(e) 170 | 171 | steps = len(e.rewards) 172 | total_training_steps += steps 173 | print(f"Episode {i}: {steps} steps") 174 | print(f"Total Training Steps: {total_training_steps}") 175 | 176 | greedy_policy = GreedyPolicy(epsilon_greedy_policy.action_values) 177 | greedy_episode = gridworld.run_episode( 178 | greedy_policy, 179 | alpha=0., 180 | gamma=gamma, 181 | train_policy=False 182 | ) 183 | 184 | print("\n\n\n\n\n") 185 | print(f"Greedy Episode: {len(greedy_episode.rewards)} steps") 186 | gridworld.print_episode(greedy_episode) 187 | -------------------------------------------------------------------------------- /code/exercises/ex_7_2/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 02/07/2018 14:00 6 | --------------------------------- 7 | 8 | """ 9 | -------------------------------------------------------------------------------- /code/exercises/ex_7_2/comparison.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 02/07/2018 14:00 6 | --------------------------------- 7 | 8 | 9 | We set \gamma = 1 in this example: **which makes the two methods equivalent** 10 | 11 | Note: the methods are programmed in a way that is specific to this problem 12 | 13 | """ 14 | from collections import namedtuple 15 | 16 | import numpy as np 17 | 18 | Episode = namedtuple('Episode', ['states', 'reward']) 19 | 20 | 21 | class RandomWalkEnvironment: 22 | def __init__(self, starting_state, random_state): 23 | self.starting_state = starting_state 24 | self.random_state = random_state 25 | 26 | self.current_state = starting_state 27 | self.n_states = 2 * starting_state - 1 28 | self.states = list(range(1, self.n_states + 1)) 29 | self.terminal_states = (self.states[0] - 1, self.states[-1] + 1) 30 | 31 | self._rewards = dict(zip(self.terminal_states, [0, 1])) 32 | 33 | def next(self): 34 | if self.current_state in self.terminal_states: 35 | raise StopIteration() 36 | self.current_state += self.random_state.binomial(1, 0.5) * 2 - 1 37 | return self.current_state, self.reward(self.current_state) 38 | 39 | def reset(self): 40 | self.current_state = self.starting_state 41 | return None 42 | 43 | def reward(self, state): 44 | return self._rewards[state] if state in self.terminal_states else 0 45 | 46 | def true_state_values(self): 47 | return {s: s / (self.n_states + 1) for s in self.states} 48 | 49 | def 
terminated(self): 50 | return self.current_state in self.terminal_states 51 | 52 | def generate_episode(self, max_steps=10 ** 5): 53 | self.reset() 54 | 55 | i = 1 56 | states = [self.current_state] 57 | while not self.terminated(): 58 | environment.next() 59 | states.append(self.current_state) 60 | i += 1 61 | if i >= max_steps: 62 | break 63 | return Episode( 64 | states=states, 65 | reward=self.reward(self.current_state) 66 | ) 67 | 68 | 69 | def initial_values(environment, v=1.): 70 | vals = { 71 | s: v for s in environment.states 72 | } 73 | vals.update({s: 0. for s in environment.terminal_states}) 74 | return vals 75 | 76 | 77 | def rms_error(estimate, truth): 78 | return np.sqrt(np.mean([(estimate[s] - truth[s]) ** 2 for s in truth])) 79 | 80 | 81 | class TDNPredictor: 82 | """specific to this problem""" 83 | 84 | def __init__(self, n, alpha, environment): 85 | self.n = n 86 | self.alpha = alpha 87 | self.environment = environment 88 | 89 | self.true_state_values = environment.true_state_values() 90 | self.state_values = None 91 | self.reset() 92 | 93 | def set_alpha(self, a): 94 | self.alpha = a 95 | return None 96 | 97 | def reset(self): 98 | self.state_values = initial_values(self.environment, v=0.) 99 | return None 100 | 101 | def update(self, t, terminal_time, episode): 102 | target = episode.reward * (t + self.n >= terminal_time) 103 | if t + self.n < terminal_time: 104 | target += self.state_values[episode.states[t + self.n]] 105 | return target - self.state_values[episode.states[t]] 106 | 107 | def process_episode(self, episode): 108 | terminal_time = len(episode.states) - 1 109 | for t, s in enumerate(episode.states[:-1]): 110 | self.state_values[s] += self.alpha * ( 111 | self.update(t, terminal_time, episode) 112 | ) 113 | return rms_error(self.state_values, self.true_state_values) 114 | 115 | 116 | class TDErrorPredictor(TDNPredictor): 117 | 118 | def td_error(self, t, terminal_time, episode): 119 | out = self.state_values[episode.states[t + 1]] - self.state_values[episode.states[t]] 120 | return out + episode.reward if t + 1 == terminal_time else out 121 | 122 | def update(self, t, terminal_time, episode): 123 | # if t == terminal_time: 124 | # return episode.reward - self.state_values[episode.states[t]] 125 | n_left = min(terminal_time - t, self.n) 126 | return sum(self.td_error(t + k, terminal_time, episode) for k in range(n_left)) 127 | 128 | 129 | def dict_average(dicts): 130 | return { 131 | s: np.mean([d[s] for d in dicts]) for s in dicts[0] 132 | } 133 | 134 | 135 | def test_predictor(predictor, n_runs=1000, n_episodes_per_run=100, alpha_0=0.5): 136 | alphas = np.linspace(alpha_0, 0, n_episodes_per_run) 137 | all_errors = list() 138 | state_vals = list() 139 | for i in range(n_runs): 140 | predictor.reset() 141 | errors = list() 142 | for a in alphas: 143 | predictor.set_alpha(a) 144 | e = environment.generate_episode() 145 | errors.append(predictor.process_episode(e)) 146 | all_errors.append(np.array(errors)) 147 | state_vals.append(predictor.state_values) 148 | return dict_average(state_vals), sum(all_errors) / n_runs 149 | 150 | 151 | def run_pred(tup): 152 | x, y = tup 153 | return test_predictor(x, **y) 154 | 155 | 156 | def run_step_lengths(environment, predictor_cls, config): 157 | with ProcessPoolExecutor() as executor: 158 | results = executor.map( 159 | run_pred, 160 | [(predictor_cls(n, 0.5, environment), config) for n in range(1, environment.n_states)] 161 | ) 162 | 163 | vals, errors = zip(*results) 164 | return vals, [e.T for e in errors] 165 | 166 
| 167 | if __name__ == "__main__": 168 | from concurrent.futures import ProcessPoolExecutor 169 | from pprint import pprint 170 | 171 | import matplotlib; 172 | 173 | matplotlib.use("TkAgg") 174 | import matplotlib.pyplot as plt 175 | 176 | config = { 177 | 'n_runs': 100, 178 | 'n_episodes_per_run': 100, 179 | 'alpha_0': 0.5 180 | } 181 | 182 | random_state = np.random.RandomState(seed=3) 183 | environment = RandomWalkEnvironment(starting_state=3, random_state=random_state) 184 | tdn_vals, tdn_errors = run_step_lengths(environment, TDNPredictor, config=config) 185 | 186 | random_state = np.random.RandomState(seed=3) 187 | environment = RandomWalkEnvironment(starting_state=3, random_state=random_state) 188 | tderror_vals, tderror_errors = run_step_lengths(environment, TDErrorPredictor, config=config) 189 | 190 | for i, d in enumerate(tdn_vals): 191 | print(i) 192 | pprint( 193 | {s: d[s] - tderror_vals[i][s] for s in d} 194 | ) 195 | pprint(tdn_vals) 196 | pprint(tderror_vals) 197 | 198 | for i, er in enumerate(tdn_errors, start=1): 199 | plt.plot(er, label=f"TDN: n={i}", color=f"C{i}") 200 | 201 | for i, er in enumerate(tderror_errors, start=1): 202 | plt.plot(er, label=f"TD Error: n={i}", color=f"C{i}", ls="--") 203 | 204 | plt.legend(ncol=2) 205 | plt.show() 206 | -------------------------------------------------------------------------------- /code/exercises/ex_8_4/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 09/07/2018 16:08 6 | --------------------------------- 7 | 8 | """ 9 | -------------------------------------------------------------------------------- /code/exercises/ex_8_4/dynaq_gridworld_comparison.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 09/07/2018 16:08 6 | --------------------------------- 7 | 8 | 9 | I changed the example to give a negative expected_reward for each timestep because this means that I don't 10 | have to implement random tie-breaking with max in the action-selection. It also makes convergence faster 11 | since rewards are seen immediately. 
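With zero rewards everywhere except the goal and zero initial action values,
every action looks equally good, so a plain deterministic max would need random
tie-breaking to avoid always repeating the first action; with a reward of -1
per step, actions that have been tried immediately look worse than untried
ones, so the plain max suffices and useful value information is available from
the very first transition.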
12 | 13 | """ 14 | from concurrent.futures import ProcessPoolExecutor 15 | 16 | import numpy as np 17 | 18 | from generic import policies, utils 19 | from generic.agents.dyna_q import DynaQAgent, DynaQPlusAgent 20 | from generic.environments import Gridworld 21 | 22 | POSSIBLE_ACTIONS = [(0, 1), (1, 0), (-1, 0), (0, -1)] 23 | 24 | 25 | def initial_action_values(grid_shape, possible_actions=POSSIBLE_ACTIONS, v=0.): 26 | return { 27 | s: {a: v for a in possible_actions} for s in np.ndindex(*grid_shape) 28 | } 29 | 30 | 31 | def mark_trajectory(grid, states, agent_flag=2): 32 | g = grid.copy() 33 | for s in states: 34 | g[s] = agent_flag 35 | return g 36 | 37 | 38 | def learning_curve(agent_maker, environment, n_steps, n_iters=10): 39 | args = [ 40 | (agent_maker(environment.grid.shape), environment, n_steps, True) 41 | for _ in range(n_iters) 42 | ] 43 | with ProcessPoolExecutor() as executor: 44 | all_results = executor.map( 45 | utils.run_continuous, 46 | *zip(*args) 47 | ) 48 | all_rewards = np.c_[[np.array(r.rewards) for r in all_results]].T 49 | return np.cumsum(all_rewards + 1, axis=0) 50 | 51 | 52 | def run_blocking_maze(agent_maker): 53 | grid = blocked_grid(open='left') 54 | agent = agent_maker(grid.shape) 55 | 56 | environment = Gridworld(grid=grid, start_position=(5, 3), goal_position=(0, 8)) 57 | r1 = utils.run_continuous(agent, environment, 1000, True) 58 | 59 | environment.grid = blocked_grid(open='right') 60 | environment.reset() 61 | r2 = utils.run_continuous(agent, environment, 2000, True) 62 | return utils.Results( 63 | states=r1.states + r2.states, 64 | actions=r1.actions + r2.actions, 65 | rewards=r1.rewards + r2.rewards 66 | ) 67 | 68 | 69 | def blocking_maze_learning_curve(agent_maker, n_iters=10): 70 | with ProcessPoolExecutor() as executor: 71 | all_results = executor.map( 72 | run_blocking_maze, 73 | [agent_maker for _ in range(n_iters)] 74 | ) 75 | all_rewards = np.c_[[np.array(r.rewards) for r in all_results]].T 76 | return np.cumsum(all_rewards + 1, axis=0) 77 | 78 | 79 | def blocked_grid(shape=(6, 9), block_row=3, open='left', block_flag=1): 80 | grid = np.zeros(shape) 81 | grid[block_row, :] = block_flag 82 | 83 | if open == 'left' or open == 'both': 84 | grid[block_row, 0] = 0 85 | if open == 'right' or open == "both": 86 | grid[block_row, -1] = 0 87 | 88 | return grid 89 | 90 | 91 | if __name__ == "__main__": 92 | import os 93 | 94 | import matplotlib; matplotlib.use("TkAgg") 95 | import matplotlib.pyplot as plt 96 | 97 | import plotting 98 | import constants as c 99 | 100 | 101 | def dyna_q_agent_maker(grid_shape): 102 | epsilon_greedy_policy = policies.EpsilonGreedyPolicy( 103 | action_values=initial_action_values(grid_shape), 104 | epsilon=0.1 105 | ) 106 | return DynaQAgent( 107 | policy=epsilon_greedy_policy, 108 | alpha=0.1, 109 | gamma=0.95, 110 | n_plan_iter=50, 111 | random_state=np.random.RandomState(None) 112 | ) 113 | 114 | def dyna_q_plus_agent_maker(grid_shape): 115 | greedy_policy = policies.GreedyPolicy( 116 | action_values=initial_action_values(grid_shape) 117 | ) 118 | return DynaQPlusAgent( 119 | policy=greedy_policy, 120 | alpha=0.1, 121 | gamma=0.95, 122 | kappa=0.01, 123 | n_plan_iter=50, 124 | random_state=np.random.RandomState(None) 125 | ) 126 | 127 | def altered_dyna_q_plus_agent_maker(grid_shape): 128 | policy = policies.TimeBiasedPolicy( 129 | action_values=initial_action_values(grid_shape=grid_shape), 130 | kappa=0.01 131 | ) 132 | return DynaQAgent( 133 | policy=policy, 134 | alpha=0.1, 135 | gamma=0.95, 136 | n_plan_iter=50, 137 
| random_state=np.random.RandomState(None) 138 | ) 139 | 140 | 141 | dyna_q_curve = blocking_maze_learning_curve(dyna_q_agent_maker) 142 | dyna_q_plus_curve = blocking_maze_learning_curve(dyna_q_plus_agent_maker) 143 | altered_dyna_q_plus_curve = blocking_maze_learning_curve(altered_dyna_q_plus_agent_maker) 144 | 145 | with plt.rc_context(plotting.rc()): 146 | fig, ax = plt.subplots(1) 147 | ax.plot(np.mean(dyna_q_curve, axis=1), label="Dyna Q") 148 | ax.plot(np.mean(dyna_q_plus_curve, axis=1), label="Dyna Q Plus") 149 | ax.plot(np.mean(altered_dyna_q_plus_curve, axis=1), label="Altered Dyna Q Plus") 150 | ax.grid(alpha=0.1) 151 | ax.axvline(1000, color='k', zorder=-1, ls='--', label='Block') 152 | ax.legend() 153 | 154 | ax.set_xlabel("Time Step") 155 | ax.set_ylabel("Cumulative Reward") 156 | ax.set_title("Blocking Maze Example", fontsize=14) 157 | 158 | plotting.savefig( 159 | fig, 160 | os.path.join( 161 | c.Paths.output, 162 | 'ex_8_4', 163 | 'dyna_q_comparison.png' 164 | ) 165 | ) 166 | 167 | # --------------------------- 168 | # Comparison on a static maze 169 | # --------------------------- 170 | # grid = blocked_grid(open='left') 171 | # environment = Gridworld(grid=grid, start_position=(5, 3), goal_position=(0, 8)) 172 | # 173 | # dynaq_cumulative_rewards = learning_curve( 174 | # dyna_q_agent_maker, 175 | # environment, 176 | # n_steps=3000, 177 | # n_iters=10 178 | # ) 179 | # 180 | # dynaqplus_cumulative_rewards = learning_curve( 181 | # dyna_q_plus_agent_maker, 182 | # environment, 183 | # n_steps=3000, 184 | # n_iters=10 185 | # ) 186 | # 187 | # plt.plot(np.mean(dynaq_cumulative_rewards, axis=1), label='Dyna-Q') 188 | # plt.plot(np.mean(dynaqplus_cumulative_rewards, axis=1), label='Dyna-Q+') 189 | # plt.legend() 190 | # print('ready') 191 | # plt.show() 192 | 193 | # --------------- 194 | # running a single example 195 | # --------------- 196 | # grid = blocked_grid(open='left') 197 | # environment = Gridworld(grid=grid, start_position=(5, 3), goal_position=(0, 8)) 198 | # print(grid) 199 | # 200 | # a = altered_dyna_q_plus_agent_maker(grid.shape) 201 | # results = utils.run_continuous(a, environment, n_steps=3000) 202 | # 203 | # a.set_policy_type(policies.GreedyPolicy) 204 | # greedy_episode = utils.run_episode(a, environment, update=False) 205 | # 206 | # print('-------------------------------------------------') 207 | # print("Greedy Trajectory") 208 | # g = mark_trajectory(grid, greedy_episode.states, 2) 209 | # print(g) 210 | # 211 | # fig, ax = plt.subplots(1) 212 | # ax.imshow(g) 213 | # ax.set_title("Greedy Episode") 214 | # plt.show() 215 | -------------------------------------------------------------------------------- /code/exercises/ex_8_8/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 12/07/2018 10:33 6 | --------------------------------- 7 | 8 | """ 9 | -------------------------------------------------------------------------------- /code/exercises/tests/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 11/04/2018 18:11 6 | --------------------------------- 7 | 8 | """ 9 | -------------------------------------------------------------------------------- /code/exercises/tests/ex_2_5/__init__.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 11/04/2018 18:11 6 | --------------------------------- 7 | 8 | """ 9 | -------------------------------------------------------------------------------- /code/exercises/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 13/07/2018 16:48 6 | --------------------------------- 7 | 8 | """ 9 | import logging 10 | import pickle 11 | 12 | 13 | class Bunch(dict): 14 | def __init__(self, *args, **kwargs): 15 | super().__init__(*args, **kwargs) 16 | self.__dict__ = self 17 | 18 | 19 | def configure_stream_logger(logger, level=logging.DEBUG): 20 | logger.setLevel(level) 21 | ch = logging.StreamHandler() 22 | ch.setLevel(logging.DEBUG) 23 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 24 | ch.setFormatter(formatter) 25 | logger.addHandler(ch) 26 | return None 27 | 28 | 29 | def read_pickle(path): 30 | with open(path, 'rb') as f: 31 | return pickle.load(f) 32 | 33 | 34 | def to_pickle(data, path): 35 | with open(path, 'wb') as f: 36 | return pickle.dump(data, f) 37 | -------------------------------------------------------------------------------- /code/generic/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 26/06/2018 17:27 6 | --------------------------------- 7 | 8 | """ 9 | -------------------------------------------------------------------------------- /code/generic/agents/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 10/07/2018 15:56 6 | --------------------------------- 7 | 8 | """ 9 | -------------------------------------------------------------------------------- /code/generic/agents/dyna_q.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 10/07/2018 15:50 6 | --------------------------------- 7 | 8 | """ 9 | from collections import defaultdict 10 | 11 | import numpy as np 12 | 13 | from generic import utils, updates 14 | 15 | """ 16 | - later should refactor out the model into a class that can be updated 17 | 18 | """ 19 | 20 | 21 | class DynaQAgent: 22 | def __init__(self, policy, n_plan_iter, alpha, gamma, random_state=None): 23 | self.policy = policy 24 | self.n_plan_iter = n_plan_iter 25 | self.alpha = alpha 26 | self.gamma = gamma 27 | self.random_state = random_state or np.random.RandomState(seed=0) 28 | 29 | self._model_table = defaultdict(dict) 30 | 31 | def set_policy_type(self, policy_class, **params): 32 | self.policy = policy_class(self.policy.action_values, **params) 33 | return None 34 | 35 | def choose_action(self, state): 36 | return self.policy(state) 37 | 38 | def model(self, state, action): 39 | return self._model_table[state][action] 40 | 41 | def update(self, old_state, action, reward, new_state): 42 | self.update_action_values(old_state, action, reward, new_state) 43 | self.update_model(old_state, action, reward, new_state) 44 | self.plan(n=self.n_plan_iter) 45 | return None 46 | 47 | 
def plan(self, n): 48 | for _ in range(n): 49 | state = utils.choose_from(list(self._model_table.keys()), self.random_state) 50 | action = utils.choose_from(list(self._model_table[state].keys()), self.random_state) 51 | reward, new_state = self.model(state, action) 52 | self.update_action_values(state, action, reward, new_state) 53 | return None 54 | 55 | def update_action_values(self, old_state, action, reward, new_state): 56 | updates.q_learning( 57 | action_values=self.policy.action_values, 58 | old_state=old_state, 59 | action=action, 60 | reward=reward, 61 | new_state=new_state, 62 | alpha=self.alpha, 63 | gamma=self.gamma 64 | ) 65 | return None 66 | 67 | def update_model(self, old_state, action, reward, new_state): 68 | self._model_table[old_state][action] = reward, new_state 69 | return None 70 | 71 | 72 | class DynaQPlusAgent(DynaQAgent): 73 | def __init__(self, policy, n_plan_iter, alpha, gamma, kappa=0.01, random_state=None): 74 | super().__init__(policy, n_plan_iter, alpha, gamma, random_state) 75 | 76 | self.kappa = kappa 77 | self._time_last_visited_table = defaultdict(dict) 78 | self._time_step = 0 79 | 80 | def choose_action(self, state): 81 | self._time_step += 1 82 | return super().choose_action(state) 83 | 84 | def time_since_visited(self, state, action): 85 | try: 86 | t = self._time_last_visited_table[state][action] 87 | except KeyError: 88 | t = -1 89 | return self._time_step - t 90 | 91 | def update(self, old_state, action, reward, new_state): 92 | super().update(old_state, action, reward, new_state) 93 | self._time_last_visited_table[old_state][action] = self._time_step 94 | return None 95 | 96 | def model(self, state, action): 97 | """Altered simulation rewards for time since taken action""" 98 | reward, new_state = super().model(state, action) 99 | r = reward + self.kappa * np.sqrt(self.time_since_visited(state, action)) 100 | return r, new_state 101 | -------------------------------------------------------------------------------- /code/generic/environments.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 10/07/2018 15:52 6 | --------------------------------- 7 | 8 | """ 9 | 10 | 11 | class Gridworld: 12 | """Gridworld environment with walls. Suitable for building mazes.""" 13 | def __init__(self, grid, start_position, goal_position, block_flag=1): 14 | self.grid = grid 15 | self.start_position = start_position 16 | self.goal_position = goal_position 17 | self.block_flag = block_flag 18 | 19 | self.current_state = self.start_position 20 | 21 | def reset(self): 22 | self.current_state = self.start_position 23 | return None 24 | 25 | def step(self, action): 26 | dx, dy = action 27 | x, y = self.current_state 28 | new_position = x + dx, y + dy 29 | if not self._on_grid(new_position) or self.grid[new_position] == self.block_flag: 30 | new_position = self.current_state 31 | self.current_state = new_position 32 | return new_position, self._reward(new_position), self._done(new_position) 33 | 34 | def _done(self, position): 35 | return position == self.goal_position 36 | 37 | def _reward(self, position): 38 | return -1. if position != self.goal_position else 0. 
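    # Reward convention: -1 for every step and 0 on reaching the goal, so the
    # undiscounted return of an episode is minus its length and a
    # return-maximising agent is pushed towards the shortest path.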
39 | 40 | def _on_grid(self, position): 41 | x, y = position 42 | return 0 <= x < self.grid.shape[0] and 0 <= y < self.grid.shape[1] -------------------------------------------------------------------------------- /code/generic/policies.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 26/06/2018 17:26 6 | --------------------------------- 7 | 8 | """ 9 | from collections import defaultdict 10 | 11 | import numpy as np 12 | 13 | from generic.utils import choose_from 14 | 15 | 16 | def greedy(state_action_values): 17 | return max(state_action_values, key=lambda k: state_action_values[k]) 18 | 19 | 20 | # not sure if these policy classes should keep references to the action values. Maybe not... 21 | class GreedyPolicy: 22 | def __init__(self, action_values, cache=False): 23 | """ 24 | A greedy policy. 25 | 26 | Args: 27 | action_values (dict): Action Values 28 | cache (bool): Whether to cache the policy decisions, useful if you only want to evaluate the policy and 29 | you won't be updating the action values 30 | """ 31 | self.action_values = action_values 32 | self._cache = cache 33 | 34 | if cache: 35 | self._greedy = dict() 36 | 37 | def _cache_call(self, state): 38 | if state not in self._greedy: 39 | self._greedy[state] = greedy(self.action_values[state]) 40 | return self._greedy[state] 41 | 42 | def __call__(self, state): 43 | return self._cache_call(state) if self._cache else greedy(self.action_values[state]) 44 | 45 | 46 | class EpsilonGreedyPolicy: 47 | def __init__(self, action_values, epsilon, random_state=None): 48 | self.action_values = action_values 49 | self.epsilon = epsilon 50 | self.random_state = random_state or np.random.RandomState(seed=0) 51 | 52 | def explore(self): 53 | return self.random_state.binomial(n=1, p=self.epsilon) == 1 54 | 55 | def __call__(self, state): 56 | if self.explore(): 57 | return choose_from(list(self.action_values[state].keys()), self.random_state) 58 | else: 59 | return greedy(self.action_values[state]) 60 | 61 | 62 | class TimeBiasedPolicy: 63 | def __init__(self, action_values, kappa): 64 | self.action_values = action_values 65 | 66 | self.kappa = kappa 67 | self._time_last_visited_table = defaultdict(dict) 68 | self._time_step = 0 69 | 70 | def time_since_visited(self, state, action): 71 | try: 72 | t = self._time_last_visited_table[state][action] 73 | except KeyError: 74 | t = -1 75 | return self._time_step - t 76 | 77 | def __call__(self, state): 78 | modified_av = { 79 | a: v + self.kappa * np.sqrt(self.time_since_visited(state, a)) 80 | for a, v in self.action_values[state].items() 81 | } 82 | chosen_action = greedy(modified_av) 83 | self._time_last_visited_table[state][chosen_action] = self._time_step 84 | self._time_step += 1 85 | return chosen_action 86 | -------------------------------------------------------------------------------- /code/generic/updates.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 27/06/2018 11:43 6 | --------------------------------- 7 | 8 | update action-value dictionaries in-place 9 | 10 | """ 11 | from generic.policies import greedy 12 | 13 | 14 | def sarsa(action_values, old_state, action, reward, new_state, new_action, alpha, gamma): 15 | """update action values in-place""" 16 | action_values[old_state][action] += alpha * ( 17 | reward 
+ gamma * action_values[new_state][new_action] - action_values[old_state][action] 18 | ) 19 | return None 20 | 21 | 22 | def q_learning(action_values, old_state, action, reward, new_state, alpha, gamma): 23 | new_action = greedy(action_values[new_state]) 24 | action_values[old_state][action] += alpha * ( 25 | reward + gamma * action_values[new_state][new_action] - action_values[old_state][action] 26 | ) 27 | return None 28 | -------------------------------------------------------------------------------- /code/generic/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 26/06/2018 17:29 6 | --------------------------------- 7 | 8 | """ 9 | from collections import namedtuple 10 | 11 | Results = namedtuple('Results', ['states', 'actions', 'rewards']) 12 | 13 | 14 | def choose_from(seq, random_state): 15 | """to get around numpy interpreting list of tuples as an array""" 16 | return seq[random_state.choice(len(seq))] 17 | 18 | 19 | def run_episode(agent, environment, maxiter=10 ** 5, update=True): 20 | environment.reset() 21 | 22 | states = [environment.current_state] 23 | actions = list() 24 | rewards = list() 25 | for _ in range(maxiter): 26 | old_state = environment.current_state 27 | action = agent.choose_action(state=old_state) 28 | new_state, reward, done = environment.step(action) 29 | 30 | if update: 31 | agent.update(old_state, action, reward, new_state) 32 | 33 | states.append(new_state) 34 | actions.append(action) 35 | rewards.append(reward) 36 | 37 | if done: 38 | break 39 | 40 | return Results( 41 | states=states, 42 | rewards=rewards, 43 | actions=actions 44 | ) 45 | 46 | 47 | def run_continuous(agent, environment, n_steps, update=True): 48 | environment.reset() 49 | 50 | states = [environment.current_state] 51 | actions = list() 52 | rewards = list() 53 | for _ in range(n_steps): 54 | old_state = environment.current_state 55 | action = agent.choose_action(state=old_state) 56 | new_state, reward, done = environment.step(action) 57 | 58 | if update: 59 | agent.update(old_state, action, reward, new_state) 60 | 61 | actions.append(action) 62 | rewards.append(reward) 63 | states.append(new_state) 64 | 65 | if done: 66 | environment.reset() 67 | 68 | return Results( 69 | states=states, 70 | actions=actions, 71 | rewards=rewards 72 | ) 73 | -------------------------------------------------------------------------------- /code/plotting.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 12/04/2018 12:47 6 | --------------------------------- 7 | 8 | """ 9 | 10 | 11 | def rc(kwds={}): 12 | default_kwds = { 13 | 'figure.figsize': (20, 10), 14 | 'font.size': 12 15 | } 16 | default_kwds.update(kwds) 17 | return kwds 18 | 19 | 20 | def multi_ax_legend(*ax, **kwargs): 21 | ax = list(ax) 22 | ax0 = ax.pop(0) 23 | lins, labs = ax0.get_legend_handles_labels() 24 | for a in ax: 25 | morelins, morelabs = a.get_legend_handles_labels() 26 | lins.extend(morelins) 27 | labs.extend(morelabs) 28 | return ax0.legend(lins, labs, **kwargs) 29 | 30 | 31 | def savefig(fig, path, **kwargs): 32 | kws = dict( 33 | # dpi=1000, 34 | # format="eps", 35 | bbox_inches="tight" 36 | ) 37 | kws.update(kwargs) 38 | return fig.savefig( 39 | path, 40 | **kws 41 | ) 42 | -------------------------------------------------------------------------------- 
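The generic modules above are only exercised from the chapter-specific scripts, so a minimal end-to-end sketch may help. It mirrors the commented-out single-run example in ex_8_4/dynaq_gridworld_comparison.py; the grid size and hyperparameters below are purely illustrative, not taken from any exercise.

import numpy as np

from generic import policies, utils
from generic.agents.dyna_q import DynaQAgent
from generic.environments import Gridworld

actions = [(0, 1), (1, 0), (-1, 0), (0, -1)]
grid = np.zeros((4, 6))  # small open grid, no walls (illustrative only)
action_values = {s: {a: 0. for a in actions} for s in np.ndindex(*grid.shape)}

environment = Gridworld(grid=grid, start_position=(3, 0), goal_position=(0, 5))
agent = DynaQAgent(
    policy=policies.EpsilonGreedyPolicy(action_values=action_values, epsilon=0.1),
    alpha=0.1,
    gamma=0.95,
    n_plan_iter=10,
    random_state=np.random.RandomState(0)
)

# learn in the continuing setting, then evaluate a single greedy episode
utils.run_continuous(agent, environment, n_steps=2000, update=True)
agent.set_policy_type(policies.GreedyPolicy)
greedy_episode = utils.run_episode(agent, environment, update=False)
print(f"Greedy episode took {len(greedy_episode.rewards)} steps")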
/code/requirements.txt: -------------------------------------------------------------------------------- 1 | cycler==0.10.0 2 | kiwisolver==1.0.1 3 | matplotlib==2.2.2 4 | nose==1.3.7 5 | numpy==1.14.2 6 | pandas==0.22.0 7 | pyparsing==2.2.0 8 | python-dateutil==2.7.2 9 | pytz==2018.4 10 | six==1.11.0 11 | -------------------------------------------------------------------------------- /code/tests/bandits/crappy_tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | -------------------------------- 4 | project: code 5 | created: 11/04/2018 15:15 6 | --------------------------------- 7 | 8 | These tests are very informal 9 | 10 | """ 11 | import unittest 12 | 13 | import numpy as np 14 | 15 | from bandits import EpsilonGreedyActor, SampleAverageEstimator, ActionValueBanditAgent 16 | 17 | 18 | class EpsilonGreedyActorTestCase(unittest.TestCase): 19 | def test_proportion_exploration(self): 20 | epsilon = 0.01 21 | 22 | actor = EpsilonGreedyActor( 23 | n_actions=4, 24 | random_state=np.random.RandomState(None), 25 | epsilon=epsilon 26 | ) 27 | 28 | optimal_action = 3 29 | n = 10000 30 | 31 | choices = list() 32 | for i in range(n): 33 | choices.append(actor.action([optimal_action])) 34 | 35 | prop = np.sum(np.array(choices) == optimal_action) 36 | 37 | print(1 - prop/n) 38 | print(epsilon) 39 | return None 40 | 41 | 42 | class SampleAverageEstimatorTestCase(unittest.TestCase): 43 | def test_convergence(self): 44 | mean = 0.98 45 | 46 | state = np.random.RandomState(0) 47 | samples = state.normal(loc=mean, scale=1, size=int(1e5)) 48 | 49 | estimator = SampleAverageEstimator(default_value=0) 50 | 51 | for x in samples: 52 | estimator.update(x) 53 | 54 | print('Estimator value is', estimator.value) 55 | print('Sample mean is', np.mean(samples)) 56 | print('True mean is', mean) 57 | return None 58 | 59 | 60 | class BanditAgentTestCase(unittest.TestCase): 61 | pass 62 | # CHANGE THIS INTO SOME SHORT TEST 63 | 64 | # print(np.sum(choices == optimal_actions) / n_steps) 65 | # 66 | # print(samples) 67 | # 68 | # print(np.c_[[choices, optimal_actions, explore]].T) 69 | 70 | # 71 | # print('------------------------------') 72 | # print('Results') 73 | # print('------------------------------') 74 | # print('{:<10} {:<10} {:<10}'.format('choice', 'expected_reward', 'optimal')) 75 | # print('------------------------------') 76 | # choices = list() 77 | # for row in samples: 78 | # choice = agent.action() 79 | # choices.append(choice) 80 | # expected_reward = row[choice] 81 | # agent.update(choice, expected_reward) 82 | # optimal = np.argmax(row) 83 | # print('{:<10} {:<10} {:<10}'.format(choice, '{:.2f}'.format(expected_reward), optimal)) 84 | # print('------------------------------') 85 | 86 | 87 | 88 | # write a small test for the agent thing 89 | # figure out a way to do the analysis 90 | 91 | 92 | # agent.update(0, 10) 93 | # print(agent.get_estimates()) 94 | # print(agent.action()) 95 | # 96 | # agent.update(0, 1) 97 | # print(agent.get_estimates()) 98 | # print(agent.action()) 99 | # 100 | # e = agent.estimators[0] 101 | # print(e.n_updates) 102 | # n_bandits = 1 #10 103 | # n_steps = int(1e5) 104 | # sampler = RandomWalkingValueSampler(n_bandits=n_bandits, n_steps=n_steps) 105 | # samples = sampler(initial_values=np.zeros(n_bandits)) 106 | 107 | if __name__ == '__main__': 108 | unittest.main() 109 | -------------------------------------------------------------------------------- 
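To complement the informal, print-based checks above, here is a sketch of what assertion-based versions might look like. It assumes that SampleAverageEstimator tracks the running sample mean and that EpsilonGreedyActor deviates from the greedy action on roughly an epsilon fraction of steps, which is what the prints above appear to verify by eye; neither implementation is shown here, so the tolerances are deliberately loose.

import unittest

import numpy as np

from bandits import EpsilonGreedyActor, SampleAverageEstimator


class AssertingTestCase(unittest.TestCase):
    def test_sample_average_matches_numpy_mean(self):
        samples = np.random.RandomState(0).normal(loc=0.98, scale=1., size=10 ** 5)
        estimator = SampleAverageEstimator(default_value=0)
        for x in samples:
            estimator.update(x)
        # the incrementally computed average should agree with the batch mean
        self.assertAlmostEqual(estimator.value, np.mean(samples), places=6)

    def test_epsilon_greedy_exploration_rate(self):
        epsilon, n, optimal_action = 0.1, 10 ** 4, 3
        actor = EpsilonGreedyActor(
            n_actions=4,
            random_state=np.random.RandomState(0),
            epsilon=epsilon
        )
        choices = np.array([actor.action([optimal_action]) for _ in range(n)])
        non_optimal_rate = np.mean(choices != optimal_action)
        # exploration may or may not re-select the optimal arm, so only check
        # that the non-optimal rate is on the order of epsilon
        self.assertGreater(non_optimal_rate, 0.)
        self.assertLess(non_optimal_rate, 2 * epsilon)


if __name__ == '__main__':
    unittest.main()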
/exercises/chapters/chapter1/chapter1_content.tex: -------------------------------------------------------------------------------- 1 | \section{Introduction} 2 | \subsection{Exercise 1.1: Self-Play} 3 | \subsubsection*{Q} 4 | Suppose, instead of playing against a random opponent, the reinforcement learning algorithm described above played against itself, with both sides learning. What do you think would happen in this case? Would it learn a different policy for selecting moves? 5 | 6 | \subsubsection*{A} 7 | \begin{itemize} 8 | \item Would learn a different policy than playing a fixed opponent since the opponent would also be changing in this case. 9 | \item May not be able to learn an optimal strategy as the opponent keeps changing also. 10 | \item Could get stuck in loops. 11 | \item Policy could remain static since on average they would draw each iteration. 12 | \end{itemize} 13 | 14 | \subsection{Exercise 1.2: Symmetries} 15 | \subsubsection*{Q} 16 | Many tic-tac-toe positions appear different but are really the same because of symmetries. How might we amend the learning process described above to take advantage of this? In what ways would this change improve the learning process? Now think again. Suppose the opponent did not take advantage of symmetries. In that case, should we? Is it true, then, that symmetrically equivalent positions should necessarily have the same value? 17 | 18 | \subsubsection*{A} 19 | \begin{itemize} 20 | \item We could label the states as unique up to symmetries so that our search space is smaller, this way we will get a better estimate of optimal play. 21 | \item If we are playing an opponent who does not take symmetries into account when they are playing then we should not label the states as the same since the opponent is part of the environment and the environment is not the same in those states. 22 | \end{itemize} 23 | 24 | \subsection{Exercise 1.3: Greedy Play} 25 | \subsubsection*{Q} 26 | Suppose the reinforcement learning player was greedy, that is, it always played the move that brought it to the position that it rated the best. Might it learn to play better, or worse, than a nongreedy player? What problems might occur 27 | 28 | \subsubsection*{A} 29 | \begin{itemize} 30 | \item The greedy player will not explore, so will in general perform worse than the non-greedy player 31 | \item If the greedy player had a perfect estimate of the value of states then this would be fine. 32 | \end{itemize} 33 | 34 | \subsection{Exercise 1.4: Learning from Exploration} 35 | \subsubsection*{Q} 36 | Suppose learning updates occurred after all moves, including exploratory moves. If the step-size parameter is appropriately reduced over time (but not the tendency to explore), then the state values would converge to a set of probabilities. What are the two sets of probabilities computed when we do, and when we do not, learn from exploratory moves? Assuming that we do continue to make exploratory moves, which set of probabilities might be better to learn? Which would result in more wins? 37 | \subsubsection*{A} 38 | I think that an estimate for the probability of the state producing a win should be based on the optimal moves from that state. 39 | \begin{itemize} 40 | \item The one in which we only record the optimal moves is the probability of our optimal agent winning. If we include exploration then this is the probability of the training agent winning. 
41 | \item Better to learn the probability of winning with no exploration since this is how the agent will perform in real time play. 42 | \item Updating from optimal moves only will increase probability of winning. 43 | \end{itemize} 44 | 45 | \subsection{Exercise 1.5: Other Improvements} 46 | \subsubsection*{Q} 47 | Can you think of other ways to improve the reinforcement learning player? Can you think of any better way to solve the tic-tac-toe problem as posed? 48 | 49 | \subsubsection*{A} 50 | I'm not too sure here... 51 | \begin{itemize} 52 | \item We could rank the draws as better than the losses. 53 | \item We might like to try running multiple iterations of games before updating our weights as this might give a better estimate. 54 | \end{itemize} 55 | -------------------------------------------------------------------------------- /exercises/chapters/chapter10/chapter10.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brynhayder/reinforcement_learning_an_introduction/d8b1945f61a8397b684f8d8d800ed0d9308a9a35/exercises/chapters/chapter10/chapter10.pdf -------------------------------------------------------------------------------- /exercises/chapters/chapter10/chapter10.tex: -------------------------------------------------------------------------------- 1 | \input{/Users/Bryn/Programming/remote/ReinforcementLearningAnIntroduction/header} 2 | 3 | \begin{document} 4 | \include{chapter10_content} 5 | \end{document} -------------------------------------------------------------------------------- /exercises/chapters/chapter10/chapter10_content.tex: -------------------------------------------------------------------------------- 1 | \section{On-policy Control with Approximation} 2 | 3 | \subsection{Exercise 10.1} 4 | \subsubsection*{Q} 5 | We have not explicitly considered or given pseudocode for any Monte Carlo methods or in this chapter. What would they be like? Why is it reasonable not to give pseudocode for them? How would they perform on the Mountain Car task? 6 | 7 | \subsubsection*{A} 8 | \begin{itemize} 9 | \item Monte Carlo is $n$-step Sarsa with $n \to \infty$ 10 | \item This is same pseudocode as given, but with full episodes and $G_t$ rather than $G_{t:t+n}$. 11 | \item Could have been very poor on the mountain car as may never have finished the first episode and does not learn within an episode (online) 12 | \end{itemize} 13 | 14 | \subsection{Exercise 10.2} 15 | \subsubsection*{Q} 16 | Give pseudocode for semi-gradient one-step \emph{Expected Sarsa} for control. 17 | 18 | \subsubsection*{A} 19 | Expected sarsa is the same but the target is 20 | \[ 21 | \sum_{k=t}^{t +n -1} \gamma^{k-t} R_{k + 1} + \sum_a \pi(a \vert{} S_{t+n}) q_{t + n -1}(S_{t+n}, a) 22 | \] 23 | 24 | \subsection{Exercise 10.3} 25 | \subsubsection*{Q} 26 | Why do the results shown in Figure 10.4 have higher standard errors at large $n$ than at small $n$? 27 | \subsubsection*{A} 28 | The longer the step length then the greater the variance in initial runs, this is because the agent needs to wait for $n$ steps to start learning. Some initial episodes of high $n$ cases could have been very poor. 29 | 30 | \subsection{Exercise 10.4} 31 | \subsubsection*{Q} 32 | Give pseudocode for a differential version of semi-gradient Q-learning. 
33 | 34 | \subsubsection*{A} 35 | Same as others but with the target 36 | \[ 37 | R_{t+1} - \bar{R}_{t+1} - \max_a \hat{q}(S_{t+1}, a, \vec{w}_t) 38 | \] 39 | 40 | \subsection{Exercise 10.5} 41 | \subsubsection*{Q} 42 | What equations are needed (beyond 10.10) to specify the differential version of TD(0)? 43 | 44 | \subsubsection*{A} 45 | Just need the semi-gradient update 46 | \[ 47 | \vec{w}_{t+1} = \vec{w}_{t} + \alpha \delta_t \grad_{\vec{w}_t} \hat{v}(S_t, \vec{w}_t) 48 | \] 49 | where 50 | \[ 51 | \delta_t = R_{t+1} - \bar{R}_{t+1} + \hat{v}(S_{t+1}, \vec{w}_{t}) - \hat{v}(S_{t}, \vec{w}_t) 52 | \] 53 | 54 | \subsection{Exercise 10.6} 55 | \subsubsection*{Q} 56 | Consider a Markov reward process consisting of a ring of three states A, B, and C, with state transitions going deterministically around the ring. A reward of 1 is received upon arrival in A and otherwise the reward is 0. What are the differential values of the three states? 57 | \subsubsection*{A} 58 | The average reward is $\bar{R} = \frac13$. To calculate the differential return we have 59 | \[ 60 | V(A) = \sum_t (a_t - \bar{R}) 61 | \] 62 | where $a_i = \mathds{1}\{i + 1 \equiv 0 \pmod 3\}$. This doesn't converge in the normal way, so to attempt to calculate it let's consider 63 | \[ 64 | V(A; \gamma) = \sum_t \gamma^t \left( a_t - \frac13\right) 65 | \] 66 | then, formally, we have 67 | \[ 68 | \lim_{\gamma \to 1} V(A; \gamma) = V(A). 69 | \] 70 | Now 71 | \begin{align*} 72 | V(A; \gamma) &= - \frac13 - \frac13 \gamma + \frac23 \gamma^2 + \sum_{t=3}^{\infty} \gamma^t \left(a_t - \frac13\right) \\ 73 | &= \frac13 (2 \gamma^2 - \gamma - 1) + \gamma ^3 \sum_{t=0}^{\infty} \gamma^t \left(a_t - \frac13\right) 74 | \end{align*} 75 | so 76 | \begin{align*} 77 | V(A; \gamma) &= \frac13 \frac{2 \gamma^2 - \gamma - 1}{1 - \gamma^3} \\ 78 | &= -\frac13 \frac{2 \gamma + 1}{\gamma^2 + \gamma +1} 79 | \end{align*} 80 | which leads to $V(A) = -\frac13$. \\ 81 | 82 | Then we have 83 | \[ 84 | V(A) = -\frac13 + V(B) \quad \implies \quad V(B) = 0 85 | \] 86 | and 87 | \[ 88 | V(B) = -\frac13 + V(C) \quad \implies \quad V(C) = \frac13. 89 | \] 90 | 91 | \subsection{Exercise 10.7} 92 | \subsubsection*{Q} 93 | Suppose there is an MDP that under any policy produces the deterministic sequence of rewards 1, 0, 1, 0, 1, 0, . . . going on forever. Technically, this is not allowed because it violates ergodicity; there is no stationary limiting distribution $\mu_\pi$ and the limit (10.7) does not exist. Nevertheless, the average reward (10.6) is well defined; What is it? Now consider two states in this MDP. From A, the reward sequence is exactly as described above, starting with a 1, whereas, from B, the reward sequence starts with a 0 and then continues with 1, 0, 1, 0, . . .. The differential return (10.9) is not well defined for this case as the limit does not exist. To repair this, one could alternately define the value of a state as 94 | 95 | \[ 96 | v_\pi(s) \doteq \lim_{\gamma \to 1} \lim_{h \to \infty} \sum_{t=0}^h \gamma^t \left( \Epi{}[R_{t+1} \vert{} S_0 = s] - r(\pi) \right). 97 | \] 98 | 99 | Under this definition, what are the values of states A and B? 100 | \subsubsection*{A} 101 | Define 102 | \[ 103 | f(h) = \frac{1}{2h} \sum_{t=0}^{2h} \mathds{1}\{t \equiv 0 \pmod 2\} = \frac{h + 1}{2h} 104 | \] 105 | then 106 | \[ 107 | \bar{R} = \lim_{h \to \infty}f(h/2) = \lim_{h \to \infty}f(h) = \frac12. 
108 | \] 109 | Now to compute the differential state values we write 110 | \[ 111 | V(S; \gamma) = \lim_{h\to \infty} \sum_{t=0}^h \gamma^t \left( \E{}[R_{t+1} \vert{} S_0 = s] - \bar{R} \right) 112 | \] 113 | then 114 | \begin{align*} 115 | V(A; \gamma) &= 1 - \bar{R} + \gamma V(B; \gamma) \\ 116 | V(B; \gamma) &= - \bar{R} + \gamma V(A; \gamma) 117 | \end{align*} 118 | so 119 | \[ 120 | V(A; \gamma) = \frac12 ( 1 - \gamma ) - \gamma^2 V(A; \gamma) 121 | \] 122 | and 123 | \begin{align*} 124 | V(A; \gamma) &= \frac12 \frac{1 - \gamma}{1 - \gamma^2} \\ 125 | &= \frac{1}{2(1 + \gamma)}. 126 | \end{align*} 127 | Finally, $V(A) = \lim_{\gamma \to 1} V(A; \gamma) = \frac14$ and $V(B) = - \frac14$. 128 | 129 | \subsection{Exercise 10.8} 130 | \subsubsection*{Q} 131 | The pseudocode in the box on page 251 updates $\bar{R}_{t+1}$ using $\delta_t$ as an error rather than simply $R_{t+1} - \bar{R}_{t+1}$. Both errors work, but using $\delta_t$ is better. To see why, consider the ring MRP of three states from Exercise 10.6. The estimate of the average 132 | reward should tend towards its true value of $\frac13$. Suppose it was already there and was held stuck there. What would the sequence of $R_{t+1} - \bar{R}_{t+1}$ errors be? What would the sequence of $\delta_t$ errors be (using (10.10))? Which error sequence would produce a more stable estimate of the average reward if the estimate were allowed to change in response to the errors? Why? 133 | 134 | \subsubsection*{A} 135 | $\bar{R} = \frac13$ fixed. \\ 136 | 137 | The sequence of errors from $R_t - \bar{R}_t$ starting in A would be 138 | \[ 139 | -\frac13, -\frac13, \frac23, -\frac13, -\frac13, \frac23, \dots 140 | \] 141 | while the sequence of TD errors starting in A (taking differential values from Exercise 10.6) would be 142 | \[ 143 | 0, 0, 0, 0, 0, 0, \dots 144 | \] 145 | which is clearly of much lower variance and would therefore give more stable updates. Once $\bar{R}$ gets to the correct value it never leaves. 146 | 147 | \subsection{Exercise 10.9} 148 | \subsubsection*{Q} 149 | In the differential semi-gradient $n$-step Sarsa algorithm, the step-size parameter on the average reward, $\beta$, needs to be quite small so that $\bar{R}$ becomes a good long-term estimate of the average reward. Unfortunately, $\bar{R}$ will then be biased by its initial value for many steps, which may make learning inefficient. Alternatively, one could use a sample average of the observed rewards for $\bar{R}$ . That would initially adapt rapidly but in the long run would also adapt slowly. As the policy slowly changed, $\bar{R}$ would also change; the potential for such long-term non-stationarity makes sample-average methods ill-suited. In fact, the step-size parameter on the average reward is a perfect place to use the unbiased constant-step-size trick from Exercise 2.7. Describe the specific changes needed to the boxed algorithm for differential semi-gradient $n$-step Sarsa to use this trick. 150 | 151 | \subsubsection*{A} 152 | We define a parameter $\beta$ and seed a sequence $u_n$ with $u_0=0$. 
The under the if statement where $\tau \geq 0$ we place the following: 153 | \begin{align*} 154 | u &\leftarrow u + \beta (1 - u) \\ 155 | \bar{R} & \leftarrow \bar{R} + \frac{\beta}{\mu} (R - \bar{R}) 156 | \end{align*} 157 | 158 | 159 | 160 | -------------------------------------------------------------------------------- /exercises/chapters/chapter11/chapter11.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brynhayder/reinforcement_learning_an_introduction/d8b1945f61a8397b684f8d8d800ed0d9308a9a35/exercises/chapters/chapter11/chapter11.pdf -------------------------------------------------------------------------------- /exercises/chapters/chapter11/chapter11.tex: -------------------------------------------------------------------------------- 1 | \input{/Users/Bryn/Programming/remote/ReinforcementLearningAnIntroduction/header} 2 | 3 | \begin{document} 4 | \include{chapter11_content} 5 | \end{document} -------------------------------------------------------------------------------- /exercises/chapters/chapter11/chapter11_content.tex: -------------------------------------------------------------------------------- 1 | \section{*Off-policy Methods with Approximation} 2 | 3 | \subsection{Exercise 11.1} 4 | \subsubsection*{Q} 5 | Convert the equation of $n$-step off-policy TD (7.9) to semi-gradient form. Give accompanying definitions of the return for both the episodic and continuing cases. 6 | \subsubsection*{A} 7 | Tabular case is 8 | \[ 9 | V_{t+n}(S_t) = V_{t+n-1} + \alpha \rho_{t:t+n-1} [G_{t:t+n} - V_{t+n-1}(S_t)]. 10 | \] 11 | The semi-gradient weight update is 12 | \[ 13 | \vec{w}_{t+n} = \vec{w}_{t+n-1} + \alpha \rho_{t:t+n-1}[G_{t:t+n} - \hat{v}(S_t, \vec{w}_{t+n-1})] \grad_{\vec{w}}\hat{v}(S_t, \vec{w}_{t+n-1}), 14 | \] 15 | noting the occurrence of the $n$step TD Error 16 | \[ 17 | \delta_t^n = G_{t:t+n} - \hat{v}(S_t, \vec{w}_{t+n-1}). 18 | \] 19 | We define the returns in the two cases 20 | \begin{description} 21 | \item[episodic] $G_{t:t+n} = \sum_{i=t}^{t+n-1}\gamma_{i-t}R_{i+1} + \gamma^n \hat{v}(S_{t+n}, \vec{w}_{t+n-1})$ 22 | \item[continuing] $G_{t:t+n} = \sum_{i=t}^{t+n-1}(R_{i+1} - \bar{R}_i) + \hat{v}(S_{t+n}, \vec{w}_{t+n-1})$ 23 | \end{description} 24 | where in each case $G_{t:h} = G_t$ if $h \geq T$. 25 | 26 | 27 | \subsection{*Exercise 11.2} 28 | \subsubsection*{Q} 29 | Convert the equations of $n$-step Q$(\sigma)$ (7.11 and 7.17) to semi-gradient form. Give definitions that cover both the episodic and continuing cases. 30 | 31 | \subsubsection*{A} 32 | The update is 33 | \[ 34 | \vec{w}_{t+n} = \vec{w}_{t+n-1} + \alpha [G_{t:t+n} - \hat{q}(S_t, A_t, \vec{w}_{t+n-1})] \grad_{\vec{w}} \hat{q}(S_t, A_t, \vec{w}_{t+n-1} 35 | \] 36 | with the following definitions of returns targets\\ 37 | 38 | {\bfseries Episodic}\\ 39 | \[ 40 | G_{t:h} = R_{t+1} + \gamma \left[\sigma_{t+1}\rho_{t+1} + (1-\sigma_{t+1})\pi(A_{t+1} \vert{} S_{t+1})\right]\left[G_{t:h} - \hat{q}(S_t, A_t, \vec{w}_{h-1}) \right] + \gamma \bar{V}_{h-1}(S_{t+1}) 41 | \]\\ 42 | 43 | {\bfseries Continuing}\\ 44 | \[ 45 | G_{t:h} = R_{t+1} - \bar{R}_t + \left[\sigma_{t+1}\rho_{t+1} + (1-\sigma_{t+1})\pi(A_{t+1} \vert{} S_{t+1})\right]\left[G_{t:h} - \hat{q}(S_t, A_t, \vec{w}_{h-1}) \right] + \bar{V}_{h-1}(S_{t+1}) 46 | \]\\ 47 | 48 | where 49 | \[ 50 | \bar{V}_i(s) = \sum_a \pi(a \vert{} s) \hat{q}(s, \vec{w}_i) 51 | \] 52 | and $G_{h:h} = \hat{q}_(S_h, A_h, \vec{w}_{h-1})$ if $h y$. 
38 | 39 | \subsection{Exercise 2.5 (programming)} 40 | \subsubsection*{Q} 41 | Design and conduct an experiment to demonstrate the difficulties that sample-average methods have for non-stationary problems. Use a modified version of the 10-armed testbed in which all the $q_*(a)$ start out equal and then take independent random walks (say by adding a normally distributed increment with mean zero and standard deviation 0.01 to all the $q_*(a)$ on each step). Prepare plots like Figure 2.2 for an action-value method using sample averages, incrementally computed, and another action-value method using a constant step-size parameter, $\alpha$ = 0.1. Use $\varepsilon$ = 0.1 and longer runs, say of 10,000 steps. 42 | 43 | \subsubsection*{A} 44 | \ProgrammingExercise 45 | 46 | \includegraphics[width=\textwidth]{\ProjectDir/data/exercise_output/ex_2_5/learning_curve.png} 47 | 48 | 49 | \subsection{Exercise 2.6: Mysterious Spikes} 50 | \subsubsection*{Q} 51 | The results shown in Figure 2.3 should be quite reliable because they are averages over 2000 individual, randomly chosen 10-armed bandit tasks. Why, then, are there oscillations and spikes in the early part of the curve for the optimistic method? In other words, what might make this method perform particularly better or worse, on average, on particular early steps? 52 | 53 | \subsubsection*{A} 54 | At some point after step 10, the agent will find the optimal value. It will then choose this value greedily. The small step-size parameter (small relative to the initialisation value of 5) means that the estimate of the optimal value will converge slowly towards its true value.\\ 55 | 56 | It is likely that this true value is less than 5. This means that, due to the small step size, one of the sub-optimal actions will still have a value close to 5. Thus, at some point, the agent begins to act sub-optimally again. 57 | 58 | \subsection{Exercise 2.7: Unbiased Constant-Step-Trick} 59 | \subsubsection*{Q} 60 | In most of this chapter we have used sample averages to estimate action values because sample averages do not produce the initial bias that constant step sizes do (see the analysis in (2.6)). However, sample averages are not a completely satisfactory solution because they may perform poorly on non-stationary problems. Is it possible to avoid the bias of constant step sizes while retaining their advantages on non-stationary problems? One way is to use a step size of 61 | \begin{equation} 62 | \beta_t \doteq \alpha / \bar{o}_t, 63 | \end{equation} 64 | where $\alpha > 0$ is a conventional constant step size and $\bar{o}_t$ is a trace of one that starts at 0: 65 | \begin{equation} 66 | \bar{o}_{t+1} = \bar{o}_t + \alpha (1 - \bar{o}_t) 67 | \end{equation} 68 | for $t \geq 1$ and with $\bar{o}_1 \doteq \alpha$.\\ 69 | 70 | Carry out an analysis like that in (2.6) to show that $\beta_t$ is an exponential recency-weighted average \emph{without initial bias}. 71 | 72 | \subsubsection*{A} 73 | Consider the answer to \hyperref[ex:2.4]{Exercise 2.4}. There is no dependence of $Q_k$ on $Q_1$ for $k > 1$ since $\beta_1 = 1$. Now it remains to show that the weights in the remaining sum decrease as we look further into the past. That is 74 | \begin{equation} 75 | w_i = \beta_i \prod_{k = i + 1}^{n} (1 - \beta_k) 76 | \end{equation} 77 | increases with $i$ for fixed n. 
For this, observe that 78 | \begin{equation} 79 | \frac{w_{i+1}}{w_i} = \frac{\beta_{i+1}}{\beta_i(1 - \beta_{i + 1})} = \frac{1}{1 - \alpha} > 1 80 | \end{equation} 81 | where we have assumed $\alpha < 1$. If $\alpha = 1$ then $\beta_t = 1 \,\, \forall \, t$. 82 | 83 | \subsection{Exercise 2.8: UCB Spikes} 84 | \subsubsection*{Q} 85 | In Figure 2.4 the UCB algorithm shows a distinct spike in performance on the 11th step. Why is this? Note that for your answer to be fully satisfactory it must explain both why the reward increases on the 11th step and why it decreases on the subsequent steps. Hint: if $c = 1$, then the spike is less prominent. 86 | 87 | \subsubsection*{A} 88 | In the first 10 steps the agent cycles through all of the actions because when $N_t(a) = 0$ then $a$ is considered maximal. On the 11th step the agent will most often then choose greedily. The agent will continue to choose greedily until $\mathrm{ln}(t)$ overtakes $N_t(a)$ for one of the other actions, in which case the agent begins to explore again hence reducing rewards.\\ 89 | 90 | Note that, in the long run, $N_t = O(t)$ and $\mathrm{ln}(t) / t \to 1$. So this agent is `asymptotically greedy'. 91 | 92 | 93 | \subsection{Exercise 2.9} 94 | \subsubsection*{Q} 95 | Show that in the case of two actions, the soft-max distribution is the same as that given by the logistic, or sigmoid, function often used in statistics and artificial neural networks. 96 | 97 | \subsubsection*{A} 98 | Let the two actions be denoted by 0 and 1. Now 99 | \begin{equation} 100 | \P{}(A_t = 1) = \frac{e^{H_t(1)}}{e^{H_t(1)} + e^{H_t(0)}} = \frac{1}{1 + e^{-x}}, 101 | \end{equation} 102 | where $x = H_t(1) - H_t(0)$ is the relative preference of 1 over 0. 103 | 104 | \subsection{Exercise 2.10} 105 | \subsubsection*{Q} 106 | Suppose you face a 2-armed bandit task whose true action values change randomly from time step to time step. Specifically, suppose that, for any time step, the true values of actions 1 and 2 are respectively 0.1 and 0.2 with probability 0.5 (case A), and 0.9 and 0.8 with probability 0.5 (case B). If you are not able to tell which case you face at any step, what is the best expectation of success you can achieve and how should you behave to achieve it? Now suppose that on each step you are told whether you are facing case A or case B (although you still don’t know the true action values). This is an associative search task. What is the best expectation of success you can achieve in this task, and how should you behave to achieve it? 107 | 108 | \subsubsection*{A} 109 | I assume the rewards are stationary.\\ 110 | 111 | One should choose the action with the highest expected reward. In the first case, both action 1 and 2 have expected value of 0.5, so it doesn't matter which you pick.\\ 112 | 113 | In the second case one should run a normal bandit method separately on each colour. The expected reward from identifying the optimal actions in each case is 0.55. 114 | 115 | \subsection{Exercise 2.11 (programming)} 116 | \subsubsection*{Q} 117 | Make a figure analogous to Figure 2.6 for the non-stationary case outlined in Exercise 2.5. Include the constant-step-size $\varepsilon$-greedy algorithm with $\alpha=0.1$. Use runs of 200,000 steps and, as a performance measure for each algorithm and parameter setting, use the average reward over the last 100,000 steps. 
118 | 119 | \subsubsection*{A} 120 | \ProgrammingExercise 121 | 122 | \includegraphics[width=\textwidth]{\ProjectDir/data/exercise_output/ex_2_11/action_values.png} 123 | 124 | \includegraphics[width=\textwidth]{\ProjectDir/data/exercise_output/ex_2_11/parameter_study.png} 125 | 126 | -------------------------------------------------------------------------------- /exercises/chapters/chapter4/chapter4_content.tex: -------------------------------------------------------------------------------- 1 | \section{Dynamic Programming} 2 | 3 | \subsection{Exercise 4.1} 4 | \subsubsection*{Q} 5 | In Example 4.1, if $\pi$ is the equiprobable random policy, what is $q_\pi(11, \mathtt{down})$? What is $q_\pi(7, \mathtt{down})$? 6 | 7 | \subsubsection*{A} 8 | $q_\pi(11, \mathtt{down}) = -1$ since goes to terminal state. $q_\pi(7, \mathtt{down}) = -15$. 9 | 10 | \subsection{Exercise 4.2} 11 | \subsubsection*{Q} 12 | In Example 4.1, suppose a new state $15$ is added to the gridworld just below state $13$, and its actions, \texttt{left}, \texttt{up}, \texttt{right}, and \texttt{down}, take the agent to states $12$, $13$, $14$, and $15$, respectively. Assume that the transitions from the original states are unchanged. What, then, is $v_\pi(15)$ for the equiprobable random policy? Now suppose the dynamics of state $13$ are also changed, such that action down from state $13$ takes the agent to the new state $15$. What is $v_\pi(15)$ for the equiprobable random policy in this case? 13 | 14 | \subsubsection*{A} 15 | $v_\pi(15) = -20$ if dynamics unchanged. If dynamics changed then apparently the state value is the same, but you would need to verify Bellman equations for all states for this. 16 | 17 | \subsection{Exercise 4.3} 18 | \subsubsection*{Q} 19 | What are the equations analogous to (4.3), (4.4), and (4.5) for the action-value function $q_\pi$ and its successive approximations by a sequence of functions $q_0, q_1, q_2, \dots$? 20 | 21 | \subsubsection*{A} 22 | \begin{equation} 23 | q_{k+1}(s, a) = \sum_{s', r} p(s', r | s, a)\left[r + \gamma \sum_{a'} \pi(a'|s)q_k(s', a')\right] 24 | \end{equation} 25 | 26 | \subsection{Exercise 4.4} 27 | \subsubsection*{Q} 28 | The policy iteration algorithm on the previous page has a subtle bug in that it may never terminate if the policy continually switches between two or more policies that are equally good. This is ok for pedagogy, but not for actual use. Modify the pseudocode so that convergence is guaranteed. 29 | 30 | \subsubsection*{A} 31 | One problem is that the $\argmax_a$ has ties broken arbitrarily, this means that the same value function can give rise to different policies.\\ 32 | 33 | The way to solve this is to change the algorithm to take the whole set of maximal actions on each step and see if this set is stable and see if the policy is stable with respect to choosing actions from this set. 34 | 35 | 36 | \subsection{Exercise 4.5} 37 | \subsubsection*{Q} 38 | How would policy iteration be defined for action values? Give a complete algorithm for computing $q_*$, analogous to that on page 80 for computing $q_*$. Please pay special attention to this exercise, because the ideas involved will be used throughout the rest of the book. 
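A minimal sketch of the kind of algorithm the answer below describes (policy iteration run directly on action values; the tabular model interface \texttt{p[s][a]}, a list of \texttt{(prob, next\_state, reward, done)} tuples, is a hypothetical stand-in and not part of the repo):

\begin{verbatim}
import numpy as np

def q_policy_iteration(p, n_states, n_actions, gamma=0.9, theta=1e-8):
    """Policy iteration using q instead of v.
    p[s][a] is assumed to be a list of (prob, next_state, reward, done)."""
    Q = np.zeros((n_states, n_actions))
    policy = np.zeros(n_states, dtype=int)
    while True:
        # policy evaluation for q_pi
        while True:
            delta = 0.0
            for s in range(n_states):
                for a in range(n_actions):
                    q_new = sum(prob * (r + gamma * (0.0 if done else Q[s2, policy[s2]]))
                                for prob, s2, r, done in p[s][a])
                    delta = max(delta, abs(q_new - Q[s, a]))
                    Q[s, a] = q_new
            if delta < theta:
                break
        # greedy policy improvement, argmax ties broken consistently (lowest index)
        new_policy = Q.argmax(axis=1)
        if np.array_equal(new_policy, policy):
            return Q, policy
        policy = new_policy
\end{verbatim}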
39 | 40 | \subsubsection*{A} 41 | We know that 42 | \begin{equation} 43 | v_\pi(s) = \sum_{a \in \mathcal{A}(s)} \pi(a|s)q_\pi(s, a) 44 | \end{equation} 45 | so we know that 46 | \begin{equation} 47 | q_\pi(s, \pi'(s)) \geq \sum_{a \in \mathcal{A}(s)} \pi(a|s)q_\pi(s, a) 48 | \end{equation} 49 | if $\pi'$ is greedy with respect to $\pi$. So we know the algorithm still works for action values.\\ 50 | 51 | All there is now is to substitute the update for the action-value update and make the policy greedy with respect to the last iteration's action-values. Also need to make sure that the $\argmax_a$ is done consistently. 52 | 53 | \subsection{Exercise 4.6} 54 | \subsubsection*{Q} 55 | Suppose you are restricted to considering only policies that are $\varepsilon$-soft, meaning that the probability of selecting each action in each state, $s$, is at least $\varepsilon / |\mathcal{A}(s)|$. Describe qualitatively the changes that would be required in each of the steps 3, 2, and 1, in that order, of the policy iteration algorithm for $v_\pi$ (page 80). 56 | 57 | \subsubsection*{A} 58 | \begin{enumerate} 59 | \item No change (but need policy to be able to be stochastic of course) 60 | \item Need to re-write the Bellman update $v(s) \longleftarrow \sum_{a \in \mathcal{A}(s)} \pi(a|s)\sum_{s', r}p(s', r|s, a)\left[ r + \gamma v(s') \right]$ 61 | \item Construct a greedy policy that puts weight on the greedy actions but is $\varepsilon$-soft. Be careful with the consistency of the $\argmax$. 62 | \end{enumerate} 63 | 64 | \subsection{Exercise 4.7 (programming): Jack's Car Rental} 65 | 66 | \includegraphics[width=\textwidth]{\ProjectDir/data/exercise_questions/jacks_car_rental_example.png} 67 | 68 | First we reproduce the original results. 69 | 70 | \includegraphics[width=\textwidth]{\ProjectDir/data/exercise_output/ex_4_7/jacks_car_rental/jacks_car_rental.png} 71 | 72 | \subsubsection*{Q} 73 | Write a program for policy iteration and re-solve Jack’s car rental problem with the following changes. One of Jack’s employees at the first location rides a bus home each night and lives near the second location. She is happy to shuttle one car to the second location for free. Each additional car still costs \$$2$, as do all cars moved in the other direction. In addition, Jack has limited parking space at each location. If more than $10$ cars are kept overnight at a location (after any moving of cars), then an additional cost of \$$4$ must be incurred to use a second parking lot (independent of how many cars are kept there). These sorts of nonlinearities and arbitrary dynamics often occur in real problems and cannot easily be handled by optimisation methods other than dynamic programming. To check your program, first replicate the results given for the original problem. If your computer is too slow for the full problem, cut all the numbers of cars in half. 74 | 75 | \subsubsection*{A} 76 | \ProgrammingExercise\\ 77 | \includegraphics[width=\textwidth]{\ProjectDir/data/exercise_output/ex_4_7/altered_car_rental.png} 78 | 79 | \subsection{Exercise 4.8} 80 | \subsubsection*{Q} 81 | Why does the optimal policy for the gambler’s problem have such a curious form? In particular, for capital of 50 it bets it all on one flip, but for capital of 51 it does not. Why is this a good policy? 82 | 83 | \subsubsection*{A} 84 | Since the coin is biased against us, we want to minimize the number of flips that we take. At 50 we can win with probability 0.4. 
At 51 if we bet small then we can get up to 52, but if we lose then we are still only back to 50 and we can again win with probability 0.4. (There is a whole book on this problem, \emph{How to Gamble If You Must} by Dubins and Savage.) 85 | 86 | 87 | \subsection{Exercise 4.9 (programming): Gambler's Problem} 88 | \subsubsection*{Q} 89 | Implement value iteration for the gambler's problem and solve it for $p_h = 0.25$ and $p_h = 0.55$. In programming, you may find it convenient to introduce two dummy states corresponding to termination with capital of $0$ and $100$, giving them values of $0$ and $1$ respectively. Show your results graphically, as in Figure 4.3. Are your results stable as $\theta \to 0$? 90 | 91 | \subsubsection*{A} 92 | \ProgrammingExercise\\ 93 | 94 | The process was stable as $\theta \to 0$ for $\P{}(\mathtt{win}) < 0.5$. 95 | 96 | \includegraphics[width=\textwidth]{\ProjectDir/data/exercise_output/ex_4_9/values_and_policy_pwin_25.eps} 97 | 98 | \includegraphics[width=\textwidth]{\ProjectDir/data/exercise_output/ex_4_9/values_and_policy_pwin_55.eps} 99 | 100 | \subsection{Exercise 4.10} 101 | \subsubsection*{Q} 102 | What is the analog of the value iteration update (4.10) for action values, $q_{k+1}(s, a)$? 103 | 104 | \subsubsection*{A} 105 | \begin{equation} 106 | q_{k+1}(s, a) = \sum_{s', r} p(s', r| s, a)\left[r + \gamma \max_{a'} q_k(s', a')\right] 107 | \end{equation} 108 | 109 | -------------------------------------------------------------------------------- /exercises/chapters/chapter5/chapter5_content.tex: -------------------------------------------------------------------------------- 1 | \section{Monte-Carlo Methods} 2 | \subsection{Exercise 5.1} 3 | \subsubsection*{Q} 4 | Consider the diagrams on the right in Figure 5.1. Why does the estimated value function jump up for the last two rows in the rear? Why does it drop off for the whole last row on the left? Why are the frontmost values higher in the upper diagrams than in the lower? 5 | 6 | \subsubsection*{A} 7 | \begin{itemize} 8 | \item Policy is to hit unless $S \geq 20$. So you run a risk of going bust if you have 12-19, but you most likely win when you stick on 20 or 21 9 | \item Drops off because dealer has a usable ace 10 | \item Frontmost higher because you're less likely to go bust, but you still might get to 20 or 21 ($\pi$ always hits here). 11 | \end{itemize} 12 | 13 | \subsection{Exercise 5.2} 14 | \subsubsection*{Q} 15 | Suppose every-visit MC was used instead of first-visit MC on the blackjack task. Would you expect the results to be very different? Why or why not? 16 | 17 | \subsubsection*{A} 18 | Results would be the same because this game is memoryless (cards are drawn with replacement); in fact the same state is never revisited within an episode (the player's sum only increases, or the usable ace is lost), so first-visit and every-visit MC make exactly the same updates. 19 | 20 | \subsection{Exercise 5.3} 21 | \subsubsection*{Q} 22 | What is the backup diagram for Monte Carlo estimation of $q_\pi$? 23 | 24 | \subsubsection*{A} 25 | The same as the one shown in the book for state values, only we have state-action pairs instead of states. 26 | 27 | \subsection{Exercise 5.4} 28 | \subsubsection*{Q} 29 | What is the equation analogous to (5.6) for \emph{action} values $Q(s, a)$ instead of state values $V(s)$, again given returns generated using $b$? 30 | \subsubsection*{A} 31 | We condition on taking action $a$ in state $s$. 32 | \[ 33 | q_\pi(s, a) = \E_b[\rho_{t+1:T-1} G_t | S_t = s, A_t = a] 34 | \] 35 | with returns generated from $b$.
We estimate this quantity by 36 | \[ 37 | Q(s, a) = \frac{\sum_{t \in \mathcal{T}(s, a)} \rho_{t+1:T-1} G_t}{\sum_{t \in \mathcal{T}(s, a)} \rho_{t+1:T-1}} 38 | \] 39 | where $\mathcal{T}(s, a)$ now contains the time steps on which the state-action pair $(s, a)$ was visited. 40 | 41 | \subsection{Exercise 5.5} 42 | \subsubsection*{Q} 43 | In learning curves such as those shown in Figure 5.3 error generally decreases with training, as indeed happened for the ordinary importance-sampling method. But for the weighted importance-sampling method error first increased and then decreased. Why do you think this happened? 44 | 45 | \subsubsection*{A} 46 | When there are fewer episodes the importance sampling ratios will be zero with higher probability, since the behaviour policy (being random) will often stick on sums smaller than 20 -- actions the target policy would never take. Zero happens to be close to $v_\pi(s)$.\\ 47 | 48 | This effect lessens as we get more diversity in the episode trajectories.\\ 49 | 50 | Then after this the error reduces because the variance in the estimator reduces. 51 | 52 | \subsection{Exercise 5.6} 53 | \subsubsection*{Q} 54 | The results with Example 5.5 and shown in Figure 5.4 used a first-visit MC method. Suppose that instead an every-visit MC method was used on the same problem. Would the variance of the estimator still be infinite? Why or why not? 55 | \subsubsection*{A} 56 | Yes, all terms in the sum are $\geq 0$ and there would just be more of them. 57 | 58 | \subsection{Exercise 5.7} 59 | \subsubsection*{Q} 60 | Modify the algorithm for first-visit MC policy evaluation (Section 5.1) to use the incremental implementation for sample averages described in Section 2.4. 61 | 62 | \subsubsection*{A} 63 | The algorithm is the same apart from: 64 | \begin{itemize} 65 | \item Initialise $V(s) = 0$ and a counter $N(s) = 0 \quad \forall s \in S$ 66 | \item Don't need \emph{Returns(s)} lists. 67 | \item Remove the last two lines and put in $N(S_t) \leftarrow N(S_t) + 1$ followed by \[ V(S_t) \leftarrow V(S_t) + \frac{1}{N(S_t)}[ G_t - V(S_t) ] \] 68 | \end{itemize} 69 | 70 | \subsection{Exercise 5.8} 71 | \subsubsection*{Q} 72 | Derive the weighted-average update rule (5.8) from (5.7). Follow the pattern of the derivation of the unweighted rule (2.3). 73 | 74 | \subsubsection*{A} 75 | Have $C_0 = 0$, $C_n = \sum_{k = 1}^n W_k$ and 76 | \[ 77 | V_{n+1} = \frac{\sum_{k = 1}^n W_kG_k}{C_n}. 78 | \] 79 | Therefore, 80 | \begin{align} 81 | C_n V_{n+1} &= \sum_{k=1}^{n-1}W_kG_k + W_nG_n\\ 82 | &= C_{n-1}V_n + W_nG_n \\ 83 | &= (C_n - W_n)V_n + W_nG_n. 84 | \end{align} 85 | Finally 86 | \[ 87 | V_{n+1} = V_n + \frac{W_n}{C_n}[G_n - V_n]. 88 | \] 89 | 90 | \subsection{Exercise 5.9} 91 | \subsubsection*{Q} 92 | In the boxed algorithm for off-policy MC control, you may have been expecting the W update to have involved the importance-sampling ratio $\pi(A_t|S_t)$, but instead it involves $1/b(A_t|S_t)$. Why is this nevertheless correct? 93 | 94 | \subsubsection*{A} 95 | $\pi$ is greedy, so 96 | \[ 97 | \pi(a | s) = \mathds{1}\{a = \argmax_{a'} Q(s, a')\}. 98 | \] 99 | The inner loop is exited whenever $A_t \neq \argmax_{a'} Q(S_t, a')$, so every update that is actually made has $\pi(A_t|S_t) = 1$ and the ratio $\pi(A_t|S_t)/b(A_t|S_t)$ reduces to $1/b(A_t|S_t)$. 100 | \subsection{Exercise 5.10 (programming): Racetrack} 101 | \subsubsection*{Q} 102 | Consider driving a race car around a turn like those shown in Figure 5.5. You want to go as fast as possible, but not so fast as to run off the track. In our simplified racetrack, the car is at one of a discrete set of grid positions, the cells in the diagram. The velocity is also discrete, a number of grid cells moved horizontally and vertically per time step. The actions are increments to the velocity components.
Each may be changed by +1, -1, or 0 in each step, for a total of nine ($3 \times 3$) actions. Both velocity components are restricted to be nonnegative and less than 5, and they cannot both be zero except at the starting line. Each episode begins in one of the randomly selected start states with both velocity components zero and ends when the car crosses the finish line. The rewards are -1 for each step until the car crosses the finish line. If the car hits the track boundary, it is moved back to a random position on the starting line, both velocity components are reduced to zero, and the episode continues. Before updating the car’s location at each time step, check to see if the projected path of the car intersects the track boundary. If it intersects the finish line, the episode ends; if it intersects anywhere else, the car is considered to have hit the track boundary and is sent back to the starting line. To make the task more challenging, with probability 0.1 at each time step the velocity increments are both zero, independently of the intended increments. Apply a Monte Carlo control method to this task to compute the optimal policy from each starting state. Exhibit several trajectories following the optimal policy (but turn the noise off for these trajectories). 103 | \subsubsection*{A} 104 | \ProgrammingExercise\\ 105 | \includegraphics[width=\textwidth]{\ProjectDir/data/exercise_output/ex_5_10/track_1_trajectories.eps} 106 | 107 | \includegraphics[width=\textwidth]{\ProjectDir/data/exercise_output/ex_5_10/track_2_sample_trajectory.eps} 108 | 109 | \subsection{*Exercise 5.11} 110 | \subsubsection*{Q} 111 | Modify the algorithm for off-policy Monte Carlo control (page 110) to use the idea of the truncated weighted-average estimator (5.10). Note that you will first need to convert this equation to action values. 112 | 113 | \subsubsection*{A} 114 | ... 
-------------------------------------------------------------------------------- /exercises/chapters/chapter7/chapter7.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brynhayder/reinforcement_learning_an_introduction/d8b1945f61a8397b684f8d8d800ed0d9308a9a35/exercises/chapters/chapter7/chapter7.pdf -------------------------------------------------------------------------------- /exercises/chapters/chapter7/chapter7.tex: -------------------------------------------------------------------------------- 1 | \input{/Users/Bryn/Programming/remote/ReinforcementLearningAnIntroduction/header} 2 | 3 | \begin{document} 4 | \include{chapter7_content} 5 | \end{document} -------------------------------------------------------------------------------- /exercises/chapters/chapter8/chapter8.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brynhayder/reinforcement_learning_an_introduction/d8b1945f61a8397b684f8d8d800ed0d9308a9a35/exercises/chapters/chapter8/chapter8.pdf -------------------------------------------------------------------------------- /exercises/chapters/chapter8/chapter8.tex: -------------------------------------------------------------------------------- 1 | \input{/Users/Bryn/Programming/remote/ReinforcementLearningAnIntroduction/header} 2 | 3 | \begin{document} 4 | \include{chapter8_content} 5 | \end{document} -------------------------------------------------------------------------------- /exercises/chapters/chapter8/chapter8_content.tex: -------------------------------------------------------------------------------- 1 | \section{Planning and Learning with Tabular Methods} 2 | 3 | \subsection{Exercise 8.1} 4 | \subsubsection*{Q} 5 | The non-planning method looks particularly poor in Figure 8.3 because it is a one-step method; a method using multi-step bootstrapping would do better. Do you think one of the multi-step bootstrapping methods from Chapter 7 could do as well as the Dyna method? Explain why or why not. 6 | \subsubsection*{A} 7 | Dyna updates using all past experience so quickly synthesises this into an optimal trajectory. $n$-step bootstrapping might be slower because it only states visited in the last $n$ steps. 8 | 9 | \subsection{Exercise 8.2} 10 | \subsubsection*{Q} 11 | Why did the Dyna agent with exploration bonus, Dyna-Q+, perform better in the first phase as well as in the second phase of the blocking and shortcut experiments? 12 | \subsubsection*{A} 13 | Increased exploration means Dyna-Q+ finds the optimal policy quicker than Dyna-Q. Dyna-Q may find a trajectory that works but is suboptimal and then have to wait a long time for it to take enough exploratory actions to find an optimal policy. 14 | 15 | \subsection{Exercise 8.3} 16 | \subsubsection*{Q} 17 | Careful inspection of Figure 8.5 reveals that the difference between Dyna-Q+ and Dyna-Q narrowed slightly over the first part of the experiment. What is the reason for this? 18 | 19 | \subsubsection*{A} 20 | Dyna-Q+ will take suboptimal actions in order to explore (when $\tau$ gets large). Dyna-Q will not do this so has better asymptotic performance. 21 | 22 | \subsection{Exercise 8.4 (programming)} 23 | \subsubsection*{Q} 24 | The exploration bonus described above actually changes the estimated values of states and actions. Is this necessary? Suppose the bonus $\kappa \sqrt{\tau}$ was used not in updates, but solely in action selection. 
That is, suppose the action selected was always that for which $Q(S_t, a) + \kappa \sqrt{\tau(S_t, a)}$ was maximal. Carry out a gridworld experiment that tests and illustrates the strengths and weaknesses of this alternate approach. 25 | 26 | \subsubsection*{A} 27 | \ProgrammingExercise{}\\ 28 | 29 | The change means that exploration only takes into account the next action, not whole trajectories. In the Dyna-Q+ algorithm, the agent can explore whole new paths through the planning stage.\\ 30 | 31 | This is backed up by the results. The altered Dyna-Q+ learns the new path more slowly because it can't plan new trajectories. 32 | \\ 33 | \includegraphics[width=\textwidth]{\ExerciseOutput/ex_8_4/dyna_q_comparison.png} 34 | 35 | \subsection{Exercise 8.5} 36 | \subsubsection*{Q} 37 | How might the tabular Dyna-Q algorithm shown on page 164 be modified to handle stochastic environments? How might this modification perform poorly on changing environments such as considered in this section? How could the algorithm be modified to handle stochastic environments \emph{and} changing environments? 38 | \subsubsection*{A} 39 | \begin{itemize} 40 | \item You could use the frequencies of observed transitions to estimate the transition probabilities for the model. (These would be the MLE estimates.) 41 | \item Make expected updates when planning. 42 | \item This would present an issue if the environment changed, since the change would only show up gradually in the estimated transition probabilities (which could take a long time). 43 | \item A solution to this could be to use an exploration bonus to encourage the agent to continue to select various states and keep the model up to date. 44 | \item A better solution would be to add some notion of confidence to the model estimates of the transition probabilities. Could model the probabilities like 45 | \[ 46 | p(s, a, s') = \hat{p}(s, a, s')(1 - \sigma(\tau)) + \sigma(\tau)e, 47 | \] 48 | where $\hat{p}$ is the MLE estimate of the probabilities, $e$ is the equiprobable estimate and $\sigma(\tau)$ is a sigmoid of the time since the state-action pair $(s, a)$ was last visited. 49 | \end{itemize} 50 | 51 | \subsection{Exercise 8.6} 52 | \subsubsection*{Q} 53 | The analysis above assumed that all of the \emph{b} possible next states were equally likely to occur. Suppose instead that the distribution was highly skewed, that some of the \emph{b} states were much more likely to occur than most. Would this strengthen or weaken the case for sample updates over expected updates? Support your answer. 54 | \subsubsection*{A} 55 | If the transition probabilities are skewed then the expected updates perform the same while sample updates become accurate on the most probable outcomes very quickly. This strengthens the case for sample updates. 56 | 57 | 58 | \subsection{Exercise 8.7} 59 | \subsubsection*{Q} 60 | Some of the graphs in Figure 8.8 seem to be scalloped in their early portions, particularly the upper graph for $b = 1$ and the uniform distribution. Why do you think this is? What aspects of the data shown support your hypothesis? 61 | \subsubsection*{A} 62 | In the case of the uniform distribution of updates and $b=1$, the start state is visited roughly once every $|\mathcal{S}|$ updates. When this happens, the action values in the neighbourhood of the start state are updated and they undergo a greater change than when states outside this neighbourhood are updated.
Thus, when the policy is evaluated, the value of the start state changes a lot if the start state has been visited recently, and not so much otherwise (since the change comes from values backed up from states far away from the start state).\\ 63 | 64 | In the on-policy case, the start state is visited much more often (on average more than once every 10 updates, since $\P{}(\texttt{terminate}) = 0.1$) so it does not exhibit this behaviour. When $b$ is larger there are more connections between states, so the neighbourhood of the start state is larger, so this feature is also reduced. 65 | 66 | \subsection{Exercise 8.8 (programming)} 67 | \subsubsection*{Q} 68 | Replicate the experiment whose results are shown in the lower part of Figure 8.8, then try the same experiment but with $b = 3$. Discuss the meaning of your results. 69 | 70 | \subsubsection*{A} 71 | \ProgrammingExercise{}\\ 72 | \mbox{}\\ 73 | Charts show averages of 200 runs. We see that in the $b=3$ case the uniformly distributed updates overtake the on-policy updates much more quickly. This is due to the greater complexity of the state space (the number of states on which the value of the start state depends grows roughly like $b^k$ with the look-ahead depth $k$), large portions of which the on-policy updates neglect. \\ 74 | \includegraphics[width=\textwidth]{\ExerciseOutput/ex_8_8/update_distribution_comparison_10000_1.png} 75 | \mbox{}\\ 76 | \includegraphics[width=\textwidth]{\ExerciseOutput/ex_8_8/update_distribution_comparison_10000_3.png} 77 | 78 | -------------------------------------------------------------------------------- /exercises/chapters/chapter9/chapter9.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brynhayder/reinforcement_learning_an_introduction/d8b1945f61a8397b684f8d8d800ed0d9308a9a35/exercises/chapters/chapter9/chapter9.pdf -------------------------------------------------------------------------------- /exercises/chapters/chapter9/chapter9.tex: -------------------------------------------------------------------------------- 1 | \input{/Users/Bryn/Programming/remote/ReinforcementLearningAnIntroduction/header} 2 | 3 | \begin{document} 4 | \include{chapter9_content} 5 | \end{document} -------------------------------------------------------------------------------- /exercises/chapters/chapter9/chapter9_content.tex: -------------------------------------------------------------------------------- 1 | \section{On-policy Prediction with Approximation} 2 | 3 | \subsection{Exercise 9.1} 4 | \subsubsection*{Q} 5 | Show that tabular methods such as presented in Part I of this book are a special case of linear function approximation. What would the feature vectors be? 6 | \subsubsection*{A} 7 | Take one feature per state, $\vec{x}(s) = (\mathds{1}\{s = s_1\}, \dots, \mathds{1}\{s = s_{|\mathcal{S}|}\})^\top$, and write $\hat{V}(s, \vec{w}) = \vec{w}^\top \vec{x}(s) = w_s$. Then $\grad_{\vec{w}} \hat{V}(s, \vec{w}) = \vec{x}(s)$, so each gradient update changes only the component of $\vec{w}$ corresponding to the state visited and we return to tabular TD learning. 8 | 9 | \subsection{Exercise 9.2} 10 | \subsubsection*{Q} 11 | Why does (9.17) define $(n+1)^k$ distinct features for dimension $k$? 12 | \subsubsection*{A} 13 | Each of the $k$ terms can independently have one of $n+1$ exponents, hence the total number of features is $(n+1)^k$. 14 | 15 | \subsection{Exercise 9.3} 16 | \subsubsection*{Q} 17 | What $n$ and $c_{i,j}$ produce the feature vectors $\vec{x}(s) = (1, s_1, s_2, s_1s_2, s_1^2, s_2^2, s_1s_2^2, s_1^2 s_2, s_1^2 s_2^2)$?
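As a quick cross-check of the answer that follows, a tiny sketch that expands a two-dimensional state into polynomial features given an exponent matrix $C$ (a stand-alone illustration, not code from the repo):

\begin{verbatim}
import numpy as np

def poly_features(s, C):
    """x_i(s) = prod_j s_j ** C[i, j], one feature per row of C."""
    s = np.asarray(s, dtype=float)
    return np.prod(s ** C, axis=1)

C = np.array([[0, 0], [1, 0], [0, 1], [1, 1],
              [2, 0], [0, 2], [1, 2], [2, 1], [2, 2]])
print(poly_features([3.0, 2.0], C))   # 1, s1, s2, s1*s2, s1^2, s2^2, ...
\end{verbatim}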
18 | \subsubsection*{A} 19 | $n=2$ and $c_i, j = C_{ij}$ where 20 | \[ 21 | C = 22 | \begin{pmatrix} 23 | 0 & 0 \\ 24 | 1 & 0 \\ 25 | 0 & 1 \\ 26 | 1 & 1 \\ 27 | 2 & 0 \\ 28 | 0 & 2 \\ 29 | 1 & 2 \\ 30 | 2 & 1 \\ 31 | 2 & 2 32 | \end{pmatrix} 33 | \] 34 | 35 | \subsection{Exercise 9.4} 36 | \subsubsection*{Q} 37 | Suppose we believe that one of two state dimensions is more likely to have an effect on the value function than is the other, that generalization should be primarily across this dimension rather than along it. What kind of tilings could be used to take advantage of this prior knowledge? 38 | \subsubsection*{A} 39 | Tiles that are thin along the dimension of interest and long across it. Rectangles, for instance. 40 | 41 | \subsection{Exercise 9.5} 42 | \subsubsection*{Q} 43 | Suppose you are using tile coding to transform a seven-dimensional continuous state space into binary feature vectors to estimate a state value function $\hat{v}(s, \vec{w}) \approx v_\pi(s)$. You believe that the dimensions do not interact strongly, so you decide to use eight tilings of each dimension separately (stripe tilings), for $7 \times 8 = 56$ tilings. In addition, in case there are some pairwise interactions between the dimensions, you also take all $\binom{7}{2} = 21$ pairs of dimensions and tile each pair conjunctively with rectangular tiles. You make two tilings for each pair of dimensions, making a grand total of $21 \times 2 + 56 = 98$ tilings. Given these feature vectors, you suspect that you still have to average out some noise, so you decide that you want learning to be gradual, taking about 10 presentations with the same feature vector before learning nears its asymptote. What step-size parameter $\alpha$ should you use? Why? 44 | 45 | \subsubsection*{A} 46 | Each tiling is a partition, so each tiling has exactly one tile activated per state. This means that in our case the number of features is 98. We consider each of these equally likely because we are uninformed. We therefore take 47 | \[ 48 | \alpha = \frac{1}{10 \times 98} = \frac{1}{980}. 49 | \] 50 | So that on average we see each feature 10 times before asymptote. [Note that this assumes a constant target.] 51 | -------------------------------------------------------------------------------- /exercises/exercises.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brynhayder/reinforcement_learning_an_introduction/d8b1945f61a8397b684f8d8d800ed0d9308a9a35/exercises/exercises.pdf -------------------------------------------------------------------------------- /exercises/exercises.tex: -------------------------------------------------------------------------------- 1 | \input{/Users/Bryn/Programming/remote/ReinforcementLearningAnIntroduction/header} 2 | 3 | \begin{document} 4 | 5 | \pagenumbering{gobble} 6 | % FRONT MATTER 7 | 8 | {\huge Exercises} \hfill {\huge Reinforcement Learning: An Introduction}\\ 9 | \Rule\\ 10 | \tableofcontents 11 | \mbox{}\\ 12 | \Rule 13 | \mbox{}\\ 14 | \clearpage 15 | 16 | \vfill 17 | \begin{center} 18 | Code for exercises can be found at \href{\RepoAddress{}}{\RepoName{}}\\[2cm] 19 | Note that equation numbers in questions will refer to the original text. 20 | \end{center} 21 | \vfill 22 | \clearpage 23 | 24 | % END OF FRONT MATTER 25 | \pagenumbering{arabic} 26 | 27 | % You can use the exam package to make some of this a bit easier. 
28 | \setcounter{secnumdepth}{1} 29 | 30 | \include{chapters/chapter1/chapter1_content} 31 | \include{chapters/chapter2/chapter2_content} 32 | \include{chapters/chapter3/chapter3_content} 33 | \include{chapters/chapter4/chapter4_content} 34 | \include{chapters/chapter5/chapter5_content} 35 | \include{chapters/chapter6/chapter6_content} 36 | \include{chapters/chapter7/chapter7_content} 37 | \include{chapters/chapter8/chapter8_content} 38 | \include{chapters/chapter9/chapter9_content} 39 | \include{chapters/chapter10/chapter10_content} 40 | \include{chapters/chapter11/chapter11_content} 41 | \include{chapters/chapter12/chapter12_content} 42 | \include{chapters/chapter13/chapter13_content} 43 | \include{chapters/chapter14/chapter14_content} 44 | \include{chapters/chapter15/chapter15_content} 45 | \include{chapters/chapter16/chapter16_content} 46 | \include{chapters/chapter17/chapter17_content} 47 | 48 | \end{document} 49 | -------------------------------------------------------------------------------- /header.tex: -------------------------------------------------------------------------------- 1 | \documentclass[a4paper, oneside, 11pt]{article} 2 | \usepackage[margin=2.5cm]{geometry} 3 | \usepackage{amssymb} 4 | \usepackage{amsmath} 5 | \usepackage{graphicx} 6 | \usepackage{bm} 7 | \usepackage{mathtools} 8 | \usepackage{setspace} 9 | \usepackage{dsfont} 10 | \usepackage{xcolor} 11 | \usepackage{upquote} 12 | \usepackage[utf8]{inputenc} 13 | \usepackage[colorlinks=true, linkcolor=black, urlcolor=blue, citecolor=black]{hyperref} 14 | 15 | \usepackage[shortlabels]{enumitem} 16 | 17 | \setlength{\parindent}{0cm} 18 | 19 | \newcommand\Rule{\noindent\makebox[\textwidth]{\rule{\textwidth}{0.5pt}}} 20 | 21 | \newcommand\argmax{\operatorname*{argmax}} 22 | \newcommand\argmin{\operatorname*{argmin}} 23 | 24 | \renewcommand{\vec}[1]{\boldsymbol{#1}} 25 | 26 | \newcommand{\R}{\mathbb{R}} 27 | \newcommand\Epi{\mathbb{E}_{\pi}} 28 | \renewcommand\P{\mathbb{P}} 29 | \newcommand\E{\mathbb{E}} 30 | \newcommand{\VE}{\overline{\mathrm{VE}}} 31 | 32 | \renewcommand{\S}{\mathcal{S}} 33 | \newcommand{\A}{\mathcal{A}} 34 | \newcommand{\grad}{\nabla} 35 | \renewcommand{\d}{\mathrm{d}} 36 | 37 | 38 | \renewcommand{\familydefault}{\sfdefault} 39 | 40 | 41 | \newcommand\RepoAddress{https://github.com/brynhayder/reinforcement_learning_an_introduction} 42 | \newcommand\RepoName{github.com/brynhayder/reinforcement\_learning\_an\_introduction} 43 | 44 | \newcommand\ProjectDir{/Users/Bryn/Programming/remote/ReinforcementLearningAnIntroduction} 45 | \newcommand\NotesImages{\ProjectDir/data/notes_images} 46 | \newcommand\ExerciseOutput{\ProjectDir/data/exercise_output} 47 | 48 | \newcommand\ProgrammingExercise{This is a programming exercise. For the relevant code please see \href{\RepoAddress{}}{the repo}.} 49 | 50 | -------------------------------------------------------------------------------- /notes/chapters/chapter1/chapter1_content.tex: -------------------------------------------------------------------------------- 1 | \section{Introduction} 2 | Reinforcement learning is about how an agent can learn to interact with its environment. Reinforcement learning uses the formal framework of Markov decision processes to define the interaction between a learning agent and its environment in terms of states, actions, and rewards. 
3 | 4 | \setcounter{subsection}{2} 5 | \subsection{Elements of Reinforcement Learning} 6 | \begin{description} 7 | \item[Policy] defines the way that an agent acts, it is a mapping from perceived states of the world to actions. It may be stochastic. 8 | \item[Reward] defines the goal of the problem. A number given to the agent as a (possibly stochastic) function of the state of the environment and the action taken. 9 | \item[Value function] specifies what is good in the long run, essentially to maximise the expected reward. The central role of value estimation is arguably the most important thing that has been learned about reinforcement learning over the last six decades. 10 | \item[Model] mimics the environment to facilitate planning. Not all reinforcement learning algorithms have a model (if they don't then they can't plan, i.e. must use trial and error, and are called model free). 11 | \end{description} 12 | -------------------------------------------------------------------------------- /notes/chapters/chapter10/chapter10.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brynhayder/reinforcement_learning_an_introduction/d8b1945f61a8397b684f8d8d800ed0d9308a9a35/notes/chapters/chapter10/chapter10.pdf -------------------------------------------------------------------------------- /notes/chapters/chapter10/chapter10.tex: -------------------------------------------------------------------------------- 1 | \input{../../../header} 2 | 3 | \begin{document} 4 | \include{chapter10_content} 5 | \end{document} -------------------------------------------------------------------------------- /notes/chapters/chapter10/chapter10_content.tex: -------------------------------------------------------------------------------- 1 | \section{On-policy Control with Approximation} 2 | We consider attempts to solve the control problem using parametrised function approximation to estimate action-values. We consider only the on-policy case for now. 3 | 4 | \subsection{Episodic Semi-gradient Control} 5 | Extension of the semi-gradient update rules to action-values is straightforward 6 | \begin{equation} 7 | \vec{w}_{t+1} = \vec{w}_t + \alpha \left[ U_t - \hat{q}(S_t, A_t, \vec{w}_t) \right] \grad_{\vec{w}_t} \hat{q}(S_t, A_t, \vec{w}_t) 8 | \end{equation} 9 | where $U_t$ is the update target at time $t$. For example, one-step Sarsa has the update target is 10 | \[ 11 | U_t = R_{t+1} + \gamma \hat{q}(S_{t+1}, A_{t+1}, \vec{w}_t). 12 | \] 13 | We call this method \emph{episodic semi-gradient one-step Sarsa}. For a constant policy, this method converges in the same with as TD(0), with a similar kind of error bound.\\ 14 | 15 | In order to form control methods, we must couple the prediction ideas developed in the previous chapter with methods for policy improvement. Policy improvement methods for continuous actions or actions from large discrete spaces are an active area of research, with no clear resolution. For actions drawn from smaller discrete sets, we can use the same idea as we have before, which is to compute action values and then take an $\varepsilon$-greedy action selection. 
Episodic semi-gradient Sarsa can be used to estimate the optimal action-values as in the box below.\\ 16 | 17 | \includegraphics[width=\textwidth]{\NotesImages/episodic_semi_gradient_sarsa.png}\\ 18 | 19 | \subsection{Semi-gradient $n$-step Sarsa} 20 | We can use an $n$-step version of the episodic Sarsa that we defined above by incorporating the bootstrapped $n$-step return 21 | \begin{equation} 22 | G_{t:t+n} \doteq \sum_{i=0}^{n-1} \gamma^i R_{t+i+1} + \gamma^n \hat{q}(S_{t+n}, A_{t+n}, \vec{w}_{t+n-1}) 23 | \end{equation} 24 | where $G_{t:t+n} = G_{t}$ if $t + n \geq T$, as usual. This update target is used in the pseudocode in the box below. As we have seen before, performance is generally best with an intermediate value of $n$. \\ 25 | 26 | \includegraphics[width=\textwidth]{\NotesImages/n_step_episodic_semi_gradient_sarsa.png}\\ 27 | 28 | 29 | \subsection{Average Reward: A New Problem Setting for Continuing Tasks} 30 | We introduce a third classical setting for formulating the goal in Markov decision problems (MDPs), to go along with the episodic and discounted settings. This new setting is called the \emph{average reward setting}. It applies to continuing problems with no start or end state, but also no discounting. (Later we will see that the lack of a start state introduces a symmetry that makes discounting with function approximation pointless.)\\ 31 | 32 | In the average reward setting, the ordering of policies is (most often) defined with respect to the \emph{average reward} while following the policy 33 | 34 | \begin{align} 35 | r(\pi) &\doteq \lim_{h\to\infty} \frac1h \sum_{t=1}^{h} \E{}[R_{t} \vert{} S_0, A_{0:t-1} \sim \pi]\\ 36 | &= \lim_{t\to\infty} \E{}[R_{t} \vert{} S_0, A_{0:t-1} \sim \pi] \\ 37 | &= \sum_s \mu_\pi(s) \sum_a \pi(a \vert{} s) \sum_{s', r} p(s', r \vert{} s, a) r. 38 | \end{align} 39 | 40 | We will consider policies that attain the maximal value of $r(\pi)$ to be optimal (though there are apparently some subtle distinctions here that are not gone into).\\ 41 | 42 | The distribution $\mu_\pi(s)$ is the steady-state distribution defined by 43 | \begin{equation} 44 | \mu_\pi(s) \doteq \lim_{t\to \infty} \P{} (S_t = s \vert{} A_{0:t-1} \sim \pi) 45 | \end{equation} 46 | which we assume to exist for any $\pi$ and to be independent of the starting state $S_0$. This assumption is known as \emph{ergodicity}, and it means that the long-run probability of being in a state depends only on the policy and the MDP transition probabilities -- not on the start state. The steady-state distribution has the property that it is invariant under actions taken by $\pi$, in the sense that the following holds 47 | \[ 48 | \sum_s \mu_\pi(s) \sum_a \pi(a \vert{} s) p(s' \vert{} s, a) = \mu_\pi(s'). 49 | \]\\ 50 | 51 | In the average-reward setting we define returns in terms of the difference between the reward and the expected reward for the policy 52 | \begin{equation} 53 | G_t \doteq \sum_{i \geq t} \left(R_{i+1} - r(\pi)\right). 54 | \end{equation} 55 | We call this quantity the \emph{differential return} and the corresponding value functions (defined in the same way, just with this return instead) \emph{differential value functions}.
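As a tiny concrete example of these definitions (not from the text): suppose there are two states and the policy induces transitions $s_1 \to s_2$ with certainty and $s_2 \to s_1$ or $s_2 \to s_2$ with probability $\tfrac12$ each, with reward $1$ on every transition out of $s_1$ and $0$ otherwise. The steady-state distribution is $\mu_\pi = (\tfrac13, \tfrac23)$, so
\begin{equation}
r(\pi) = \tfrac13 \cdot 1 + \tfrac23 \cdot 0 = \tfrac13,
\end{equation}
and the differential return measures rewards relative to this long-run rate of $\tfrac13$ per step.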
These new value functions also have Bellman equations: 56 | 57 | \begin{align} 58 | v_\pi(s) &= \sum_a \pi(a \vert{} s) \sum_{s', r} p(s', r \vert{} s, a) \left[ r - r(\pi) + v_\pi(s')\right] \\ 59 | q_\pi(s, a) &= \sum_{s', r} p(s', r \vert{} s, a) \left[ r - r(\pi) + \sum_{a'} \pi(a' \vert{} s') q_\pi(s', a')\right] \\ 60 | v_*(s) &= \max_a \sum_{s', r} p(s', r \vert{} s, a) \left[ r - \max_\pi r(\pi) + v_*(s')\right] \\ 61 | q_*(s, a) &= \sum_{s', r} p(s', r \vert{} s, a) \left[ r - \max_\pi r(\pi) + \max_{a'} q_*(s', a')\right]. 62 | \end{align} 63 | We also have differential forms of the TD errors, where $\bar{R}_t$ is the estimate of $r(\pi)$ at $t$, 64 | \begin{align} 65 | \delta_t &\doteq R_{t+1} - \bar{R}_{t+1} + \hat{v}(S_{t+1}, \vec{w}_t) - \hat{v}(S_t, \vec{w}_t)\\ 66 | \delta_t &\doteq R_{t+1} - \bar{R}_{t+1} + \hat{q}(S_{t+1}, A_{t+1}, \vec{w}_t) - \hat{q}(S_t, A_t, \vec{w}_t). 67 | \end{align}\\ 68 | 69 | Many of the previous algorithms and theoretical results carry over to this new setting without change. For instance, the update for semi-gradient Sarsa is defined in the same way, just with the new TD error; the corresponding pseudocode is given in the box below.\\ 70 | 71 | \includegraphics[width=\textwidth]{\NotesImages/differential_semi_gradient_sarsa.png}\\ 72 | 73 | \subsection{Deprecating the Discounted Setting} 74 | Suppose we want to optimise the discounted value function $v_\pi^\gamma(s)$ over the on-policy distribution. We would then choose an objective $J(\pi)$ with 75 | \begin{align} 76 | J(\pi) &\doteq \sum_s \mu_\pi(s) v_\pi^\gamma(s) \\ 77 | &= \sum_s \mu_\pi(s) \sum_a \pi(a \vert{} s) \sum_{s', r} p(s', r \vert{} s, a) \left[ r + \gamma v_\pi^\gamma(s')\right] \\ 78 | &= r(\pi) + \sum_s \mu_\pi(s) \sum_a \pi(a \vert{} s) \sum_{s', r} p(s', r \vert{} s, a) \gamma v_\pi^\gamma(s') \\ 79 | &= r(\pi) + \gamma \sum_{s'} v_\pi^\gamma(s') \sum_{s} \mu_\pi(s) \sum_a \pi(a \vert{} s)p(s' \vert{} s, a) \\ 80 | &= r(\pi) + \gamma \sum_{s'} v_\pi^\gamma(s') \mu_\pi(s') \\ 81 | &= r(\pi) + \gamma J(\pi) \\ 82 | &\vdotswithin{=} \\ 83 | &= \frac{1}{1 - \gamma} r(\pi) 84 | \end{align} 85 | so we may as well have optimised for the \emph{undiscounted} average reward.\\ 86 | 87 | The root cause (note: why \emph{root} cause?) of the difficulties with the discounted control setting is that when we introduce function approximation we lose the policy improvement theorem. This is because when we change the discounted value of one state, we are not guaranteed to have improved the policy in any useful sense (e.g. generalisation could ruin the policy elsewhere). This is an area of open research. 88 | 89 | \subsection{Differential Semi-gradient $n$-step Sarsa} 90 | We generalise $n$-step bootstrapping by introducing an $n$-step version of the TD error in this new setting. In order to do that, we first introduce the differential $n$-step return using function approximation 91 | \begin{equation} 92 | G_{t:t+n} \doteq \sum_{i=t}^{t+n-1} \left( R_{i+1} - \bar{R}_{i+1} \right) + \hat{q}(S_{t+n}, A_{t+n}, \vec{w}_{t+n-1}) 93 | \end{equation} 94 | with $G_{t:t+n} = G_t$ if $t+n \geq T$ as usual and where the $\bar{R}_i$ are the estimates of $r(\pi)$. The $n$-step TD error is then defined as before just with the new $n$-step return 95 | \[ 96 | \delta_t \doteq G_{t:t+n} - \hat{q}(S_t, A_t, \vec{w}_t). 97 | \] 98 | Pseudocode for the use of this return in the Sarsa framework is given in the box below.
Note that $\bar{R}$ is updated using the TD error rather than the latest reward (see Exercise 10.9).\\ 99 | 100 | 101 | \includegraphics[width=\textwidth]{\NotesImages/differential_semi_gradient_n_step_sarsa.png}\\ 102 | -------------------------------------------------------------------------------- /notes/chapters/chapter11/chapter11.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brynhayder/reinforcement_learning_an_introduction/d8b1945f61a8397b684f8d8d800ed0d9308a9a35/notes/chapters/chapter11/chapter11.pdf -------------------------------------------------------------------------------- /notes/chapters/chapter11/chapter11.tex: -------------------------------------------------------------------------------- 1 | \input{../../../header} 2 | 3 | \begin{document} 4 | \include{chapter11_content} 5 | \end{document} -------------------------------------------------------------------------------- /notes/chapters/chapter11/chapter11_content.tex: -------------------------------------------------------------------------------- 1 | \section{*Off-policy Methods with Approximation} 2 | 3 | Function approximation turns out to be more difficult in the case of off-policy learning than it is in the on-policy case. This is because both the update target and the state distribution are different between the target and behaviour policies. 4 | 5 | \subsection{Semi-gradient Methods} 6 | Semi-gradient methods alter the update target to correspond to the target policy, but do not address the issue of the update distribution. As such, they may diverge in some cases (but they are often successfully used in practice nonetheless).\\ 7 | 8 | The tabular off-policy algorithms can be applied in this case, where we simply exchange the estimated value arrays for their equivalents under function approximation and incorporate the importance sampling ratio. For instance, the one-step, state-value algorithm is semi-gradient off-policy TD(0). This has the update 9 | \begin{equation} 10 | \vec{w}_{t+1} = \vec{w}_t + \alpha \rho_t \delta_t \grad \hat{v}(S_t, \vec{w}_t), 11 | \end{equation} 12 | where $\rho_t \doteq \rho_{t:t}$ is the per-step importance sampling ratio. The TD error $\delta_t$ is defined as appropriate, with respect to the episodic and discounted reward or the continuing and undiscounted average reward according to the setting. \\ 13 | 14 | The corresponding one-step algorithm for action-values is semi-gradient Expected Sarsa. In the tabular case we did not use importance sampling for one-step, action-value methods, but with function approximation (and corresponding generalisation) it is not clear that all actions should be weighted equally.\\ 15 | 16 | We now give some of the multi-step generalisations. 17 | 18 | \subsubsection*{$n$-step Semi-Gradient Expected Sarsa} 19 | The update is 20 | \begin{equation} 21 | \vec{w}_{t+n} = \vec{w}_{t+n-1} + \alpha \rho_{t+1}\cdots\rho_{t+n-1} [G_{t:t+n} - \hat{q}(S_t, A_t, \vec{w}_{t+n-1})] \grad \hat{q}(S_t, A_t, \vec{w}_{t+n-1}) 22 | \end{equation} 23 | where $\rho_k = 1$ when $k \geq T$ and $G_{t:t+n} = G_t$ when $t+n\geq T$. The return targets are defined for the episodic case as 24 | \[ 25 | G_{t:t+n} \doteq \sum_{i=0}^{n-1} \gamma^iR_{t+i+1} + \gamma^n \hat{q}(S_{t+n}, A_{t+n}, \vec{w}_{t+n-1}) 26 | \] 27 | and in the continuing case as 28 | \[ 29 | G_{t:t+n} \doteq \sum_{i=0}^{n-1} \left(R_{t+i+1} - \bar{R}_{t+i}\right) + \hat{q}(S_{t+n}, A_{t+n}, \vec{w}_{t+n-1}).
30 | \] 31 | 32 | \subsubsection*{$n$-step Tree Backup Algorithm} 33 | The updates are 34 | \begin{align} 35 | \vec{w}_{t+n} &= \vec{w}_{t+n-1} + \alpha [G_{t:t+n} - \hat{q}(S_t, A_t, \vec{w}_{t+n-1})]\grad \hat{q}(S_t, A_t, \vec{w}_{t+n-1}) \\ 36 | G_{t:t+n} &\doteq \hat{q}(S_t, A_t, \vec{w}_{t-1}) + \sum_{k=t}^{t+n-1} \delta_k \prod_{i=t+1}^k \gamma \pi(A_i \vert{} S_i), 37 | \end{align} 38 | where $\delta_t$ is defined in the episodic case as 39 | \[ 40 | \delta_t \doteq R_{t+1} + \gamma \sum_a \pi(a \vert{} S_{t+1}) \hat{q}(S_{t+1}, a, \vec{w}_t) - \hat{q}(S_t, A_t, \vec{w}_t) 41 | \] 42 | and in the continuing case as 43 | \[ 44 | \delta_t \doteq R_{t+1} - \bar{R}_t + \sum_a \pi(a \vert{} S_{t+1}) \hat{q}(S_{t+1}, a, \vec{w}_t) - \hat{q}(S_t, A_t, \vec{w}_t) 45 | \] 46 | 47 | 48 | \setcounter{subsection}{2} 49 | \subsection{The Deadly Triad} 50 | Instability and divergence arise whenever we have all three of \emph{function approximation}, \emph{bootstrapping}, \emph{off-policy training}. Each of these has their benefits, so it is unclear which (if any!) is to be given up. Note that the difficulties are not due to uncertainty in the environment (they arise with DP), nor are they due specifically to control or generalised policy iteration. 51 | 52 | \begin{description} 53 | \item[Function Approximation] allows for generalisation, dimensionality reduction and reduction in complexity. Potentially also 54 | \item[Bootstrapping] Data and computationally efficient. Memory efficient. 55 | \item[Off-policy Learning] Essential to some use cases (not yet mentioned in this book). Seems important to be able to learn from hypothetical actions. 56 | \end{description} 57 | 58 | \subsection{Linear Value-function Geometry} 59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /notes/chapters/chapter12/chapter12_content.tex: -------------------------------------------------------------------------------- 1 | \section{Eligibility Traces} -------------------------------------------------------------------------------- /notes/chapters/chapter13/chapter13.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brynhayder/reinforcement_learning_an_introduction/d8b1945f61a8397b684f8d8d800ed0d9308a9a35/notes/chapters/chapter13/chapter13.pdf -------------------------------------------------------------------------------- /notes/chapters/chapter13/chapter13.tex: -------------------------------------------------------------------------------- 1 | \input{../../../header} 2 | 3 | \begin{document} 4 | \include{chapter13_content} 5 | \end{document} 6 | -------------------------------------------------------------------------------- /notes/chapters/chapter14/chapter14_content.tex: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brynhayder/reinforcement_learning_an_introduction/d8b1945f61a8397b684f8d8d800ed0d9308a9a35/notes/chapters/chapter14/chapter14_content.tex -------------------------------------------------------------------------------- /notes/chapters/chapter15/chapter15_content.tex: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brynhayder/reinforcement_learning_an_introduction/d8b1945f61a8397b684f8d8d800ed0d9308a9a35/notes/chapters/chapter15/chapter15_content.tex -------------------------------------------------------------------------------- 
/notes/chapters/chapter16/chapter16_content.tex: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brynhayder/reinforcement_learning_an_introduction/d8b1945f61a8397b684f8d8d800ed0d9308a9a35/notes/chapters/chapter16/chapter16_content.tex -------------------------------------------------------------------------------- /notes/chapters/chapter17/chapter17_content.tex: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brynhayder/reinforcement_learning_an_introduction/d8b1945f61a8397b684f8d8d800ed0d9308a9a35/notes/chapters/chapter17/chapter17_content.tex -------------------------------------------------------------------------------- /notes/chapters/chapter2/chapter2_content.tex: -------------------------------------------------------------------------------- 1 | \section{Multi-armed Bandits} 2 | Reinforcement learning involves evaluative feedback rather than instructive feedback. We get told whether our actions are good ones or not, rather than what the single best action to take is. This is a key distinction between reinforcement learning and supervised learning. 3 | 4 | \subsection{A $k$-armed Bandit Problem} 5 | In the $k$-armed bandit problem there are $k$ possible actions, each of which yields a numerical reward drawn from a stationary probability distribution for that action. We want to maximise the expected total reward, taking an action at each \emph{time step}. Some notation: 6 | 7 | \begin{itemize} 8 | \item Index timesteps by $t$ 9 | \item Action $A_t$ 10 | \item Corresponding reward $R_t$ 11 | \item \emph{Value} of action $a$ is $q_*(a) = \mathbb{E}[R_t | A_t = a]$ 12 | \item Estimate of value of action $a$ at $t$ is denoted $Q_t(a)$ 13 | \end{itemize} 14 | 15 | We therefore want to choose $\{a_1, \dots, a_T\}$ to maximise $\sum_{t = 1}^T q_*(a_t)$.\\ 16 | \mbox{}\\ 17 | At each timestep, the actions with the highest estimated reward are called the \emph{greedy} actions. If we take this action, we say that we are \emph{exploiting} our understanding of the values of actions. The other actions are known as \emph{non-greedy} actions, sometimes we might want to take one of these to improve our estimate of their value. This is called \emph{exploration}. The balance between exploration and exploitation is a key concept in reinforcement learning. 18 | 19 | 20 | \subsection{Action-value Methods} 21 | We may like to form estimates of the values of possible actions and then choose actions according to these estimates. Methods such as this are known as \emph{action-value methods}. There are, of course, many ways of generating the estimates $Q_t(a)$. \\ 22 | \mbox{}\\ 23 | An $\varepsilon$-greedy method is one in which with probability $\varepsilon$ we take a random draw from all of the actions (choosing each action with equal probability), providing some exploration. 24 | 25 | 26 | \setcounter{subsection}{4} 27 | \subsection{Tracking a Non-stationary Problem} 28 | If we decide to implement the sample average method, then at each iteration that we choose the given action we update our estimate by 29 | \begin{equation} 30 | Q_{n+1} = Q_n + \frac1n [R_n - Q_n] 31 | \end{equation} 32 | Note that this has the (soon to be familiar) form 33 | \begin{equation} 34 | \mathrm{NewEstimate} \gets \mathrm{OldEstimate} + \mathrm{StepSize}\times[\mathrm{Target} - \mathrm{OldEstimate}]. 
35 | \end{equation} 36 | \mbox{}\\ 37 | If the problem were non-stationary, we might like to use a time-weighted exponential average for our estimates (an \emph{exponential recency-weighted average}). This corresponds to a constant step-size $\alpha \in (0, 1]$ (you can check). 38 | \begin{equation} 39 | Q_{n+1} = Q_n + \alpha [R_n - Q_n]. 40 | \end{equation} 41 | \mbox{}\\ 42 | We might like to vary the step-size parameter. Write $\alpha_n(a)$ for the step-size after the $n^{\mathsf{th}}$ reward from action $a$. Of course, not all choices of $\alpha_n(a)$ will give convergent estimates of the values of $a$. To converge with probability 1 we must have 43 | \begin{equation} 44 | \sum_n \alpha_n(a) = \infty \quad\quad \mathsf{and} \quad\quad \sum_n \alpha_n(a)^2 < \infty. 45 | \end{equation} 46 | This means that the steps must be large enough to eventually overcome initial conditions and random fluctuations, but must also become small enough to ensure convergence in the long run. Although these conditions are used in theoretical work, they are seldom used in empirical work or applications. (Most reinforcement learning problems have non-stationary rewards, in which case convergence is undesirable.) 47 | 48 | \subsection{Optimistic Initial Values} 49 | The exponential recency-weighted method is biased by the initial value one gives. If we like, we may set initial value estimates artificially high to encourage exploration in the short run -- this is called \emph{optimistic initial values}. This is a useful trick for stationary problems, but does not apply so well to non-stationary problems as the added exploration is only temporary. 50 | 51 | 52 | \subsection{Upper-Confidence Bound Action Selection} 53 | We might like to discriminate between potential explorative actions. Note that $\varepsilon$-greedy does not do this. We define the \emph{upper-confidence bound} action at $t$ as follows 54 | \begin{equation} 55 | A_t \doteq \argmax_{a}\left[ \, Q_t(a)+ c \sqrt{\frac{\mathrm{ln}(t)}{N_t(a)}} \, \right] 56 | \end{equation} 57 | where $Q_t(a)$ is the value estimate for the action $a$ at time $t$, $c > 0$ is a parameter that controls the degree of exploration and $N_t(a)$ is the number of times that $a$ has been selected prior to time $t$. If $N_t(a) = 0$ then we consider $a$ a maximal action.\\ 58 | 59 | This approach favours actions with higher estimated rewards but also favours actions with uncertain estimates (more precisely, actions that have been chosen few times). 60 | 61 | 62 | \subsection{Gradient Bandit Algorithms} 63 | Suppose that we choose actions probabilistically based on a preference for each action, $H_t(a)$. Let the action at $t$ be denoted by $A_t$. We then define the probability of choosing action $a$ via the softmax 64 | \begin{equation} 65 | \pi_t(a) \doteq \P{}(A_t = a) = \frac{e^{H_t(a)}}{\sum_i e^{H_t(i)}}. 66 | \end{equation} 67 | We then iteratively perform updates according to 68 | \begin{equation} 69 | H_{t+1}(a) = H_t(a) + \alpha (R_t - \bar{R}_t)(\mathds{1}_{A_t = a} - \pi_t(a)), 70 | \end{equation} 71 | where $\alpha > 0$ is a step size and $\bar{R}_t$ is the mean of previous rewards. The box in the notes shows that this is an instance of stochastic gradient ascent since the expected value of the update is equal to the update when doing gradient ascent on the (total) expected reward.
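A minimal sketch of this scheme (an illustration only; the repo's bandit implementations live under \texttt{code/bandits}):

\begin{verbatim}
import numpy as np

def gradient_bandit(q_star, n_steps=1000, alpha=0.1, seed=0):
    """Softmax action selection over preferences H with baseline R-bar."""
    rng = np.random.default_rng(seed)
    k = len(q_star)
    H = np.zeros(k)                       # action preferences
    r_bar = 0.0                           # running mean of rewards (baseline)
    for t in range(1, n_steps + 1):
        pi = np.exp(H - H.max())
        pi /= pi.sum()                    # softmax probabilities
        a = rng.choice(k, p=pi)
        r = rng.normal(q_star[a], 1.0)
        r_bar += (r - r_bar) / t
        one_hot = np.zeros(k)
        one_hot[a] = 1.0
        H += alpha * (r - r_bar) * (one_hot - pi)   # preference update
    return H

print(gradient_bandit(np.array([0.1, 0.8, 0.3])))
\end{verbatim}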
-------------------------------------------------------------------------------- /notes/chapters/chapter3/chapter3_content.tex: -------------------------------------------------------------------------------- 1 | \section{Finite Markov Decision Processes} 2 | We say that a system has the \emph{Markov property} if each state includes all information about the previous states and actions that makes a difference to the future.\\ 3 | 4 | The MDP provides an abstraction of the problem of goal-directed learning from interaction by modelling the whole thing as three signals: action, state, reward.\\ 5 | 6 | Together, the MDP and agent give rise to the \emph{trajectory} $S_0$, $A_0$, $R_1$, $S_1$, $A_1$, $R_2$, $S_2$, $A_2$, $\dots$. The action choice in a state gives rise (stochastically) to a next state and corresponding reward. 7 | 8 | \subsection{The Agent–Environment Interface} 9 | We consider finite Markov Decision Processes (MDPs). The word finite refers to the fact that the states, rewards and actions form finite sets. This framework is useful for many reinforcement learning problems.\\ 10 | 11 | We call the learner or decision making component of a system the \emph{agent}. Everything else is the \emph{environment}. The general rule is that anything that the agent does not have absolute control over forms part of the environment. For a robot the environment would include its physical machinery. The boundary is the limit of absolute control of the agent, not of its knowledge.\\ 12 | 13 | The MDP formulation is as follows. Index time-steps by $t \in \mathbb{N}$. Then the actions, rewards and states at $t$ are represented by $A_t \in \mathcal{A}(s)$, $R_t \in \mathcal{R} \subset \mathbb{R}$, $S_t \in \mathcal{S}$. Note that the set of available actions is dependent on the current state.\\ 14 | 15 | A key quantity in an MDP is the following function, which defines the \emph{dynamics} of the system. 16 | \begin{equation} 17 | p(s', r | s, a) \doteq \P{} (S_t = s', R_t = r | S_{t-1} = s, A_{t-1} = a) 18 | \end{equation} 19 | From this quantity we can get other useful functions. In particular we have the following: 20 | 21 | \begin{description} 22 | \item[state-transition probabilities] 23 | \begin{equation} 24 | p(s' | s, a) \doteq \P{}(S_t = s'| S_{t-1} = s, A_{t-1}=a) = \sum_{r \in \mathcal{R}} p(s', r | s, a) 25 | \end{equation} 26 | note the abuse of notation using $p$ again; and, 27 | \item[expected reward] 28 | \begin{equation} 29 | r(s, a) = \mathbb{E}[R_t | S_{t-1} = s, A_{t-1} = a] = \sum_{r \in \mathcal{R}} r \sum_{s' \in \mathcal{S}} p(s', r | s, a). 30 | \end{equation} 31 | \end{description} 32 | 33 | 34 | \subsection{Goals and rewards} 35 | We have the \emph{reward hypothesis}, which is a central assumption in reinforcement learning: 36 | \begin{quote} 37 | All of what we mean by goals and purposes can be well thought of as the maximisation of the expected value of the cumulative sum of a received scalar signal (called reward). 38 | \end{quote} 39 | 40 | 41 | \subsection{Returns and Episodes} 42 | Denote the sequence of rewards from time $t$ as $R_{t+1}$, $R_{t+2}$, $R_{t+3}$, $\dots$. We seek to maximise the \emph{expected return} $G_t$, which is some function of the rewards. The simplest case is where $G_t = \sum_{\tau > t} R_\tau$.\\ 43 | 44 | In some applications there is a natural final time-step, which we denote $T$. The final time-step corresponds to a \emph{terminal state} that breaks the agent-environment interaction into subsequences called \emph{episodes}.
Each episode ends in the same terminal state, possibly with a different reward. Each starts independently of the last, with some distribution of starting states. We denote the set of states including the terminal state by $\mathcal{S}^+$\\ 45 | 46 | Sequences of interaction without a terminal state are called \emph{continuing tasks}. \\ 47 | 48 | We define $G_t$ using the notion of \emph{discounting}, incorporating the \emph{discount rate} $0 \leq \gamma \leq 1$. In this approach the agent chooses $A_t$ to maximise 49 | \begin{equation} 50 | G_t \doteq \sum_{k = 0}^{\infty} \gamma^k R_{t+k+1}. 51 | \end{equation} 52 | This sum converges whenever the reward sequence is bounded and $\gamma < 1$. If $\gamma = 0$ the agent is said to be myopic. We define $G_T = 0$. Note that 53 | \begin{equation} 54 | G_t = R_{t+1} + \gamma G_{t+1}. 55 | \end{equation}\\ 56 | 57 | Note that in the episodic case (or with a finite number of time-steps) the return from time $t$ is just the (discounted) sum of the rewards up to the end of that episode. 58 | 59 | 60 | \subsection{Unified Notation for Episodic and Continuing Tasks} 61 | We want to unify the notation for episodic and continuing learning. \\ 62 | 63 | We introduce the concept of an \emph{absorbing state}. This state transitions only to itself and gives a reward of zero.\\ 64 | 65 | To incorporate the possibilities that $T=\infty$ or $\gamma = 1$ (but not both) in our formulation of the return, we can write 66 | \begin{equation} 67 | G_t \doteq \sum_{k=t+1}^T \gamma^{k-t-1}R_k. 68 | \end{equation} 69 | 70 | 71 | \subsection{Policies \& Value Functions} 72 | \subsubsection*{Policy} 73 | A \emph{policy} $\pi(a|s)$ is a mapping from states to the probability of selecting each action in that state. If an agent is following policy $\pi$ and at time $t$ is in state $S_t$, then the probability that it takes action $A_t = a$ is $\pi(a|S_t)$. Reinforcement learning methods specify how the agent alters its policy as a result of experience.\\ 74 | 75 | \subsubsection*{Value Functions} 76 | As we have seen, a central notion is the value of a state. The \emph{state-value function} of state $s$ under policy $\pi$ is the expected return starting in $s$ and following $\pi$ thereafter. For MDPs this is 77 | \begin{equation} 78 | v_\pi(s) \doteq \Epi[G_t | S_t = s], 79 | \end{equation} 80 | where the subscript $\pi$ denotes that this is an expectation taken conditional on the agent following policy $\pi$. \\ 81 | 82 | Similarly, we define the \emph{action-value function} for policy $\pi$ to be the expected return from taking action $a$ in state $s$ and following $\pi$ thereafter 83 | \begin{equation} 84 | q_\pi(s, a) \doteq \Epi[G_t | S_t = s, A_t = a]. 85 | \end{equation} 86 | 87 | The value functions $v_\pi$ and $q_\pi$ can be estimated from experience.\\ 88 | 89 | \subsubsection*{Bellman Equation} 90 | 91 | The Bellman equations express the value of a state in terms of the values of its successor states. They are a consistency condition on the values of states. For all $s \in \mathcal{S}$, 92 | 93 | \begin{align} 94 | v_{\pi}(s) &= \Epi{}[G_t | S_t = s] \\ 95 | &= \Epi{}[R_{t+1} + \gamma G_{t+1} | S_t = s] \\ 96 | &= \sum_{a \in \mathcal{A}(s)} \pi(a|s) \sum_{s', r} p(s', r | s, a) \left[r + \gamma \Epi{}[G_{t+1} | S_{t+1} = s']\right] \\ 97 | &= \sum_{a \in \mathcal{A}(s)} \pi(a|s) \sum_{s', r} p(s', r | s, a) [r + \gamma v_{\pi}(s')] 98 | \end{align} 99 | 100 | 101 | The value function $v_\pi$ is the unique solution to its Bellman equation.
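\mbox{}\\
Because the Bellman equation is linear in the values, for a finite MDP it can be written in matrix form and solved directly, which makes the uniqueness claim above concrete. The sketch below is an illustration only (the two-state MDP is invented for the example): it solves $v_\pi = r_\pi + \gamma P_\pi v_\pi$, i.e.\ $(I - \gamma P_\pi)v_\pi = r_\pi$, with NumPy.
\begin{verbatim}
import numpy as np

# Invented two-state example: P_pi[s, s2] is the state-transition matrix
# under the policy, r_pi[s] the expected one-step reward under the policy.
P_pi = np.array([[0.9, 0.1],
                 [0.2, 0.8]])
r_pi = np.array([1.0, -1.0])
gamma = 0.9

# v_pi = r_pi + gamma * P_pi v_pi  <=>  (I - gamma * P_pi) v_pi = r_pi
v_pi = np.linalg.solve(np.eye(2) - gamma * P_pi, r_pi)
print(v_pi)   # the unique solution of the Bellman equation for this MDP
\end{verbatim}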
102 | 103 | 104 | \subsection{Optimal Policies \& Optimal Value Functions} 105 | We say that $\pi \geq \pi'$ iff $v_\pi (s) \geq v_{\pi'}(s) \quad \forall s \in \mathcal{S}$. A policy that is at least as good as every other policy in this sense is called an optimal policy. There may be multiple optimal policies; we denote all of them by $\pi_*$.\\ 106 | 107 | The optimal policies share the same optimal state-value function $v_*(s)$ 108 | \begin{equation} 109 | v_*(s) \doteq \max_\pi v_\pi(s) \quad \forall s \in \mathcal{S}. 110 | \end{equation} 111 | They also share the same optimal action-value function $q_*(s, a)$ 112 | \begin{equation} 113 | q_*(s, a) = \max_\pi q_\pi (s, a) \quad \forall s \in \mathcal{S}, a \in \mathcal{A}(s), 114 | \end{equation} 115 | which is the expected return from taking action $a$ in state $s$ and thereafter following an optimal policy. In particular, we can write $q_*$ in terms of $v_*$: 116 | \begin{equation} 117 | q_*(s, a) = \E{} [R_{t+1} + \gamma v_*(S_{t+1}) | S_{t} = s, A_t = a]. 118 | \end{equation}\\ 119 | 120 | Since $v_*$ is a value function, it must satisfy the Bellman consistency condition. Moreover, because $v_*$ corresponds to a policy that always selects a maximal action, its Bellman equation can be written without reference to any particular policy. Hence 121 | \begin{equation} 122 | v_*(s) = \max_a \sum_{s', r} p(s', r|s, a) [r + \gamma v_*(s')]. 123 | \end{equation} 124 | Similarly, 125 | \begin{align} 126 | q_*(s, a) &= \mathbb{E} [R_{t+1} + \gamma \max_{a'}q_*(S_{t+1}, a') | S_t=s, A_t = a]\\ 127 | &= \sum_{s', r} p(s', r| s, a ) [r + \gamma \max_{a'}q_*(s', a')]. 128 | \end{align} \\ 129 | 130 | Note that once one identifies an optimal value function $v_*$, it is simple to find an optimal policy: all that is needed is for the policy to act greedily with respect to $v_*$. Since $v_*$ already encodes all information about future rewards, acting greedily still makes the long-term optimal decision (according to our definition of returns).\\ 131 | 132 | Having $q_*$ is even better, since then we do not need to look one step ahead at $v_*(s')$ for the succeeding states $s'$; we simply take $a_* = \argmax_a q_*(s, a)$ when in state $s$. 133 | -------------------------------------------------------------------------------- /notes/chapters/chapter4/chapter4_content.tex: -------------------------------------------------------------------------------- 1 | 2 | \section{Dynamic Programming} 3 | 4 | The term Dynamic Programming (DP) refers to a collection of algorithms that can be used to compute optimal policies given a perfect model of the environment as a Markov Decision Process (MDP). DP methods tend to be computationally expensive, and we often don't have a perfect model of the environment, so they are rarely used directly in practice. However, they provide a useful theoretical basis for the rest of reinforcement learning. \\ 5 | 6 | Unless stated otherwise, we will assume that the environment is a finite MDP. If the state or action space is continuous, then we will generally discretise it and apply finite MDP methods to the approximated problem.\\ 7 | 8 | The key idea of DP, and of reinforcement learning generally, is the use of value functions to organize and structure the search for good policies. We use DP and the Bellman equations to find optimal value functions. 9 | 10 | \subsection{Policy Evaluation (Prediction)} 11 | We can use the Bellman equation for the state-value function $v_\pi$ to construct an iterative updating procedure. 12 | 13 | \subsubsection*{Iterative Policy Evaluation} 14 | Consider a sequence of approximate value functions $v_0, v_1, v_2, \dots$, each mapping $\mathcal{S}^{+}$ to $\mathbb{R}$.
The initial approximation, $v_0$, is chosen arbitrarily (except that the terminal state, if any, must be given value $0$), and each successive approximation is obtained by using the Bellman equation for $v_\pi$ as an update rule: 15 | 16 | \begin{align} 17 | v_{k+1}(s) &\doteq \Epi [R_{t+1} + \gamma v_{k}(S_{t+1}) | S_t = s] \\ 18 | &= \sum_a \pi(a|s) \sum_{s', r} p(s', r| s, a) \left[r + \gamma v_k(s')\right] 19 | \end{align} 20 | 21 | Clearly, $v_k = v_\pi$ is a fixed point. The sequence $\{v_k\}$ can be shown in general to converge to $v_\pi$ as $k \to \infty$ under the same conditions that guarantee the existence of $v_\pi$. This algorithm is called \emph{iterative policy evaluation}. This update rule is an instance of an \emph{expected update} because it performs the updates by taking an expectation over all possible next states rather than by taking a sample next state.\\ 22 | 23 | \subsection{Policy Improvement} 24 | \subsubsection*{Policy Improvement Theorem} 25 | 26 | Let $\pi$, $\pi'$ be any pair of deterministic policies such that 27 | \begin{equation} 28 | q_\pi(s, \pi'(s)) \geq v_\pi(s) \quad \forall s \in \mathcal{S}. 29 | \end{equation} 30 | That is, selecting $\pi'(s)$ in $s$ and thereafter following $\pi$ is at least as good as following $\pi$ throughout. Then we have (shown below) 31 | \begin{equation} 32 | v_{\pi'}(s) \geq v_\pi(s) \quad \forall s \in \mathcal{S} 33 | \end{equation} 34 | so $\pi'$ gives at least as good an (expected) return as $\pi$.\\ 35 | 36 | The argument below also shows that if $q_\pi(s, \pi'(s)) > v_\pi(s)$ at any $s$, then there is at least one $s$ for which $v_{\pi'}(s) > v_\pi(s)$. 37 | \subsubsection*{proof:} 38 | \begin{align*} 39 | v_\pi(s) & \leq q_\pi(s, \pi'(s)) \\ 40 | & = \E{}[R_{t+1} + \gamma v_\pi(S_{t+1}) | S_t=s, A_t=\pi'(s)] \\ 41 | & = \E{}_{\pi'} [R_{t+1} + \gamma v_\pi(S_{t+1}) | S_t=s] \\ & \leq \E{}_{\pi'} [R_{t+1} + \gamma q_\pi(S_{t+1}, \pi'(S_{t+1})) | S_t=s] \\ & = \E{}_{\pi'} [R_{t+1} + \gamma R_{t+2} + \gamma^2 v_\pi(S_{t+2}) | S_t=s] \\ & \;\;\vdots \\ 42 | & \leq \E{}_{\pi'} [R_{t+1} + \gamma R_{t+2} + \gamma^2 R_{t+3} + \dots| S_t=s] \\ 43 | & = v_{\pi'}(s) 44 | \end{align*} 45 | 46 | \subsubsection*{Policy Improvement Algorithm} 47 | Now consider a policy that is greedy with respect to $q_\pi(s, a)$. Define 48 | \begin{align} 49 | \pi'(s) &= \argmax_a q_\pi(s, a) \\ 50 | &= \argmax_a \E{} [R_{t+1} + \gamma v_\pi(S_{t+1}) | S_t=s, A_t=a] \\ 51 | &= \argmax_a \sum_{s', r} p(s', r|s, a)[ r + \gamma v_\pi(s')]. 52 | \end{align} 53 | Now we can use $v_\pi$ to get $\pi' \geq \pi$, then use $v_{\pi'}$ to get \emph{another} improved policy. (In the above, ties are broken arbitrarily when the policy is deterministic. In the stochastic case, any policy that assigns zero probability to non-maximising actions is acceptable.)\\ 54 | 55 | Note that by construction 56 | \[ 57 | q_\pi(s, \pi'(s)) \geq v_\pi(s) 58 | \] 59 | therefore 60 | \[ 61 | v_{\pi'} \geq v_\pi 62 | \] 63 | so this process yields a monotonically improving sequence of policies.\\ 64 | 65 | Note also that if $\pi'$ is only as good as $\pi$, then $v_{\pi'} = v_\pi$ and $\forall s \in \mathcal{S}$ 66 | \begin{align*} 67 | v_{\pi'}(s) &= \max_a \E{}[R_{t+1} + \gamma v_{\pi'}(S_{t+1})| S_t=s, A_t=a]\\ 68 | &= \max_a \sum_{s', r} p(s', r|s, a)(r + \gamma v_{\pi'}(s')) 69 | \end{align*} 70 | which is the Bellman optimality condition for $v_*$, so $v_{\pi'} = v_*$ and both $\pi$ and $\pi'$ are optimal. This means that policy improvement gives a strictly better policy unless the policy is already optimal. \\ 71 | 72 | The policy improvement theorem holds for stochastic policies too, but we don't go into that here. 73 | 74 | 75 | \subsection{Policy Iteration} 76 | We can apply policy improvement iteratively to obtain the policy iteration algorithm.
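\mbox{}\\
Before the boxed pseudocode from the book (included below), here is a rough NumPy sketch of the policy iteration loop. It assumes the model is given as transition probabilities \texttt{P[s, a, s2]} and expected rewards \texttt{r[s, a]}; these arrays and all parameter values are assumptions for the example, not part of the accompanying code.
\begin{verbatim}
import numpy as np

def policy_iteration(P, r, gamma=0.9, theta=1e-8):
    # P[s, a, s2] = p(s2 | s, a), r[s, a] = expected one-step reward
    # (a simplified model built from the two marginals of p(s', r | s, a)).
    n_states, n_actions, _ = P.shape
    pi = np.zeros(n_states, dtype=int)     # arbitrary deterministic policy
    while True:
        # policy evaluation: sweep until the largest value change is below theta
        V = np.zeros(n_states)
        while True:
            V_new = (r + gamma * P @ V)[np.arange(n_states), pi]
            delta = np.max(np.abs(V_new - V))
            V = V_new
            if delta < theta:
                break
        # policy improvement: act greedily with respect to the evaluated V
        q = r + gamma * P @ V
        pi_new = q.argmax(axis=1)
        if np.array_equal(pi_new, pi):
            return pi, V                   # policy stable, hence optimal
        pi = pi_new
\end{verbatim}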
77 | 78 | \includegraphics[width=\textwidth]{\ProjectDir/data/notes_images/policy_iteration_algorithm.png} 79 | \mbox{}\\ 80 | A finite MDP has only a finite number of deterministic policies, so this process is guaranteed to converge in a finite number of iterations. 81 | 82 | \subsection{Value Iteration} 83 | Policy iteration can be slow because each iteration involves running the entire policy evaluation until convergence. \\ 84 | 85 | It turns out that one can truncate the policy evaluation step of policy iteration in many ways without losing convergence guarantees. One special case of this is \emph{value iteration}, where we truncate policy evaluation after only one update of each state. This algorithm converges to $v_*$ under the same conditions that guarantee the existence of $v_*$. 86 | 87 | \includegraphics[width=\textwidth]{\ProjectDir/data/notes_images/value_iteration_algorithm.png} 88 | \mbox{}\\ 89 | 90 | Note the $\max_a$ in the assignment of $V(s)$: we perform only one evaluation sweep of the state space per iteration, effectively folding policy improvement into the update itself, and the greedy policy is only read off at the end.\\ 91 | 92 | It may be more efficient to interpose multiple policy evaluation sweeps in between policy improvement iterations; all of these algorithms converge to an optimal policy for discounted finite MDPs. 93 | 94 | \subsection{Asynchronous Dynamic Programming} 95 | The DP methods that we have described so far all involve a full sweep of the state space on each iteration. This is potentially a very costly procedure. \\ 96 | 97 | \emph{Asynchronous} DP algorithms update the values in-place and cover states in any order whatsoever. The values of some states may be updated several times before the values of others are updated once. To converge correctly, however, an asynchronous algorithm must continue to update the values of all the states: it can’t ignore any state after some point in the computation.\\ 98 | 99 | Asynchronous DP gives a great increase in flexibility: we can choose which updates to make (even stochastically), for example based on the interaction of the agent with the environment. This might not reduce total computation if the algorithm is run to convergence, but it can allow a better rate of progress for the agent. 100 | 101 | \subsection{Generalised Policy Iteration} 102 | We use the term \emph{generalised policy iteration} (GPI) to refer to the general idea of letting policy evaluation and policy improvement processes interact, independent of the granularity and other details of the two processes. Almost all reinforcement learning methods are well described as GPI, including the policy iteration algorithms we have discussed in this section. GPI works via the competing but complementary nature of the two processes. In some cases it can be guaranteed to converge. 104 | 105 | \subsection{Efficiency of Dynamic Programming} 106 | If we ignore a few technical details, then the (worst case) time DP methods take to find an optimal policy is polynomial in the number of states and actions. Compare this with direct search in policy space, which would have to examine a number of (deterministic) policies that is exponential in the number of states.
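\mbox{}\\
As a companion to the discussion of value iteration above, here is a minimal NumPy sketch using the same assumed model representation as the policy-iteration sketch earlier (\texttt{P[s, a, s2]} and \texttt{r[s, a]}); again, this is an illustration rather than the accompanying code.
\begin{verbatim}
import numpy as np

def value_iteration(P, r, gamma=0.9, theta=1e-8):
    # Assumed model: P[s, a, s2] = p(s2 | s, a), r[s, a] = expected reward.
    V = np.zeros(P.shape[0])
    while True:
        q = r + gamma * P @ V        # expected update for every (s, a)
        V_new = q.max(axis=1)        # note the max_a: improvement folded in
        if np.max(np.abs(V_new - V)) < theta:
            break
        V = V_new
    greedy = (r + gamma * P @ V_new).argmax(axis=1)   # policy read off at the end
    return greedy, V_new
\end{verbatim}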
-------------------------------------------------------------------------------- /notes/chapters/chapter6/chapter6.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brynhayder/reinforcement_learning_an_introduction/d8b1945f61a8397b684f8d8d800ed0d9308a9a35/notes/chapters/chapter6/chapter6.pdf -------------------------------------------------------------------------------- /notes/chapters/chapter6/chapter6.tex: -------------------------------------------------------------------------------- 1 | \input{../../../header} 2 | 3 | \begin{document} 4 | \include{chapter6_content} 5 | \end{document} 6 | -------------------------------------------------------------------------------- /notes/chapters/chapter6/chapter6_content.tex: -------------------------------------------------------------------------------- 1 | \section{Temporal-Difference Learning} 2 | We first focus on the prediction problem, that is, finding $v_\pi$ for a given policy $\pi$. The control problem, finding $\pi_*$, is approached using the GPI framework. 3 | 4 | \subsection{TD Prediction} 5 | 6 | \subsubsection*{Connection between TD, MC \& DP} 7 | Monte-Carlo methods wait until the end of an episode to update the values. A simple MC update suitable for non-stationary environments is 8 | \begin{equation} 9 | V(S_t) \leftarrow V(S_t) + \alpha [G_t - V(S_t)] 10 | \end{equation} 11 | we will call this \emph{constant-$\alpha$ MC}. Temporal-difference learning (TD) instead updates the values at every time-step. The following is the TD(0) (or one-step TD) update, which is made at time $t+1$ (we will see TD($\lambda$) in Chapter 12) 12 | \begin{equation} 13 | V(S_t) \leftarrow V(S_t) + \alpha [R_{t+1} + \gamma V(S_{t+1}) - V(S_t)]. 14 | \end{equation} 15 | The key difference is that MC uses $G_t$ as the target whereas TD(0) uses $R_{t+1} + \gamma V(S_{t+1})$. TD uses an existing estimate in forming its target, hence it is known as a \emph{bootstrapping} method. Below is TD(0) in procedural form. \\ 16 | 17 | \includegraphics[width=\textwidth]{\NotesImages/TD0_prediction.png} \\ 18 | 19 | The core of the similarity between MC, DP and TD comes down to the following relationship 20 | \begin{align} 21 | v_\pi(s) &\doteq \Epi{}[G_t|S_t = s]\\ 22 | &= \Epi{}[R_{t+1} + \gamma G_{t+1}|S_t = s]\\ 23 | &= \Epi{}[R_{t+1} + \gamma v_\pi(S_{t+1})|S_t = s] 24 | \end{align} 25 | 26 | \begin{itemize} 27 | \item MC uses an estimate of the first line as its target, since it uses sample returns to approximate the expectation 28 | \item DP uses an estimate of the final line as its target, because $v_\pi(S_{t+1})$ is not known and the current estimate $V(S_{t+1})$ is used in its place 29 | \item TD does both: it samples the rewards like MC and also uses the current value estimate $V$ in place of $v_\pi$ in the target 30 | \end{itemize} 31 | 32 | \subsubsection*{TD Error} 33 | We can think of the TD(0) update as correcting an error: the difference between the estimated value of $S_t$ and the better estimate $R_{t+1} + \gamma V(S_{t+1})$. We define the \emph{TD error} 34 | \begin{equation} 35 | \delta_t \doteq R_{t+1} + \gamma V(S_{t+1}) - V(S_t), 36 | \end{equation} 37 | now if the array $V$ does not change within the episode we can show (by simple recursion) that the MC error can be written 38 | \begin{equation} 39 | G_t - V(S_t) = \sum_{k=t}^{T-1} \gamma^{k-t}\delta_k. 40 | \end{equation} 41 | 42 | \subsection{Advantages of TD Prediction Methods} 43 | \begin{itemize} 44 | \item TD methods do not require a model of the environment 45 | \item TD methods are implemented online, which can speed convergence relative to
MC methods, which must wait until the end of (potentially very long) episodes before learning. TD methods can be applied to continuing tasks for the same reason 46 | \item TD methods learn from every transition, whereas some MC methods require that the tails of the episodes be greedy (episodes containing exploratory actions must be ignored or discounted) 47 | \item For any fixed policy $\pi$, TD(0) has been proved to converge to $v_\pi$: in the mean for a sufficiently small constant step-size parameter, and with probability 1 if the step-size parameter decreases according to the usual stochastic approximation conditions 48 | \item It is an open question as to whether TD methods converge faster than constant-$\alpha$ MC methods in general, though this seems to be the case in practice 49 | \end{itemize} 50 | 51 | \subsection{Optimality of TD(0)} 52 | Given a finite number of training steps or episodes, a common method for estimating $V$ is to present the experience repeatedly until $V$ converges. We call the following \emph{batch updating}: given finite experience following a policy and an approximate value function $V$, calculate the increments for each $t$ that is non-terminal and change $V$ once by the sum of all the increments. Repeat until $V$ converges.\\ 53 | 54 | Under batch updating, we can make some comments on the strengths of TD(0) relative to MC. In the online setting we can do no better than guess that online TD is faster than constant-$\alpha$ MC, because it moves towards the batch-updating solution. 55 | 56 | \begin{itemize} 57 | \item Under batch updating, MC methods always find the estimates that minimize the mean-squared error on the training set. 58 | \item Under batch updating, TD methods always find the estimate that would be exactly correct for the maximum-likelihood model of the Markov process. The maximum-likelihood model is the one in which the estimates of the transition probabilities are the fractions of observed occurrences of each transition. 59 | \item We call the value function calculated from the maximum-likelihood model the \emph{certainty-equivalence estimate} because it is equivalent to assuming that the estimate of the underlying process is exact. In general, batch TD(0) converges to the certainty-equivalence estimate. 60 | \end{itemize} 61 | 62 | \subsection{Sarsa: On-policy TD Control} 63 | We now use TD methods to attack the control problem. The \emph{Sarsa} update is as follows 64 | \begin{equation} 65 | Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \alpha[R_{t+1} + \gamma Q(S_{t+1}, A_{t+1}) - Q(S_t, A_t)]. 66 | \end{equation} 67 | This update is done after every transition from a non-terminal state $S_t$. If $S_{t+1}$ is terminal then we set $Q(S_{t+1}, A_{t+1}) = 0$. Note that this rule uses the elements $(S_t, A_t, R_{t+1}, S_{t+1}, A_{t+1})$, which gives rise to the name Sarsa. The theorems regarding convergence of the state-value version of this update apply here too. \\ 68 | 69 | We write an on-policy control algorithm using Sarsa in the box below; at each time step we move the policy towards greediness with respect to the current action-value function. Sarsa converges with probability 1 to an optimal policy and action-value function as long as all state--action pairs are visited infinitely often and the policy converges to the greedy policy in the limit (e.g. $\pi$ could be $\varepsilon$-greedy with $\varepsilon=\frac1t$).
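\mbox{}\\
The boxed Sarsa control algorithm from the book is reproduced just below; as an informal illustration of the update, here is a sketch of one episode of tabular Sarsa with an $\varepsilon$-greedy policy. The \texttt{env.reset()}/\texttt{env.step()} interface and the helper names are assumptions for the example, not the interface of the accompanying code.
\begin{verbatim}
import numpy as np

def eps_greedy(Q, s, eps, rng):
    # epsilon-greedy action selection with respect to the current Q
    if rng.random() < eps:
        return int(rng.integers(Q.shape[1]))
    return int(np.argmax(Q[s]))

def sarsa_episode(env, Q, alpha=0.5, gamma=1.0, eps=0.1, rng=None):
    # Assumed interface: env.reset() -> s, env.step(a) -> (s2, reward, done).
    rng = rng or np.random.default_rng()
    s = env.reset()
    a = eps_greedy(Q, s, eps, rng)
    done = False
    while not done:
        s2, r, done = env.step(a)
        a2 = eps_greedy(Q, s2, eps, rng)
        target = r if done else r + gamma * Q[s2, a2]  # terminal Q taken as zero
        Q[s, a] += alpha * (target - Q[s, a])
        s, a = s2, a2
    return Q
\end{verbatim}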
\\ 70 | 71 | \includegraphics[width=\textwidth]{\NotesImages/sarsa_on_policy_control.png} \\ 72 | 73 | \subsection{Q-learning: Off-policy TD Control} 74 | The \emph{Q-learning} update is 75 | \begin{equation} 76 | Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \alpha [R_{t+1} + \gamma \max_a Q(S_{t+1}, a) - Q(S_t, A_t)]. 77 | \end{equation} 78 | The learned function $Q$ directly approximates $q_*$, independently of the policy being followed (which is what makes this an off-policy method). All that is required for convergence is that all state--action pairs continue to be updated (along with the usual stochastic approximation conditions on the step-size). An algorithm for Q-learning is given in the box below. \\ 79 | 80 | \includegraphics[width=\textwidth]{\NotesImages/q_learning.png}\\ 81 | 82 | \subsection{Expected Sarsa} 83 | The update rule for \emph{Expected Sarsa} is 84 | \begin{align} 85 | Q(S_t, A_t) & \leftarrow Q(S_t, A_t) + \alpha [R_{t+1} + \gamma \E{}[Q(S_{t+1}, A_{t+1})|S_{t+1}] - Q(S_t, A_t)] \\ 86 | & \leftarrow Q(S_t, A_t) + \alpha [R_{t+1} + \gamma \sum_a \pi(a|S_{t+1}) Q(S_{t+1}, a) - Q(S_t, A_t)]. 87 | \end{align} 88 | This algorithm moves deterministically in the same direction as Sarsa moves in \emph{expectation}, hence the name. It is more computationally complex than Sarsa, but eliminates the variance due to the random selection of $A_{t+1}$. Given the same amount of experience, it generally performs slightly better than Sarsa. 89 | 90 | \subsection{Maximisation Bias and Double Learning} 91 | All the control algorithms we have discussed so far involve some sort of maximisation in the construction of their target policies. This introduces a positive bias into the value estimates, because the maximum is taken over uncertain estimates of the true values. This is known as the \emph{maximisation bias}. It comes down to the fact that the expectation of the maximum of a set of sample estimates is greater than or equal to the maximum of their expected values.\\ 92 | 93 | To address this we introduce the idea of \emph{double learning}, in which we learn two independent sets of value estimates, $Q_1$ and $Q_2$. At each time step we choose one of them at random and update it, using it to select the maximising action and using the other to provide an unbiased estimate of that action's value for the target. This removes the maximisation bias from the target; the two estimates can be averaged (or summed) when selecting actions. Below we show an algorithm for \emph{double Q-learning}. \\ 94 | 95 | \includegraphics[width=\textwidth]{\NotesImages/double_q_learning.png} \\ 96 | 97 | 98 | \subsection{Games, Afterstates, and other Special Cases} 99 | In this book we try to present a uniform approach to solving tasks, but sometimes more specific methods can do much better. \\ 100 | 101 | We introduce the idea of \emph{afterstates}. Afterstates are relevant when the agent can deterministically change some aspect of the environment. In these cases, it is often better to value the resulting state of the environment, after the agent has acted but before any stochastic response, as this can reduce computation and speed convergence.\\ 102 | 103 | Take chess as an example. One should choose as states the board positions \emph{after} the agent has taken a move, rather than before. This is because multiple (position, move) pairs at $t$ can lead, via deterministic moves of the agent, to the same board position that the opponent then faces at $t+1$; valuing this common afterstate lets whatever is learned about it apply to all of them.
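\mbox{}\\
To make the double-learning update discussed above concrete, here is a rough sketch of a single tabular double Q-learning update. The transition-tuple format, argument names and default parameters are assumptions for the example.
\begin{verbatim}
import numpy as np

def double_q_update(Q1, Q2, s, a, r, s2, done,
                    alpha=0.1, gamma=0.99, rng=None):
    # One tabular double Q-learning update for the transition (s, a, r, s2).
    rng = rng or np.random.default_rng()
    if rng.random() < 0.5:
        Q_upd, Q_eval = Q1, Q2     # update Q1, evaluating the argmax with Q2
    else:
        Q_upd, Q_eval = Q2, Q1
    if done:
        target = r                 # terminal state has value zero
    else:
        a_star = int(np.argmax(Q_upd[s2]))       # argmax from the table being updated
        target = r + gamma * Q_eval[s2, a_star]  # value estimate from the other table
    Q_upd[s, a] += alpha * (target - Q_upd[s, a])
    return Q1, Q2
\end{verbatim}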
104 | -------------------------------------------------------------------------------- /notes/chapters/chapter7/chapter7.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brynhayder/reinforcement_learning_an_introduction/d8b1945f61a8397b684f8d8d800ed0d9308a9a35/notes/chapters/chapter7/chapter7.pdf -------------------------------------------------------------------------------- /notes/chapters/chapter7/chapter7.tex: -------------------------------------------------------------------------------- 1 | \input{../../../header} 2 | 3 | \begin{document} 4 | \include{chapter7_content} 5 | \end{document} -------------------------------------------------------------------------------- /notes/chapters/chapter7/chapter7_content.tex: -------------------------------------------------------------------------------- 1 | \section{$n$-step Bootstrapping} 2 | $n$-step methods allow us to observe multiple time-steps of rewards before updating a state, using the observed rewards together with a bootstrapped estimate of the value of the $n$th succeeding state. 3 | 4 | \subsection{$n$-step TD Prediction} 5 | Define the $n$-step return 6 | \begin{equation} 7 | G_{t:t+n} \doteq \sum_{i=t}^{t+n-1}\gamma^{i-t}R_{i+1} + \gamma^n V_{t+n-1}(S_{t+n}) 8 | \end{equation} 9 | where $n \geq 1$, $0 \leq t < T - n$ and $V_i$ is the estimated state-value function as of time $i$. If $t + n \geq T$ then $G_{t:t+n} \equiv G_t$, the standard (full) return. The $n$-step return is the target for \emph{$n$-step TD} methods; note that the first $n$ rewards are observed and the value of the $n$th succeeding state is bootstrapped with the latest estimate of the value function. The corresponding update for state-values is 10 | \begin{equation} 11 | V_{t+n}(S_t) = V_{t+n - 1}(S_t) + \alpha [G_{t:t+n} - V_{t+n-1}(S_{t})] \quad\quad 0 \leq t < T. 12 | \end{equation} 13 | Note that Monte-Carlo can be thought of as the limiting case $n = \infty$. Pseudocode for $n$-step TD is given in the box below.\\ 14 | 15 | \includegraphics[width=\textwidth]{\NotesImages/n_step_td_state_values.png}\\ 16 | 17 | The $n$-step return obeys the \emph{error-reduction property}, and because of this $n$-step TD can be shown to converge to correct predictions (given a policy) under appropriate technical conditions. This property states that the $n$-step return is a better estimate than $V_{t+n-1}$ in the sense that the error on the worst prediction is always smaller 18 | \begin{equation} 19 | \max_s \left|\Epi[G_{t:t+n}|S_t=s] - v_\pi(s)\right| \leq \gamma^n \max_s \left|V_{t+n-1}(s) - v_\pi(s)\right| 20 | \end{equation} 21 | 22 | \subsection{$n$-step Sarsa} 23 | \subsubsection*{Sarsa} 24 | We develop $n$-step methods for control. We generalise Sarsa to $n$-step Sarsa, or Sarsa($n$). This is done in much the same way as above, but with action-values as opposed to state-values. The $n$-step return in this case is defined as 25 | \begin{equation} 26 | G_{t:t+n} \doteq \sum_{i=t}^{t+n-1}\gamma^{i-t}R_{i+1} + \gamma^n Q_{t+n-1}(S_{t+n}, A_{t+n}) 27 | \end{equation} 28 | where $n \geq 1$, $0 \leq t < T - n$ and $Q_i$ is the estimated action-value function as of time $i$. If $t + n \geq T$ then $G_{t:t+n} \equiv G_t$, the standard return. The corresponding update is 29 | \begin{equation} 30 | Q_{t+n}(S_t, A_t) = Q_{t+n-1}(S_t, A_t) + \alpha [G_{t:t+n} - Q_{t+n-1}(S_{t}, A_{t})] \quad\quad 0 \leq t < T.
31 | \end{equation} 32 | 33 | \subsubsection*{Expected Sarsa} 34 | We define the $n$-step return for Expected Sarsa similarly 35 | \begin{equation} 36 | G_{t:t+n} \doteq \sum_{i=t}^{t+n-1}\gamma^{i-t}R_{i+1} + \gamma^n \bar{V}_{t+n-1}(S_{t+n}) 37 | \end{equation} 38 | where $n \geq 1$, $0 \leq t < T - n$ and $\bar{V}_i$ is the \emph{expected approximate value} of state $s$ 39 | \begin{equation} 40 | \bar{V}_i(s) \doteq \sum_a \pi(a|s)Q_i(s, a). 41 | \end{equation} 42 | As always, if $t + n \geq T$ then $G_{t:t+n} \equiv G_t$, the standard return. The corresponding update is formally the same as above\\ 43 | 44 | 45 | \includegraphics[width=\textwidth]{\NotesImages/n_step_sarsa.png}\\ 46 | 47 | \subsection{$n$-step Off-policy Learning} 48 | We can learn with $n$-step methods off-policy using the importance sampling ratio (target policy $\pi$ and behaviour policy $b$) 49 | \[ 50 | \rho_{t:h} \doteq \prod_{k=t}^{\text{min}(h, T-1)}\frac{\pi(A_k|S_k)}{b(A_k|S_k)}. 51 | \] 52 | For state-values we have 53 | \[ 54 | V_{t+n}(S_t) \doteq V_{t+n-1}(S_t) + \alpha \rho_{t:t+n-1}[G_{t:t+n} - V_{t+n-1}(S_t)] 55 | \] 56 | and for action-values we have 57 | \[ 58 | Q_{t+n}(S_t, A_t) = Q_{t+n-1}(S_t, A_t) + \alpha \rho_{t+1:t+n}[G_{t:t+n} - Q_{t+n-1}(S_t, A_t)] 59 | \] 60 | note that for action-values the importance sampling ratio starts (and ends) one time-step later than for state-values: having already taken $A_t$, we want to learn fully from what follows, and only the subsequent action selections need to be reweighted.\\ 61 | 62 | \includegraphics[width=\textwidth]{\NotesImages/off_policy_n_step_sarsa.png}\\ 63 | 64 | \subsection{*Per-decision Methods with Control Variates} 65 | We have the standard recursion relation for the $n$-step return 66 | \[ 67 | G_{t:h} = R_{t+1} + \gamma G_{t+1:h}. 68 | \] 69 | For an off-policy algorithm, one would be tempted simply to weight this target by the importance sampling ratio. However, whenever the ratio is zero this shrinks the estimated values towards zero, which increases variance. We therefore introduce the \emph{control variate} $(1 - \rho_t)V_{h-1}(S_t)$, giving an off-policy return of 70 | \[ 71 | G_{t:h} = \rho_t (R_{t+1} + \gamma G_{t+1:h}) + (1 - \rho_t)V_{h-1}(S_t) 72 | \] 73 | where $G_{h:h} = V_{h-1}(S_h)$. Note that the control variate has expected value 0: conditional on $S_t$, the importance sampling ratio $\rho_t$ has expectation 1 while $V_{h-1}(S_t)$ is fixed, so the two factors are uncorrelated.\\ 74 | 75 | We can do a similar thing for action-values 76 | \[ 77 | G_{t:h} \doteq R_{t+1} + \gamma \rho_{t+1}\left( G_{t+1:h} - Q_{h-1}(S_{t+1}, A_{t+1}) \right) + \gamma \bar{V}_{h-1}(S_{t+1}), 78 | \] 79 | where once again the importance sampling correction starts one time-step later. 80 | 81 | \subsubsection*{Control Variates in General} 82 | Suppose we want to estimate $\mu$ and that we have an unbiased estimator $m$ for $\mu$. Suppose we calculate another statistic $t$ such that $\mathbb{E}\left[t\right]=\tau$ is a known value. Then 83 | \[ 84 | m^\star = m + c\left(t-\tau\right) 85 | \] 86 | is also an unbiased estimator for $\mu$ for any $c$, with variance 87 | \[ 88 | \textrm{Var}\left(m^{\star}\right)=\textrm{Var}\left(m\right) + c^2\,\textrm{Var}\left(t\right) + 2c\,\textrm{Cov}\left(m,t\right). 89 | \] 90 | 91 | It is easy to see that taking 92 | 93 | \[ 94 | c = - \frac{\textrm{Cov}\left(m,t\right)}{\textrm{Var}\left(t\right)} 95 | \] 96 | minimizes the variance of $m^{\star}$.
With this choice 97 | 98 | \begin{align} 99 | \textrm{Var}(m^{\star}) & =\textrm{Var}(m) - \frac{\left[\textrm{Cov}(m,t)\right]^2}{\textrm{Var}(t)} \\ 100 | & = (1-\rho_{m,t}^2)\textrm{Var}(m) 101 | \end{align} 102 | where $\rho_{m,t}=\textrm{Corr}\left(m,t\right) $ is the Pearson correlation coefficient of $m$ and $t$. The greater the value of $|\rho_{m,t}|$, the greater the variance reduction achieved. 103 | 104 | \subsection{Off-policy Learning Without Importance Sampling: The $n$-step Tree Backup Algorithm} 105 | We introduce the $n$-step \emph{tree-backup} algorithm, which uses the return 106 | \begin{equation} 107 | G_{t:t+n} \doteq R_{t+1} + \gamma \sum_{a \neq A_{t+1}} \pi(a|S_{t+1})Q_{t+n-1}(S_{t+1}, a) + \gamma \pi(A_{t+1}|S_{t+1})G_{t+1:t+n} 108 | \end{equation} 109 | for $t < T-1$ and $n \geq 2$, with the base cases $G_{t:t+1} \doteq R_{t+1} + \gamma \sum_a \pi(a|S_{t+1})Q_t(S_{t+1}, a)$ (the one-step return of Expected Sarsa) and $G_{T-1:t+n} \doteq R_T$. This algorithm updates the pair $(S_t, A_t)$ using the bootstrapped, probability-weighted action-values of \emph{all} the actions that were not taken along the trajectory, and recursively includes the rewards actually realised, weighted by the target-policy probabilities of the actions that preceded them. Pseudocode is given below.\\ 110 | 111 | \includegraphics[width=\textwidth]{\NotesImages/n_step_tree_backup.png}\\ 112 | 113 | 114 | \subsection{*A Unifying Algorithm: $n$-step $Q(\sigma)$} 115 | We introduce an algorithm which, at each time step, can choose either to take a sampled action as in Sarsa or to take an expectation over all possible actions as in tree backup. \\ 116 | 117 | Define a sequence $\sigma_t \in [0, 1]$ giving the degree of sampling at each time step: $\sigma_t = 1$ denotes full sampling (as in Sarsa) and $\sigma_t = 0$ a pure expectation (as in tree backup). This generalises Sarsa and tree backup by allowing each update to be a linear combination of the two ideas. The corresponding return (off-policy) is 118 | \begin{align} 119 | G_{t:h} \doteq R_{t+1} &+ \gamma \left(\sigma_{t+1}\rho_{t+1} + (1- \sigma_{t+1})\pi(A_{t+1}\vert S_{t+1})\right) \left( G_{t+1:h} - Q_{h-1}(S_{t+1}, A_{t+1}) \right) \\ 120 | &+ \gamma \bar{V}_{h-1}(S_{t+1}), 121 | \end{align} 122 | for $t < h< T$, with $G_{h:h} \doteq Q_{h-1}(S_h, A_h)$ if $h