├── CODEOWNERS
├── cheatsheet.pdf
├── udacity-logo.png
├── README.md
├── LICENSE.txt
├── .github
│   └── workflows
│       └── manual.yml
└── cheatsheet.tex

/CODEOWNERS:
--------------------------------------------------------------------------------
* @udacity/active-public-content
--------------------------------------------------------------------------------
/cheatsheet.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/rl-cheatsheet/HEAD/cheatsheet.pdf
--------------------------------------------------------------------------------
/udacity-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/rl-cheatsheet/HEAD/udacity-logo.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Reinforcement Learning (RL) Cheatsheet

You are encouraged to use the [PDF file](https://github.com/udacity/rl-cheatsheet/blob/master/cheatsheet.pdf) in the repository to guide your study of RL.

If you would like to learn how to implement these algorithms, please check out Udacity's [Machine Learning Engineer Nanodegree Program](http://www.udacity.com/course/machine-learning-engineer-nanodegree--nd009).
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
Copyright (c) 2017 Udacity, Inc.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/.github/workflows/manual.yml:
--------------------------------------------------------------------------------
# Workflow to ensure that whenever a GitHub PR is submitted,
# a Jira ticket gets created automatically.
name: Manual Workflow

# Controls when the action will run.
on:
  # Triggers the workflow on pull request events (opened or reopened)
  pull_request_target:
    types: [opened, reopened]

  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:

jobs:
  test-transition-issue:
    name: Convert Github Issue to Jira Issue
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@master

      - name: Login
        uses: atlassian/gajira-login@master
        env:
          JIRA_BASE_URL: ${{ secrets.JIRA_BASE_URL }}
          JIRA_USER_EMAIL: ${{ secrets.JIRA_USER_EMAIL }}
          JIRA_API_TOKEN: ${{ secrets.JIRA_API_TOKEN }}

      - name: Create NEW JIRA ticket
        id: create
        uses: atlassian/gajira-create@master
        with:
          project: CONUPDATE
          issuetype: Task
          summary: |
            Github PR [Assign the ND component] | Repo: ${{ github.repository }} | PR# ${{github.event.number}}
          description: |
            Repo link: https://github.com/${{ github.repository }}
            PR no. ${{ github.event.pull_request.number }}
            PR title: ${{ github.event.pull_request.title }}
            PR description: ${{ github.event.pull_request.body }}
            In addition, please resolve other issues, if any.
          fields: '{"components": [{"name":"nd013 - Self Driving Car Engineer ND"}], "customfield_16449":"https://classroom.udacity.com/", "customfield_16450":"Resolve the PR", "labels": ["github"], "priority":{"id": "4"}}'

      - name: Log created issue
        run: echo "Issue ${{ steps.create.outputs.issue }} was created"
--------------------------------------------------------------------------------
/cheatsheet.tex:
--------------------------------------------------------------------------------
\documentclass[10pt]{amsart}
\usepackage[top=1in, bottom=1in, left=1in, right=1in]{geometry}
\geometry{letterpaper}
\geometry{landscape}
%\usepackage[parfill]{parskip} % Activate to begin paragraphs with an empty line rather than an indent
\usepackage{graphicx}
\usepackage{amssymb}
\usepackage{epstopdf}
\usepackage{tabto}
\usepackage{empheq, comment}
\usepackage[ruled]{algorithm2e}
\usepackage{fancyhdr}
\renewcommand{\headrulewidth}{0pt}
\fancyhead[L]{}
\fancyhead[R]{
  \includegraphics[width=4cm]{udacity-logo.png}
}
\DeclareGraphicsRule{.tif}{png}{.png}{`convert #1 `dirname #1`/`basename #1 .tif`.png}
\pagestyle{fancy}

\title{Reinforcement Learning}

\begin{document}
\maketitle
\thispagestyle{fancy}

\section{The Problem}

\begin{itemize}
\item[] $S_t$ \tabto{2cm} state at time $t$
\item[] $A_t$ \tabto{2cm} action at time $t$
\item[] $R_t$ \tabto{2cm} reward at time $t$
\item[] $\gamma$ \tabto{2cm} discount rate (where $0 \leq \gamma \leq 1$)
\item[] $G_t$ \tabto{2cm} discounted return at time $t$ ($\sum_{k=0}^\infty \gamma^k R_{t+k+1}$)
\item[] $\mathcal{S}$ \tabto{2cm} set of all nonterminal states
\item[] $\mathcal{S}^+$ \tabto{2cm} set of all states (including terminal states)
\item[] $\mathcal{A}$ \tabto{2cm} set of all actions
\item[] $\mathcal{A}(s)$ \tabto{2cm} set of all actions available in state $s$
\item[] $\mathcal{R}$ \tabto{2cm} set of all rewards
\item[] $p(s',r|s,a)$ \tabto{2cm} probability of next state $s'$ and reward $r$, given current state $s$ and current action $a$ ($\mathbb{P}(S_{t+1}=s', R_{t+1}=r|S_t = s, A_t = a)$)
\end{itemize}
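
\vspace{.2in}
As a quick numerical illustration of the discounted return (with an arbitrarily chosen reward sequence), suppose $\gamma = 0.9$ and the agent collects rewards $R_{t+1} = 1$, $R_{t+2} = 2$, and $R_{t+3} = 3$ before the episode terminates. Then
\begin{equation*}
G_t = R_{t+1} + \gamma R_{t+2} + \gamma^2 R_{t+3} = 1 + 0.9 \cdot 2 + 0.81 \cdot 3 = 5.23.
\end{equation*}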
\section{The Solution}
\begin{itemize}
\item[] $\pi$ \tabto{2cm} policy
\item[] \tabto{2.5cm} \textit{if deterministic}: $\pi(s) \in \mathcal{A}(s)$ for all $s \in \mathcal{S}$
\item[] \tabto{2.5cm} \textit{if stochastic}: $\pi(a|s) = \mathbb{P}(A_t=a|S_t=s)$ for all $s \in \mathcal{S}$ and $a \in \mathcal{A}(s)$
\item[] $v_\pi$ \tabto{2cm} state-value function for policy $\pi$ ($v_\pi(s) \doteq \mathbb{E}[G_t|S_t=s]$ for all $s\in\mathcal{S}$)
\item[] $q_\pi$ \tabto{2cm} action-value function for policy $\pi$ ($q_\pi(s,a) \doteq \mathbb{E}[G_t|S_t=s, A_t=a]$ for all $s \in \mathcal{S}$ and $a \in \mathcal{A}(s)$)
\item[] $v_*$ \tabto{2cm} optimal state-value function ($v_*(s) \doteq \max_\pi v_\pi(s)$ for all $s \in \mathcal{S}$)
\item[] $q_*$ \tabto{2cm} optimal action-value function ($q_*(s,a) \doteq \max_\pi q_\pi(s,a)$ for all $s \in \mathcal{S}$ and $a \in \mathcal{A}(s)$)
\end{itemize}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage

\section{Bellman Equations}

\subsection{Bellman Expectation Equations}

\begin{empheq}[box=\fbox]{align}
v_\pi(s) = \sum_{a \in \mathcal{A}(s)}\pi(a|s)\sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r + \gamma v_\pi(s'))\nonumber
\end{empheq}

\begin{empheq}[box=\fbox]{align}
q_\pi(s,a) = \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r + \gamma\sum_{a' \in \mathcal{A}(s')} \pi(a'|s') q_\pi(s',a'))\nonumber
\end{empheq}

\subsection{Bellman Optimality Equations}
\begin{empheq}[box=\fbox]{align}
v_*(s) = \max_{a \in \mathcal{A}(s)}\sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r + \gamma v_*(s')) \nonumber
\end{empheq}

\begin{empheq}[box=\fbox]{align}
q_*(s,a) = \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r + \gamma \max_{a'\in\mathcal{A}(s')}q_*(s',a')) \nonumber
\end{empheq}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\subsection{Useful Formulas for Deriving the Bellman Equations}

\begin{equation*}
v_\pi(s) = \sum_{a \in \mathcal{A}(s)}\pi(a|s)q_\pi(s,a)
\end{equation*}

\begin{equation*}
v_*(s) = \max_{a \in \mathcal{A}(s)}q_*(s,a)
\end{equation*}

\begin{equation*}
q_\pi(s,a) = \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r + \gamma v_\pi(s'))
\end{equation*}

\begin{equation*}
q_*(s,a) = \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r + \gamma v_*(s'))
\end{equation*}

\begin{align*}
q_\pi(s,a) &\doteq \mathbb{E}_{\pi}[ G_t | S_t = s, A_t = a ] & (1)\\
&= \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}\mathbb{P}(S_{t+1}=s', R_{t+1}=r|S_t=s, A_t=a)\mathbb{E}_{\pi}[ G_{t} | S_t = s, A_t = a, S_{t+1}=s', R_{t+1}=r] & (2)\\
&= \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)\mathbb{E}_{\pi}[ G_{t} | S_t = s, A_t = a, S_{t+1}=s', R_{t+1}=r] & (3)\\
&= \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)\mathbb{E}_{\pi}[ G_{t} | S_{t+1}=s', R_{t+1}=r] & (4)\\
&= \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)\mathbb{E}_{\pi}[ R_{t+1} + \gamma G_{t+1} | S_{t+1}=s', R_{t+1}=r] & (5)\\
&= \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r + \gamma \mathbb{E}_\pi[G_{t+1} | S_{t+1}=s'] ) & (6)\\
&= \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r + \gamma v_\pi(s') ) & (7)
\end{align*}

\vspace{.5in}

The reasoning for the above is as follows:
\vspace{.2in}
\begin{itemize}
\item (1) by definition ($q_\pi(s,a) \doteq \mathbb{E}_{\pi}[ G_t | S_t = s, A_t = a ]$) \\
\item (2) Law of Total Expectation\\
\item (3) by definition ($p(s',r|s,a)\doteq\mathbb{P}(S_{t+1}=s', R_{t+1}=r|S_t=s, A_t=a)$)\\
\item (4) $\mathbb{E}_{\pi}[ G_{t} | S_t = s, A_t = a, S_{t+1}=s', R_{t+1}=r] = \mathbb{E}_{\pi}[ G_{t} | S_{t+1}=s', R_{t+1}=r]$ (by the Markov property)\\
\item (5) $G_t = R_{t+1} + \gamma G_{t+1}$\\
\item (6) Linearity of Expectation\\
\item (7) $v_\pi(s') = \mathbb{E}_\pi[G_{t+1} | S_{t+1}=s']$
\end{itemize}
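
\vspace{.2in}
As an illustrative companion to the backup equations above (not part of the standard notation), the following Python-style sketch shows how a single Bellman expectation backup can be computed when the one-step dynamics are stored as a dictionary \texttt{P[s][a]} of \texttt{(prob, next\_state, reward)} triples and a stochastic policy as \texttt{pi[s]}. The two-state MDP at the end is an arbitrary toy example.

\begin{verbatim}
def bellman_expectation_backup(P, pi, V, s, gamma=0.9):
    """Right-hand side of the Bellman expectation equation, evaluated at state s."""
    value = 0.0
    for a, action_prob in pi[s].items():          # sum over actions a
        for prob, next_s, reward in P[s][a]:      # sum over (s', r) pairs
            value += action_prob * prob * (reward + gamma * V[next_s])
    return value

# Toy example: state 1 is absorbing/terminal with zero reward.
P = {0: {0: [(1.0, 1, 5.0)],
         1: [(0.5, 0, 1.0), (0.5, 1, 0.0)]},
     1: {0: [(1.0, 1, 0.0)]}}
pi = {0: {0: 0.5, 1: 0.5}, 1: {0: 1.0}}
V = {0: 0.0, 1: 0.0}
print(bellman_expectation_backup(P, pi, V, 0))    # 2.75
\end{verbatim}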
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage

\section{Dynamic Programming}

%%%%%%%%%%%% 1 POLICY EVALUATION
\begin{algorithm}
\KwIn{MDP, policy $\pi$, small positive number $\theta$}
\KwOut{$V \approx v_\pi$}
Initialize $V$ arbitrarily (e.g., $V(s)=0$ for all $s \in \mathcal{S}^+$)\\
\Repeat{$\Delta < \theta$}{
  $\Delta \leftarrow 0$\\
  \For{$s \in \mathcal{S}$}{
    $v \leftarrow V(s)$\\
    $V(s) \leftarrow \sum_{a\in\mathcal{A}(s)} \pi(a|s) \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r + \gamma V(s'))$\\
    $\Delta \leftarrow \max(\Delta, |v-V(s)|)$
  }
}
\KwRet{$V$}
\caption{Policy Evaluation}
\end{algorithm}

%%%%%%%%%%%% 2 ESTIMATION OF ACTION VALUES
\begin{algorithm}
\KwIn{MDP, state-value function $V$}
\KwOut{action-value function $Q$}

\For{$s \in \mathcal{S}$}{
  \For{$a \in \mathcal{A}(s)$}{
    $Q(s,a) \leftarrow \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r+\gamma V(s'))$
  }
}
\KwRet{$Q$}
\caption{Estimation of Action Values}
\end{algorithm}

%%%%%%%%%%%% 3 POLICY IMPROVEMENT
\begin{algorithm}
\KwIn{MDP, value function $V$}
\KwOut{policy $\pi'$}

\For{$s \in \mathcal{S}$}{
  \For{$a \in \mathcal{A}(s)$}{
    $Q(s,a) \leftarrow \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r+\gamma V(s'))$
  }
  $\pi'(s) \leftarrow \arg\max_{a\in\mathcal{A}(s)}Q(s,a)$
}
\KwRet{$\pi'$}
\caption{Policy Improvement}
\end{algorithm}

%%%%%%%%%%%% 4 POLICY ITERATION
\begin{algorithm}
\KwIn{MDP, small positive number $\theta$}
\KwOut{policy $\pi \approx \pi_*$}
Initialize $\pi$ arbitrarily (e.g., $\pi(a|s)=\frac{1}{|\mathcal{A}(s)|}$ for all $s \in \mathcal{S}$ and $a \in \mathcal{A}(s)$)\\
$policy\text{-}stable \leftarrow false$\\
\Repeat{$policy\text{-}stable = true$}{
  $V \leftarrow \textbf{Policy\_Evaluation}(\text{MDP}, \pi, \theta)$\\
  $\pi' \leftarrow \textbf{Policy\_Improvement}(\text{MDP}, V)$\\
  \If{$\pi = \pi'$}{
    $policy\text{-}stable \leftarrow true$\\
  }
  $\pi \leftarrow \pi'$
}
\KwRet{$\pi$}
\caption{Policy Iteration}
\end{algorithm}
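
The pseudocode in Algorithms 1, 3, and 4 translates fairly directly into code. The sketch below is only an illustration (it is not part of the original cheatsheet): the dictionary layout \texttt{P[s][a]} of \texttt{(prob, next\_state, reward)} triples, the function names, and the default values of $\gamma$ and $\theta$ are assumptions made for this example.

\begin{verbatim}
def policy_evaluation(P, pi, gamma=0.9, theta=1e-8):
    """Iterative policy evaluation (Algorithm 1), sweeping states in place."""
    V = {s: 0.0 for s in P}
    while True:
        delta = 0.0
        for s in P:
            v = sum(pi[s][a] * sum(p * (r + gamma * V[s1]) for p, s1, r in P[s][a])
                    for a in P[s])
            delta = max(delta, abs(v - V[s]))
            V[s] = v
        if delta < theta:
            return V

def policy_improvement(P, V, gamma=0.9):
    """Greedy policy w.r.t. V (Algorithm 3), written as a one-hot action distribution."""
    pi = {}
    for s in P:
        q = {a: sum(p * (r + gamma * V[s1]) for p, s1, r in P[s][a]) for a in P[s]}
        best = max(q, key=q.get)
        pi[s] = {a: 1.0 if a == best else 0.0 for a in P[s]}
    return pi

def policy_iteration(P, gamma=0.9, theta=1e-8):
    """Alternate evaluation and improvement until the policy is stable (Algorithm 4)."""
    pi = {s: {a: 1.0 / len(P[s]) for a in P[s]} for s in P}   # equiprobable start
    while True:
        V = policy_evaluation(P, pi, gamma, theta)
        new_pi = policy_improvement(P, V, gamma)
        if new_pi == pi:
            return new_pi, V
        pi = new_pi
\end{verbatim}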
%%%%%%%%%%%% 5 TRUNCATED POLICY EVALUATION
\begin{algorithm}
\KwIn{MDP, policy $\pi$, value function $V$, positive integer $max\_iterations$}
\KwOut{$V \approx v_\pi$ (if $max\_iterations$ is large enough)}
$counter \leftarrow 0$\\
\While{$counter < max\_iterations$}{
  \For{$s \in \mathcal{S}$}{
    $V(s) \leftarrow \sum_{a\in\mathcal{A}(s)} \pi(a|s) \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r + \gamma V(s'))$\\
  }
  $counter \leftarrow counter + 1$
}
\KwRet{$V$}
\caption{Truncated Policy Evaluation}
\end{algorithm}

%%%%%%%%%%%% 6 TRUNCATED POLICY ITERATION
\begin{algorithm}
\KwIn{MDP, positive integer $max\_iterations$, small positive number $\theta$}
\KwOut{policy $\pi \approx \pi_*$}
Initialize $V$ arbitrarily (e.g., $V(s)=0$ for all $s \in \mathcal{S}^+$)\\
Initialize $\pi$ arbitrarily (e.g., $\pi(a|s)=\frac{1}{|\mathcal{A}(s)|}$ for all $s \in \mathcal{S}$ and $a \in \mathcal{A}(s)$)\\
\Repeat{$\max_{s\in\mathcal{S}}|V(s) - V_{old}(s)| < \theta$}{
  $\pi \leftarrow \textbf{Policy\_Improvement}(\text{MDP}, V)$\\
  $V_{old} \leftarrow V$\\
  $V \leftarrow \textbf{Truncated\_Policy\_Evaluation}(\text{MDP}, \pi, V, max\_iterations)$
}
\KwRet{$\pi$}
\caption{Truncated Policy Iteration}
\end{algorithm}

%%%%%%%%%%%% 7 VALUE ITERATION
\begin{algorithm}
\KwIn{MDP, small positive number $\theta$}
\KwOut{policy $\pi \approx \pi_*$}
Initialize $V$ arbitrarily (e.g., $V(s)=0$ for all $s \in \mathcal{S}^+$)\\
\Repeat{$\Delta < \theta$}{
  $\Delta \leftarrow 0$\\
  \For{$s \in \mathcal{S}$}{
    $v \leftarrow V(s)$\\
    $V(s) \leftarrow \max_{a\in\mathcal{A}(s)}\sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r + \gamma V(s'))$\\
    $\Delta \leftarrow \max(\Delta, |v-V(s)|)$
  }
}
$\pi \leftarrow \textbf{Policy\_Improvement}(\text{MDP}, V)$ \\
\KwRet{$\pi$}
\caption{Value Iteration}
\end{algorithm}
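
For completeness, here is an equally informal sketch of Algorithm 7 (value iteration), under the same assumed \texttt{P[s][a]} layout as the previous sketch; the greedy extraction at the end plays the role of the final \textbf{Policy\_Improvement} step, returning a deterministic policy.

\begin{verbatim}
def value_iteration(P, gamma=0.9, theta=1e-8):
    """Value iteration (Algorithm 7) followed by greedy policy extraction."""
    V = {s: 0.0 for s in P}
    while True:
        delta = 0.0
        for s in P:
            v = max(sum(p * (r + gamma * V[s1]) for p, s1, r in P[s][a])
                    for a in P[s])
            delta = max(delta, abs(v - V[s]))
            V[s] = v
        if delta < theta:
            break
    # Deterministic greedy policy: pi[s] is the maximizing action itself.
    pi = {s: max(P[s], key=lambda a: sum(p * (r + gamma * V[s1])
                                         for p, s1, r in P[s][a]))
          for s in P}
    return pi, V
\end{verbatim}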
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\clearpage

\section{Monte Carlo Methods}

%%%%%%%%%%%% 8 FIRST-VISIT MC PREDICTION (STATE VALUES)
\begin{algorithm}
\KwIn{policy $\pi$, positive integer $num\_episodes$}
\KwOut{value function $V$ ($\approx v_\pi$ if $num\_episodes$ is large enough)}
Initialize $N(s) = 0$ for all $s\in\mathcal{S}$ \\
Initialize $returns\_sum(s) = 0$ for all $s\in\mathcal{S}$ \\
\For{$i \leftarrow 1 \textbf{ to } num\_episodes$}{
  Generate an episode $S_0, A_0, R_1, \ldots, S_T$ using $\pi$\\
  \For{$t \leftarrow 0 \textbf{ to }T-1$}{
    \uIf{$S_t$ is a first visit (with return $G_t$)}{
      $N(S_t) \leftarrow N(S_t) + 1$\\
      $returns\_sum(S_t) \leftarrow returns\_sum(S_t) + G_t$
    }
  }
}
$V(s) \leftarrow returns\_sum(s)/N(s)$ for all $s\in\mathcal{S}$\\
\KwRet{$V$}
\caption{First-Visit MC Prediction (\textit{for state values})}
\end{algorithm}

%%%%%%%%%%%% 9 FIRST-VISIT MC PREDICTION (ACTION VALUES)
\begin{algorithm}
\KwIn{policy $\pi$, positive integer $num\_episodes$}
\KwOut{value function $Q$ ($\approx q_\pi$ if $num\_episodes$ is large enough)}
Initialize $N(s,a) = 0$ for all $s\in\mathcal{S}, a\in\mathcal{A}(s)$ \\
Initialize $returns\_sum(s,a) = 0$ for all $s\in\mathcal{S}, a\in\mathcal{A}(s)$ \\
\For{$i \leftarrow 1 \textbf{ to } num\_episodes$}{
  Generate an episode $S_0, A_0, R_1, \ldots, S_T$ using $\pi$\\
  \For{$t \leftarrow 0 \textbf{ to }T-1$}{
    \uIf{$(S_t,A_t)$ is a first visit (with return $G_t$)}{
      $N(S_t, A_t) \leftarrow N(S_t, A_t) + 1$\\
      $returns\_sum(S_t, A_t) \leftarrow returns\_sum(S_t, A_t) + G_t$
    }
  }
}
$Q(s,a) \leftarrow returns\_sum(s,a)/N(s,a)$ for all $s\in\mathcal{S}$, $a\in\mathcal{A}(s)$\\
\KwRet{$Q$}
\caption{First-Visit MC Prediction (\textit{for action values})}
\end{algorithm}

%%%%%%%%%%%% 10 GLIE MC CONTROL
\begin{algorithm}
\KwIn{positive integer $num\_episodes$, GLIE $\{\epsilon_i\}$}
\KwOut{policy $\pi$ ($\approx \pi_*$ if $num\_episodes$ is large enough)}
Initialize $Q(s,a) = 0$ for all $s\in\mathcal{S}$ and $a\in\mathcal{A}(s)$ \\
Initialize $N(s,a) = 0$ for all $s\in\mathcal{S}, a\in\mathcal{A}(s)$ \\
\For{$i \leftarrow 1 \textbf{ to } num\_episodes$}{
  $\epsilon \leftarrow \epsilon_i$\\
  $\pi \leftarrow \epsilon\text{-greedy}(Q)$\\
  Generate an episode $S_0, A_0, R_1, \ldots, S_T$ using $\pi$\\
  \For{$t \leftarrow 0 \textbf{ to }T-1$}{
    \uIf{$(S_t,A_t)$ is a first visit (with return $G_t$)}{
      $N(S_t,A_t) \leftarrow N(S_t,A_t) + 1$\\
      $Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \frac{1}{N(S_t,A_t)}(G_t - Q(S_t, A_t))$
    }
  }
}
\KwRet{$\pi$}
\caption{First-Visit GLIE MC Control}
\end{algorithm}

%%%%%%%%%%%% 11 CONSTANT-ALPHA MC CONTROL
\begin{algorithm}
\KwIn{positive integer $num\_episodes$, small positive fraction $\alpha$, GLIE $\{\epsilon_i\}$}
\KwOut{policy $\pi$ ($\approx \pi_*$ if $num\_episodes$ is large enough)}
Initialize $Q$ arbitrarily (e.g., $Q(s,a) = 0$ for all $s\in\mathcal{S}$ and $a\in\mathcal{A}(s)$) \\
\For{$i \leftarrow 1 \textbf{ to } num\_episodes$}{
  $\epsilon \leftarrow \epsilon_i$\\
  $\pi \leftarrow \epsilon\text{-greedy}(Q)$\\
  Generate an episode $S_0, A_0, R_1, \ldots, S_T$ using $\pi$\\
  \For{$t \leftarrow 0 \textbf{ to }T-1$}{
    \uIf{$(S_t,A_t)$ is a first visit (with return $G_t$)}{
      $Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \alpha(G_t - Q(S_t, A_t))$
    }
  }
}
\KwRet{$\pi$}
\caption{First-Visit Constant-$\alpha$ (GLIE) MC Control}
\end{algorithm}
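
The Monte Carlo control pseudocode above leaves the environment interaction implicit. The sketch below (an illustration only, not part of the cheatsheet) makes those pieces concrete for Algorithm 11, using a made-up corridor environment, an assumed \texttt{reset()}/\texttt{step(a)} interface, and an $\epsilon_i = 1/i$ schedule.

\begin{verbatim}
import random
from collections import defaultdict

class Corridor:
    """Toy episodic task: states 0..3, start at 0, state 3 terminal.
    Action 0 moves left, action 1 moves right; -1 per step, +10 at the goal."""
    def reset(self):
        self.s = 0
        return self.s
    def step(self, a):
        self.s = max(0, self.s - 1) if a == 0 else self.s + 1
        done = (self.s == 3)
        return self.s, (10.0 if done else -1.0), done

def epsilon_greedy(Q, s, n_actions, eps):
    if random.random() < eps:
        return random.randrange(n_actions)
    return max(range(n_actions), key=lambda a: Q[(s, a)])

def mc_control(env, num_episodes=5000, alpha=0.02, gamma=1.0, n_actions=2):
    """First-visit constant-alpha MC control (Algorithm 11)."""
    Q = defaultdict(float)
    for i in range(1, num_episodes + 1):
        eps = 1.0 / i                      # assumed GLIE-style schedule
        # Generate an episode with the epsilon-greedy policy derived from Q.
        episode, s, done = [], env.reset(), False
        while not done:
            a = epsilon_greedy(Q, s, n_actions, eps)
            s_next, r, done = env.step(a)
            episode.append((s, a, r))
            s = s_next
        # Compute returns backwards; the last overwrite keeps each first visit.
        G, first_visit_return = 0.0, {}
        for s, a, r in reversed(episode):
            G = r + gamma * G
            first_visit_return[(s, a)] = G
        for (s, a), G in first_visit_return.items():
            Q[(s, a)] += alpha * (G - Q[(s, a)])
    return Q

Q = mc_control(Corridor())   # afterwards, the greedy policy should move right
\end{verbatim}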
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\clearpage

\section{Temporal-Difference Methods}

%%%%%%%%%%%% 12 TD(0)
\begin{algorithm}
\KwIn{policy $\pi$, positive integer $num\_episodes$, small positive fraction $\alpha$}
\KwOut{value function $V$ ($\approx v_\pi$ if $num\_episodes$ is large enough)}
Initialize $V$ arbitrarily (e.g., $V(s) = 0$ for all $s\in\mathcal{S}^+$) \\
\For{$i \leftarrow 1 \textbf{ to } num\_episodes$}{
  Observe $S_0$\\
  $t\leftarrow 0$\\
  \Repeat{$S_t$ is terminal}{
    Choose action $A_t$ using policy $\pi$\\
    Take action $A_t$ and observe $R_{t+1}, S_{t+1}$\\
    $V(S_t) \leftarrow V(S_t) + \alpha (R_{t+1} + \gamma V(S_{t+1}) - V(S_t))$\\
    $t \leftarrow t+1$
  }
}
\KwRet{$V$}
\caption{TD(0)}
\end{algorithm}

%%%%%%%%%%%% 13 Sarsa
\begin{algorithm}
\KwIn{policy $\pi$, positive integer $num\_episodes$, small positive fraction $\alpha$, GLIE $\{\epsilon_i\}$}
\KwOut{value function $Q$ ($\approx q_\pi$ if $num\_episodes$ is large enough)}
Initialize $Q$ arbitrarily (e.g., $Q(s,a) = 0$ for all $s\in\mathcal{S}$ and $a\in\mathcal{A}(s)$, and $Q(terminal\text{-}state, \cdot)=0$) \\
\For{$i \leftarrow 1 \textbf{ to } num\_episodes$}{
  $\epsilon \leftarrow \epsilon_i$\\
  Observe $S_0$\\
  Choose action $A_0$ using policy derived from $Q$ (e.g., $\epsilon$-greedy)\\
  $t\leftarrow 0$\\
  \Repeat{$S_t$ is terminal}{
    Take action $A_{t}$ and observe $R_{t+1}, S_{t+1}$\\
    Choose action $A_{t+1}$ using policy derived from $Q$ (e.g., $\epsilon$-greedy)\\
    $Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \alpha (R_{t+1} + \gamma Q(S_{t+1}, A_{t+1}) - Q(S_t, A_t))$\\
    $t \leftarrow t+1$
  }
}
\KwRet{$Q$}
\caption{Sarsa}
\end{algorithm}

%%%%%%%%%%%% 14 Q-Learning
\begin{algorithm}
\KwIn{policy $\pi$, positive integer $num\_episodes$, small positive fraction $\alpha$, GLIE $\{\epsilon_i\}$}
\KwOut{value function $Q$ ($\approx q_\pi$ if $num\_episodes$ is large enough)}
Initialize $Q$ arbitrarily (e.g., $Q(s,a) = 0$ for all $s\in\mathcal{S}$ and $a\in\mathcal{A}(s)$, and $Q(terminal\text{-}state, \cdot)=0$) \\
\For{$i \leftarrow 1 \textbf{ to } num\_episodes$}{
  $\epsilon \leftarrow \epsilon_i$\\
  Observe $S_0$\\
  $t\leftarrow 0$\\
  \Repeat{$S_t$ is terminal}{
    Choose action $A_t$ using policy derived from $Q$ (e.g., $\epsilon$-greedy)\\
    Take action $A_{t}$ and observe $R_{t+1}, S_{t+1}$\\
    $Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \alpha (R_{t+1} + \gamma \max_{a}Q(S_{t+1}, a) - Q(S_t, A_t))$\\
    $t \leftarrow t+1$
  }
}
\KwRet{$Q$}
\caption{Sarsamax (Q-Learning)}
\end{algorithm}

%%%%%%%%%%%% 15 Expected Sarsa
\begin{algorithm}
\KwIn{policy $\pi$, positive integer $num\_episodes$, small positive fraction $\alpha$, GLIE $\{\epsilon_i\}$}
\KwOut{value function $Q$ ($\approx q_\pi$ if $num\_episodes$ is large enough)}
Initialize $Q$ arbitrarily (e.g., $Q(s,a) = 0$ for all $s\in\mathcal{S}$ and $a\in\mathcal{A}(s)$, and $Q(terminal\text{-}state, \cdot)=0$) \\
\For{$i \leftarrow 1 \textbf{ to } num\_episodes$}{
  $\epsilon \leftarrow \epsilon_i$\\
  Observe $S_0$\\
  $t\leftarrow 0$\\
  \Repeat{$S_t$ is terminal}{
    Choose action $A_t$ using policy derived from $Q$ (e.g., $\epsilon$-greedy)\\
    Take action $A_{t}$ and observe $R_{t+1}, S_{t+1}$\\
    $Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \alpha (R_{t+1} + \gamma \sum_{a}\pi(a|S_{t+1})Q(S_{t+1}, a) - Q(S_t, A_t))$\\
    $t \leftarrow t+1$
  }
}
\KwRet{$Q$}
\caption{Expected Sarsa}
\end{algorithm}

\end{document}
--------------------------------------------------------------------------------
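
Companion note (illustrative only, not part of the repository files): the Temporal-Difference Methods section of cheatsheet.tex above can be exercised with a sketch like the one below, which implements the Sarsa and Sarsamax (Q-learning) updates. It assumes the same toy environment interface as the Monte Carlo sketch embedded earlier (reset() returns a start state; step(a) returns (next_state, reward, done)); the epsilon schedule, step size, and function names are arbitrary choices made for illustration.

import random
from collections import defaultdict

def epsilon_greedy(Q, s, n_actions, eps):
    if random.random() < eps:
        return random.randrange(n_actions)
    return max(range(n_actions), key=lambda a: Q[(s, a)])

def sarsa(env, num_episodes=5000, alpha=0.1, gamma=1.0, n_actions=2):
    # On-policy TD control (Algorithm 13): the target uses the action actually chosen next.
    Q = defaultdict(float)
    for i in range(1, num_episodes + 1):
        eps = 1.0 / i
        s, done = env.reset(), False
        a = epsilon_greedy(Q, s, n_actions, eps)
        while not done:
            s_next, r, done = env.step(a)
            a_next = None if done else epsilon_greedy(Q, s_next, n_actions, eps)
            target = r if done else r + gamma * Q[(s_next, a_next)]
            Q[(s, a)] += alpha * (target - Q[(s, a)])
            s, a = s_next, a_next
    return Q

def q_learning(env, num_episodes=5000, alpha=0.1, gamma=1.0, n_actions=2):
    # Off-policy TD control / Sarsamax (Algorithm 14): the target maximizes over actions.
    Q = defaultdict(float)
    for i in range(1, num_episodes + 1):
        eps = 1.0 / i
        s, done = env.reset(), False
        while not done:
            a = epsilon_greedy(Q, s, n_actions, eps)
            s_next, r, done = env.step(a)
            target = r if done else r + gamma * max(Q[(s_next, b)] for b in range(n_actions))
            Q[(s, a)] += alpha * (target - Q[(s, a)])
            s = s_next
    return Q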