├── CODEOWNERS
├── cheatsheet.pdf
├── udacity-logo.png
├── README.md
├── LICENSE.txt
├── .github
│   └── workflows
│       └── manual.yml
└── cheatsheet.tex

/CODEOWNERS:
--------------------------------------------------------------------------------
* @udacity/active-public-content
--------------------------------------------------------------------------------
/cheatsheet.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/rl-cheatsheet/HEAD/cheatsheet.pdf
--------------------------------------------------------------------------------
/udacity-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/rl-cheatsheet/HEAD/udacity-logo.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Reinforcement Learning (RL) Cheatsheet

You are encouraged to use the [PDF file](https://github.com/udacity/rl-cheatsheet/blob/master/cheatsheet.pdf) in the repository to guide your study of RL.

If you would like to learn how to implement these algorithms, please check out Udacity's [Machine Learning Engineer Nanodegree Program](http://www.udacity.com/course/machine-learning-engineer-nanodegree--nd009).
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
Copyright (c) 2017 Udacity, Inc.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/.github/workflows/manual.yml:
--------------------------------------------------------------------------------
# Workflow to ensure that whenever a GitHub PR is submitted,
# a Jira ticket gets created automatically.
name: Manual Workflow

# Controls when the action will run.
on:
  # Triggers the workflow on pull request events (opened or reopened)
  pull_request_target:
    types: [opened, reopened]

  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:

jobs:
  test-transition-issue:
    name: Convert Github Issue to Jira Issue
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@master

      - name: Login
        uses: atlassian/gajira-login@master
        env:
          JIRA_BASE_URL: ${{ secrets.JIRA_BASE_URL }}
          JIRA_USER_EMAIL: ${{ secrets.JIRA_USER_EMAIL }}
          JIRA_API_TOKEN: ${{ secrets.JIRA_API_TOKEN }}

      - name: Create NEW JIRA ticket
        id: create
        uses: atlassian/gajira-create@master
        with:
          project: CONUPDATE
          issuetype: Task
          summary: |
            Github PR [Assign the ND component] | Repo: ${{ github.repository }} | PR# ${{github.event.number}}
          description: |
            Repo link: https://github.com/${{ github.repository }}
            PR no. ${{ github.event.pull_request.number }}
            PR title: ${{ github.event.pull_request.title }}
            PR description: ${{ github.event.pull_request.body }}
            In addition, please resolve other issues, if any.
          fields: '{"components": [{"name":"nd013 - Self Driving Car Engineer ND"}], "customfield_16449":"https://classroom.udacity.com/", "customfield_16450":"Resolve the PR", "labels": ["github"], "priority":{"id": "4"}}'

      - name: Log created issue
        run: echo "Issue ${{ steps.create.outputs.issue }} was created"
--------------------------------------------------------------------------------
/cheatsheet.tex:
--------------------------------------------------------------------------------
\documentclass[10pt]{amsart}
\usepackage[top=1in, bottom=1in, left=1in, right=1in]{geometry}
\geometry{letterpaper}
\geometry{landscape}
%\usepackage[parfill]{parskip} % Activate to begin paragraphs with an empty line rather than an indent
\usepackage{graphicx}
\usepackage{amssymb}
\usepackage{epstopdf}
\usepackage{tabto}
\usepackage{empheq, comment}
\usepackage[ruled]{algorithm2e}
\usepackage{fancyhdr}
\renewcommand{\headrulewidth}{0pt}
\fancyhead[L]{}
\fancyhead[R]{
  \includegraphics[width=4cm]{udacity-logo.png}
}
\DeclareGraphicsRule{.tif}{png}{.png}{`convert #1 `dirname #1`/`basename #1 .tif`.png}
\pagestyle{fancy}

\title{Reinforcement Learning}

\begin{document}
\maketitle
\thispagestyle{fancy}

\section{The Problem}

\begin{itemize}
\item[] $S_t$ \tabto{2cm} state at time $t$
\item[] $A_t$ \tabto{2cm} action at time $t$
\item[] $R_t$ \tabto{2cm} reward at time $t$
\item[] $\gamma$ \tabto{2cm} discount rate (where $0 \leq \gamma \leq 1$)
\item[] $G_t$ \tabto{2cm} discounted return at time $t$ ($\sum_{k=0}^\infty \gamma^k R_{t+k+1}$)
\item[] $\mathcal{S}$ \tabto{2cm} set of all nonterminal states
\item[] $\mathcal{S}^+$ \tabto{2cm} set of all states (including terminal states)
\item[] $\mathcal{A}$ \tabto{2cm} set of all actions
\item[] $\mathcal{A}(s)$ \tabto{2cm} set of all actions available in state $s$
\item[] $\mathcal{R}$ \tabto{2cm} set of all rewards
\item[] $p(s',r|s,a)$ \tabto{2cm} probability of next state $s'$ and reward $r$, given current state $s$ and current action $a$ ($\mathbb{P}(S_{t+1}=s', R_{t+1}=r|S_t = s, A_t = a)$)
\end{itemize}
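
\vspace{.2in}
As a quick numerical illustration of the discounted return (with an arbitrarily chosen reward sequence), suppose $\gamma = 0.9$ and the agent collects rewards $R_{t+1} = 1$, $R_{t+2} = 2$, and $R_{t+3} = 3$ before the episode terminates. Then
\begin{equation*}
G_t = R_{t+1} + \gamma R_{t+2} + \gamma^2 R_{t+3} = 1 + 0.9 \cdot 2 + 0.81 \cdot 3 = 5.23.
\end{equation*}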
\section{The Solution}
\begin{itemize}
\item[] $\pi$ \tabto{2cm} policy
\item[] \tabto{2.5cm} \textit{if deterministic}: $\pi(s) \in \mathcal{A}(s)$ for all $s \in \mathcal{S}$
\item[] \tabto{2.5cm} \textit{if stochastic}: $\pi(a|s) = \mathbb{P}(A_t=a|S_t=s)$ for all $s \in \mathcal{S}$ and $a \in \mathcal{A}(s)$
\item[] $v_\pi$ \tabto{2cm} state-value function for policy $\pi$ ($v_\pi(s) \doteq \mathbb{E}[G_t|S_t=s]$ for all $s\in\mathcal{S}$)
\item[] $q_\pi$ \tabto{2cm} action-value function for policy $\pi$ ($q_\pi(s,a) \doteq \mathbb{E}[G_t|S_t=s, A_t=a]$ for all $s \in \mathcal{S}$ and $a \in \mathcal{A}(s)$)
\item[] $v_*$ \tabto{2cm} optimal state-value function ($v_*(s) \doteq \max_\pi v_\pi(s)$ for all $s \in \mathcal{S}$)
\item[] $q_*$ \tabto{2cm} optimal action-value function ($q_*(s,a) \doteq \max_\pi q_\pi(s,a)$ for all $s \in \mathcal{S}$ and $a \in \mathcal{A}(s)$)
\end{itemize}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage

\section{Bellman Equations}

\subsection{Bellman Expectation Equations}

\begin{empheq}[box=\fbox]{align}
v_\pi(s) = \sum_{a \in \mathcal{A}(s)}\pi(a|s)\sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r + \gamma v_\pi(s'))\nonumber
\end{empheq}

\begin{empheq}[box=\fbox]{align}
q_\pi(s,a) = \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r + \gamma\sum_{a' \in \mathcal{A}(s')} \pi(a'|s') q_\pi(s',a'))\nonumber
\end{empheq}

\subsection{Bellman Optimality Equations}
\begin{empheq}[box=\fbox]{align}
v_*(s) = \max_{a \in \mathcal{A}(s)}\sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r + \gamma v_*(s')) \nonumber
\end{empheq}

\begin{empheq}[box=\fbox]{align}
q_*(s,a) = \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r + \gamma \max_{a'\in\mathcal{A}(s')}q_*(s',a')) \nonumber
\end{empheq}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\subsection{Useful Formulas for Deriving the Bellman Equations}

\begin{equation*}
v_\pi(s) = \sum_{a \in \mathcal{A}(s)}\pi(a|s)q_\pi(s,a)
\end{equation*}

\begin{equation*}
v_*(s) = \max_{a \in \mathcal{A}(s)}q_*(s,a)
\end{equation*}

\begin{equation*}
q_\pi(s,a) = \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r + \gamma v_\pi(s'))
\end{equation*}

\begin{equation*}
q_*(s,a) = \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r + \gamma v_*(s'))
\end{equation*}

\begin{align*}
q_\pi(s,a) &\doteq \mathbb{E}_{\pi}[ G_t | S_t = s, A_t = a ] & (1)\\
&= \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}\mathbb{P}(S_{t+1}=s', R_{t+1}=r|S_t=s, A_t=a)\mathbb{E}_{\pi}[ G_{t} | S_t = s, A_t = a, S_{t+1}=s', R_{t+1}=r] & (2)\\
&= \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)\mathbb{E}_{\pi}[ G_{t} | S_t = s, A_t = a, S_{t+1}=s', R_{t+1}=r] & (3)\\
&= \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)\mathbb{E}_{\pi}[ G_{t} | S_{t+1}=s', R_{t+1}=r] & (4)\\
&= \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)\mathbb{E}_{\pi}[ R_{t+1} + \gamma G_{t+1} | S_{t+1}=s', R_{t+1}=r] & (5)\\
&= \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r + \gamma \mathbb{E}_\pi[G_{t+1} | S_{t+1}=s'] ) & (6)\\
&= \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r + \gamma v_\pi(s') ) & (7)
\end{align*}

\vspace{.5in}

The reasoning for the above is as follows:
\vspace{.2in}
\begin{itemize}
\item (1) by definition ($q_\pi(s,a) \doteq \mathbb{E}_{\pi}[ G_t | S_t = s, A_t = a ]$) \\
\item (2) Law of Total Expectation\\
\item (3) by definition ($p(s',r|s,a)\doteq\mathbb{P}(S_{t+1}=s', R_{t+1}=r|S_t=s, A_t=a)$)\\
\item (4) $\mathbb{E}_{\pi}[ G_{t} | S_t = s, A_t = a, S_{t+1}=s', R_{t+1}=r] = \mathbb{E}_{\pi}[ G_{t} | S_{t+1}=s', R_{t+1}=r]$ (by the Markov property)\\
\item (5) $G_t = R_{t+1} + \gamma G_{t+1}$\\
\item (6) Linearity of Expectation\\
\item (7) $v_\pi(s') = \mathbb{E}_\pi[G_{t+1} | S_{t+1}=s']$
\end{itemize}
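
\vspace{.2in}
As an illustrative companion to the backup equations above (not part of the standard notation), the following Python-style sketch shows how a single Bellman expectation backup can be computed when the one-step dynamics are stored as a dictionary \texttt{P[s][a]} of \texttt{(prob, next\_state, reward)} triples and a stochastic policy as \texttt{pi[s]}. The two-state MDP at the end is an arbitrary toy example.

\begin{verbatim}
def bellman_expectation_backup(P, pi, V, s, gamma=0.9):
    """Right-hand side of the Bellman expectation equation, evaluated at state s."""
    value = 0.0
    for a, action_prob in pi[s].items():          # sum over actions a
        for prob, next_s, reward in P[s][a]:      # sum over (s', r) pairs
            value += action_prob * prob * (reward + gamma * V[next_s])
    return value

# Toy example: state 1 is absorbing/terminal with zero reward.
P = {0: {0: [(1.0, 1, 5.0)],
         1: [(0.5, 0, 1.0), (0.5, 1, 0.0)]},
     1: {0: [(1.0, 1, 0.0)]}}
pi = {0: {0: 0.5, 1: 0.5}, 1: {0: 1.0}}
V = {0: 0.0, 1: 0.0}
print(bellman_expectation_backup(P, pi, V, 0))    # 2.75
\end{verbatim}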
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage

\section{Dynamic Programming}

%%%%%%%%%%%% 1 POLICY EVALUATION
\begin{algorithm}
\KwIn{MDP, policy $\pi$, small positive number $\theta$}
\KwOut{$V \approx v_\pi$}
Initialize $V$ arbitrarily (e.g., $V(s)=0$ for all $s \in \mathcal{S}^+$)\\
\Repeat{$\Delta < \theta$}{
  $\Delta \leftarrow 0$\\
  \For{$s \in \mathcal{S}$}{
    $v \leftarrow V(s)$\\
    $V(s) \leftarrow \sum_{a\in\mathcal{A}(s)} \pi(a|s) \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r + \gamma V(s'))$\\
    $\Delta \leftarrow \max(\Delta, |v-V(s)|)$
  }
}
\KwRet{$V$}
\caption{Policy Evaluation}
\end{algorithm}

%%%%%%%%%%%% 2 ESTIMATION OF ACTION VALUES
\begin{algorithm}
\KwIn{MDP, state-value function $V$}
\KwOut{action-value function $Q$}

\For{$s \in \mathcal{S}$}{
  \For{$a \in \mathcal{A}(s)$}{
    $Q(s,a) \leftarrow \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r+\gamma V(s'))$
  }
}
\KwRet{$Q$}
\caption{Estimation of Action Values}
\end{algorithm}

%%%%%%%%%%%% 3 POLICY IMPROVEMENT
\begin{algorithm}
\KwIn{MDP, value function $V$}
\KwOut{policy $\pi'$}

\For{$s \in \mathcal{S}$}{
  \For{$a \in \mathcal{A}(s)$}{
    $Q(s,a) \leftarrow \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r+\gamma V(s'))$
  }
  $\pi'(s) \leftarrow \arg\max_{a\in\mathcal{A}(s)}Q(s,a)$
}
\KwRet{$\pi'$}
\caption{Policy Improvement}
\end{algorithm}

%%%%%%%%%%%% 4 POLICY ITERATION
\begin{algorithm}
\KwIn{MDP, small positive number $\theta$}
\KwOut{policy $\pi \approx \pi_*$}
Initialize $\pi$ arbitrarily (e.g., $\pi(a|s)=\frac{1}{|\mathcal{A}(s)|}$ for all $s \in \mathcal{S}$ and $a \in \mathcal{A}(s)$)\\
$policy\text{-}stable \leftarrow false$\\
\Repeat{$policy\text{-}stable = true$}{
  $V \leftarrow \textbf{Policy\_Evaluation}(\text{MDP}, \pi, \theta)$\\
  $\pi' \leftarrow \textbf{Policy\_Improvement}(\text{MDP}, V)$\\
  \If{$\pi = \pi'$}{
    $policy\text{-}stable \leftarrow true$\\
  }
  $\pi \leftarrow \pi'$
}
\KwRet{$\pi$}
\caption{Policy Iteration}
\end{algorithm}
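
The pseudocode in Algorithms 1, 3, and 4 translates fairly directly into code. The sketch below is only an illustration (it is not part of the original cheatsheet): the dictionary layout \texttt{P[s][a]} of \texttt{(prob, next\_state, reward)} triples, the function names, and the default values of $\gamma$ and $\theta$ are assumptions made for this example.

\begin{verbatim}
def policy_evaluation(P, pi, gamma=0.9, theta=1e-8):
    """Iterative policy evaluation (Algorithm 1), sweeping states in place."""
    V = {s: 0.0 for s in P}
    while True:
        delta = 0.0
        for s in P:
            v = sum(pi[s][a] * sum(p * (r + gamma * V[s1]) for p, s1, r in P[s][a])
                    for a in P[s])
            delta = max(delta, abs(v - V[s]))
            V[s] = v
        if delta < theta:
            return V

def policy_improvement(P, V, gamma=0.9):
    """Greedy policy w.r.t. V (Algorithm 3), written as a one-hot action distribution."""
    pi = {}
    for s in P:
        q = {a: sum(p * (r + gamma * V[s1]) for p, s1, r in P[s][a]) for a in P[s]}
        best = max(q, key=q.get)
        pi[s] = {a: 1.0 if a == best else 0.0 for a in P[s]}
    return pi

def policy_iteration(P, gamma=0.9, theta=1e-8):
    """Alternate evaluation and improvement until the policy is stable (Algorithm 4)."""
    pi = {s: {a: 1.0 / len(P[s]) for a in P[s]} for s in P}   # equiprobable start
    while True:
        V = policy_evaluation(P, pi, gamma, theta)
        new_pi = policy_improvement(P, V, gamma)
        if new_pi == pi:
            return new_pi, V
        pi = new_pi
\end{verbatim}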
%%%%%%%%%%%% 5 TRUNCATED POLICY EVALUATION
\begin{algorithm}
\KwIn{MDP, policy $\pi$, value function $V$, positive integer $max\_iterations$}
\KwOut{$V \approx v_\pi$ (if $max\_iterations$ is large enough)}
$counter \leftarrow 0$\\
\While{$counter < max\_iterations$}{
  \For{$s \in \mathcal{S}$}{
    $V(s) \leftarrow \sum_{a\in\mathcal{A}(s)} \pi(a|s) \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r + \gamma V(s'))$\\
  }
  $counter \leftarrow counter + 1$
}
\KwRet{$V$}
\caption{Truncated Policy Evaluation}
\end{algorithm}

%%%%%%%%%%%% 6 TRUNCATED POLICY ITERATION
\begin{algorithm}
\KwIn{MDP, positive integer $max\_iterations$, small positive number $\theta$}
\KwOut{policy $\pi \approx \pi_*$}
Initialize $V$ arbitrarily (e.g., $V(s)=0$ for all $s \in \mathcal{S}^+$)\\
Initialize $\pi$ arbitrarily (e.g., $\pi(a|s)=\frac{1}{|\mathcal{A}(s)|}$ for all $s \in \mathcal{S}$ and $a \in \mathcal{A}(s)$)\\
\Repeat{$\max_{s\in\mathcal{S}}|V(s) - V_{old}(s)| < \theta$}{
  $\pi \leftarrow \textbf{Policy\_Improvement}(\text{MDP}, V)$\\
  $V_{old} \leftarrow V$\\
  $V \leftarrow \textbf{Truncated\_Policy\_Evaluation}(\text{MDP}, \pi, V, max\_iterations)$
}
\KwRet{$\pi$}
\caption{Truncated Policy Iteration}
\end{algorithm}

%%%%%%%%%%%% 7 VALUE ITERATION
\begin{algorithm}
\KwIn{MDP, small positive number $\theta$}
\KwOut{policy $\pi \approx \pi_*$}
Initialize $V$ arbitrarily (e.g., $V(s)=0$ for all $s \in \mathcal{S}^+$)\\
\Repeat{$\Delta < \theta$}{
  $\Delta \leftarrow 0$\\
  \For{$s \in \mathcal{S}$}{
    $v \leftarrow V(s)$\\
    $V(s) \leftarrow \max_{a\in\mathcal{A}(s)}\sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r + \gamma V(s'))$\\
    $\Delta \leftarrow \max(\Delta, |v-V(s)|)$
  }
}
$\pi \leftarrow \textbf{Policy\_Improvement}(\text{MDP}, V)$ \\
\KwRet{$\pi$}
\caption{Value Iteration}
\end{algorithm}
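
For completeness, here is an equally informal sketch of Algorithm 7 (value iteration), under the same assumed \texttt{P[s][a]} layout as the previous sketch; the greedy extraction at the end plays the role of the final \textbf{Policy\_Improvement} step, returning a deterministic policy.

\begin{verbatim}
def value_iteration(P, gamma=0.9, theta=1e-8):
    """Value iteration (Algorithm 7) followed by greedy policy extraction."""
    V = {s: 0.0 for s in P}
    while True:
        delta = 0.0
        for s in P:
            v = max(sum(p * (r + gamma * V[s1]) for p, s1, r in P[s][a])
                    for a in P[s])
            delta = max(delta, abs(v - V[s]))
            V[s] = v
        if delta < theta:
            break
    # Deterministic greedy policy: pi[s] is the maximizing action itself.
    pi = {s: max(P[s], key=lambda a: sum(p * (r + gamma * V[s1])
                                         for p, s1, r in P[s][a]))
          for s in P}
    return pi, V
\end{verbatim}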
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\clearpage

\section{Monte Carlo Methods}

%%%%%%%%%%%% 8 FIRST-VISIT MC PREDICTION (STATE VALUES)
\begin{algorithm}
\KwIn{policy $\pi$, positive integer $num\_episodes$}
\KwOut{value function $V$ ($\approx v_\pi$ if $num\_episodes$ is large enough)}
Initialize $N(s) = 0$ for all $s\in\mathcal{S}$ \\
Initialize $returns\_sum(s) = 0$ for all $s\in\mathcal{S}$ \\
\For{$i \leftarrow 1 \textbf{ to } num\_episodes$}{
  Generate an episode $S_0, A_0, R_1, \ldots, S_T$ using $\pi$\\
  \For{$t \leftarrow 0 \textbf{ to }T-1$}{
    \uIf{$S_t$ is a first visit (with return $G_t$)}{
      $N(S_t) \leftarrow N(S_t) + 1$\\
      $returns\_sum(S_t) \leftarrow returns\_sum(S_t) + G_t$
    }
  }
}
$V(s) \leftarrow returns\_sum(s)/N(s)$ for all $s\in\mathcal{S}$\\
\KwRet{$V$}
\caption{First-Visit MC Prediction (\textit{for state values})}
\end{algorithm}

%%%%%%%%%%%% 9 FIRST-VISIT MC PREDICTION (ACTION VALUES)
\begin{algorithm}
\KwIn{policy $\pi$, positive integer $num\_episodes$}
\KwOut{value function $Q$ ($\approx q_\pi$ if $num\_episodes$ is large enough)}
Initialize $N(s,a) = 0$ for all $s\in\mathcal{S}, a\in\mathcal{A}(s)$ \\
Initialize $returns\_sum(s,a) = 0$ for all $s\in\mathcal{S}, a\in\mathcal{A}(s)$ \\
\For{$i \leftarrow 1 \textbf{ to } num\_episodes$}{
  Generate an episode $S_0, A_0, R_1, \ldots, S_T$ using $\pi$\\
  \For{$t \leftarrow 0 \textbf{ to }T-1$}{
    \uIf{$(S_t,A_t)$ is a first visit (with return $G_t$)}{
      $N(S_t, A_t) \leftarrow N(S_t, A_t) + 1$\\
      $returns\_sum(S_t, A_t) \leftarrow returns\_sum(S_t, A_t) + G_t$
    }
  }
}
$Q(s,a) \leftarrow returns\_sum(s,a)/N(s,a)$ for all $s\in\mathcal{S}$, $a\in\mathcal{A}(s)$\\
\KwRet{$Q$}
\caption{First-Visit MC Prediction (\textit{for action values})}
\end{algorithm}

%%%%%%%%%%%% 10 GLIE MC CONTROL
\begin{algorithm}
\KwIn{positive integer $num\_episodes$, GLIE $\{\epsilon_i\}$}
\KwOut{policy $\pi$ ($\approx \pi_*$ if $num\_episodes$ is large enough)}
Initialize $Q(s,a) = 0$ for all $s\in\mathcal{S}$ and $a\in\mathcal{A}(s)$ \\
Initialize $N(s,a) = 0$ for all $s\in\mathcal{S}, a\in\mathcal{A}(s)$ \\
\For{$i \leftarrow 1 \textbf{ to } num\_episodes$}{
  $\epsilon \leftarrow \epsilon_i$\\
  $\pi \leftarrow \epsilon\text{-greedy}(Q)$\\
  Generate an episode $S_0, A_0, R_1, \ldots, S_T$ using $\pi$\\
  \For{$t \leftarrow 0 \textbf{ to }T-1$}{
    \uIf{$(S_t,A_t)$ is a first visit (with return $G_t$)}{
      $N(S_t,A_t) \leftarrow N(S_t,A_t) + 1$\\
      $Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \frac{1}{N(S_t,A_t)}(G_t - Q(S_t, A_t))$
    }
  }
}
\KwRet{$\pi$}
\caption{First-Visit GLIE MC Control}
\end{algorithm}

%%%%%%%%%%%% 11 CONSTANT-ALPHA MC CONTROL
\begin{algorithm}
\KwIn{positive integer $num\_episodes$, small positive fraction $\alpha$, GLIE $\{\epsilon_i\}$}
\KwOut{policy $\pi$ ($\approx \pi_*$ if $num\_episodes$ is large enough)}
Initialize $Q$ arbitrarily (e.g., $Q(s,a) = 0$ for all $s\in\mathcal{S}$ and $a\in\mathcal{A}(s)$) \\
\For{$i \leftarrow 1 \textbf{ to } num\_episodes$}{
  $\epsilon \leftarrow \epsilon_i$\\
  $\pi \leftarrow \epsilon\text{-greedy}(Q)$\\
  Generate an episode $S_0, A_0, R_1, \ldots, S_T$ using $\pi$\\
  \For{$t \leftarrow 0 \textbf{ to }T-1$}{
    \uIf{$(S_t,A_t)$ is a first visit (with return $G_t$)}{
      $Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \alpha(G_t - Q(S_t, A_t))$
    }
  }
}
\KwRet{$\pi$}
\caption{First-Visit Constant-$\alpha$ (GLIE) MC Control}
\end{algorithm}
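
The Monte Carlo control pseudocode above leaves the environment interaction implicit. The sketch below (an illustration only, not part of the cheatsheet) makes those pieces concrete for Algorithm 11, using a made-up corridor environment, an assumed \texttt{reset()}/\texttt{step(a)} interface, and an $\epsilon_i = 1/i$ schedule.

\begin{verbatim}
import random
from collections import defaultdict

class Corridor:
    """Toy episodic task: states 0..3, start at 0, state 3 terminal.
    Action 0 moves left, action 1 moves right; -1 per step, +10 at the goal."""
    def reset(self):
        self.s = 0
        return self.s
    def step(self, a):
        self.s = max(0, self.s - 1) if a == 0 else self.s + 1
        done = (self.s == 3)
        return self.s, (10.0 if done else -1.0), done

def epsilon_greedy(Q, s, n_actions, eps):
    if random.random() < eps:
        return random.randrange(n_actions)
    return max(range(n_actions), key=lambda a: Q[(s, a)])

def mc_control(env, num_episodes=5000, alpha=0.02, gamma=1.0, n_actions=2):
    """First-visit constant-alpha MC control (Algorithm 11)."""
    Q = defaultdict(float)
    for i in range(1, num_episodes + 1):
        eps = 1.0 / i                      # assumed GLIE-style schedule
        # Generate an episode with the epsilon-greedy policy derived from Q.
        episode, s, done = [], env.reset(), False
        while not done:
            a = epsilon_greedy(Q, s, n_actions, eps)
            s_next, r, done = env.step(a)
            episode.append((s, a, r))
            s = s_next
        # Compute returns backwards; the last overwrite keeps each first visit.
        G, first_visit_return = 0.0, {}
        for s, a, r in reversed(episode):
            G = r + gamma * G
            first_visit_return[(s, a)] = G
        for (s, a), G in first_visit_return.items():
            Q[(s, a)] += alpha * (G - Q[(s, a)])
    return Q

Q = mc_control(Corridor())   # afterwards, the greedy policy should move right
\end{verbatim}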
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\clearpage

\section{Temporal-Difference Methods}

%%%%%%%%%%%% 12 TD(0)
\begin{algorithm}
\KwIn{policy $\pi$, positive integer $num\_episodes$, small positive fraction $\alpha$}
\KwOut{value function $V$ ($\approx v_\pi$ if $num\_episodes$ is large enough)}
Initialize $V$ arbitrarily (e.g., $V(s) = 0$ for all $s\in\mathcal{S}^+$) \\
\For{$i \leftarrow 1 \textbf{ to } num\_episodes$}{
  Observe $S_0$\\
  $t\leftarrow 0$\\
  \Repeat{$S_t$ is terminal}{
    Choose action $A_t$ using policy $\pi$\\
    Take action $A_t$ and observe $R_{t+1}, S_{t+1}$\\
    $V(S_t) \leftarrow V(S_t) + \alpha (R_{t+1} + \gamma V(S_{t+1}) - V(S_t))$\\
    $t \leftarrow t+1$
  }
}
\KwRet{$V$}
\caption{TD(0)}
\end{algorithm}

%%%%%%%%%%%% 13 Sarsa
\begin{algorithm}
\KwIn{policy $\pi$, positive integer $num\_episodes$, small positive fraction $\alpha$, GLIE $\{\epsilon_i\}$}
\KwOut{value function $Q$ ($\approx q_\pi$ if $num\_episodes$ is large enough)}
Initialize $Q$ arbitrarily (e.g., $Q(s,a) = 0$ for all $s\in\mathcal{S}$ and $a\in\mathcal{A}(s)$, and $Q(terminal\text{-}state, \cdot)=0$) \\
\For{$i \leftarrow 1 \textbf{ to } num\_episodes$}{
  $\epsilon \leftarrow \epsilon_i$\\
  Observe $S_0$\\
  Choose action $A_0$ using policy derived from $Q$ (e.g., $\epsilon$-greedy)\\
  $t\leftarrow 0$\\
  \Repeat{$S_t$ is terminal}{
    Take action $A_{t}$ and observe $R_{t+1}, S_{t+1}$\\
    Choose action $A_{t+1}$ using policy derived from $Q$ (e.g., $\epsilon$-greedy)\\
    $Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \alpha (R_{t+1} + \gamma Q(S_{t+1}, A_{t+1}) - Q(S_t, A_t))$\\
    $t \leftarrow t+1$
  }
}
\KwRet{$Q$}
\caption{Sarsa}
\end{algorithm}

%%%%%%%%%%%% 14 Q-Learning
\begin{algorithm}
\KwIn{policy $\pi$, positive integer $num\_episodes$, small positive fraction $\alpha$, GLIE $\{\epsilon_i\}$}
\KwOut{value function $Q$ ($\approx q_\pi$ if $num\_episodes$ is large enough)}
Initialize $Q$ arbitrarily (e.g., $Q(s,a) = 0$ for all $s\in\mathcal{S}$ and $a\in\mathcal{A}(s)$, and $Q(terminal\text{-}state, \cdot)=0$) \\
\For{$i \leftarrow 1 \textbf{ to } num\_episodes$}{
  $\epsilon \leftarrow \epsilon_i$\\
  Observe $S_0$\\
  $t\leftarrow 0$\\
  \Repeat{$S_t$ is terminal}{
    Choose action $A_t$ using policy derived from $Q$ (e.g., $\epsilon$-greedy)\\
    Take action $A_{t}$ and observe $R_{t+1}, S_{t+1}$\\
    $Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \alpha (R_{t+1} + \gamma \max_{a}Q(S_{t+1}, a) - Q(S_t, A_t))$\\
    $t \leftarrow t+1$
  }
}
\KwRet{$Q$}
\caption{Sarsamax (Q-Learning)}
\end{algorithm}

%%%%%%%%%%%% 15 Expected Sarsa
\begin{algorithm}
\KwIn{policy $\pi$, positive integer $num\_episodes$, small positive fraction $\alpha$, GLIE $\{\epsilon_i\}$}
\KwOut{value function $Q$ ($\approx q_\pi$ if $num\_episodes$ is large enough)}
Initialize $Q$ arbitrarily (e.g., $Q(s,a) = 0$ for all $s\in\mathcal{S}$ and $a\in\mathcal{A}(s)$, and $Q(terminal\text{-}state, \cdot)=0$) \\
\For{$i \leftarrow 1 \textbf{ to } num\_episodes$}{
  $\epsilon \leftarrow \epsilon_i$\\
  Observe $S_0$\\
  $t\leftarrow 0$\\
  \Repeat{$S_t$ is terminal}{
    Choose action $A_t$ using policy derived from $Q$ (e.g., $\epsilon$-greedy)\\
    Take action $A_{t}$ and observe $R_{t+1}, S_{t+1}$\\
    $Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \alpha (R_{t+1} + \gamma \sum_{a}\pi(a|S_{t+1})Q(S_{t+1}, a) - Q(S_t, A_t))$\\
    $t \leftarrow t+1$
  }
}
\KwRet{$Q$}
\caption{Expected Sarsa}
\end{algorithm}

\end{document}
--------------------------------------------------------------------------------
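
Companion note (illustrative only, not part of the repository files): the Temporal-Difference Methods section of cheatsheet.tex above can be exercised with a sketch like the one below, which implements the Sarsa and Sarsamax (Q-learning) updates. It assumes the same toy environment interface as the Monte Carlo sketch embedded earlier (reset() returns a start state; step(a) returns (next_state, reward, done)); the epsilon schedule, step size, and function names are arbitrary choices made for illustration.

import random
from collections import defaultdict

def epsilon_greedy(Q, s, n_actions, eps):
    if random.random() < eps:
        return random.randrange(n_actions)
    return max(range(n_actions), key=lambda a: Q[(s, a)])

def sarsa(env, num_episodes=5000, alpha=0.1, gamma=1.0, n_actions=2):
    # On-policy TD control (Algorithm 13): the target uses the action actually chosen next.
    Q = defaultdict(float)
    for i in range(1, num_episodes + 1):
        eps = 1.0 / i
        s, done = env.reset(), False
        a = epsilon_greedy(Q, s, n_actions, eps)
        while not done:
            s_next, r, done = env.step(a)
            a_next = None if done else epsilon_greedy(Q, s_next, n_actions, eps)
            target = r if done else r + gamma * Q[(s_next, a_next)]
            Q[(s, a)] += alpha * (target - Q[(s, a)])
            s, a = s_next, a_next
    return Q

def q_learning(env, num_episodes=5000, alpha=0.1, gamma=1.0, n_actions=2):
    # Off-policy TD control / Sarsamax (Algorithm 14): the target maximizes over actions.
    Q = defaultdict(float)
    for i in range(1, num_episodes + 1):
        eps = 1.0 / i
        s, done = env.reset(), False
        while not done:
            a = epsilon_greedy(Q, s, n_actions, eps)
            s_next, r, done = env.step(a)
            target = r if done else r + gamma * max(Q[(s_next, b)] for b in range(n_actions))
            Q[(s, a)] += alpha * (target - Q[(s, a)])
            s = s_next
    return Q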