├── .gitignore
├── .gitmodules
├── README.md
├── lecture
    ├── notes-en
    │   ├── 0-Markov-Chain.md
    │   ├── 1-Reinforcement-Learning.md
    │   ├── 10-Control-as-Inference.md
    │   ├── 11-Inverse-RL.md
    │   ├── 12-RL-wtih-Sequence-Model.md
    │   ├── 13-Transfer-Learning.md
    │   ├── 2-Policy-Gradient.md
    │   ├── 3-Actor-Critic.md
    │   ├── 4-Value-Function-Methods.md
    │   ├── 5-Model-Based-RL.md
    │   ├── 6-Exploration.md
    │   ├── 7-Offline-RL.md
    │   ├── 8-RL-Theory.md
    │   ├── 9-Generative-Model.md
    │   └── pic
    │   │   ├── LM.png
    │   │   ├── Qlearning.png
    │   │   ├── RNN.png
    │   │   ├── Transformer.png
    │   │   └── control.png
    └── notes-zh
    │   ├── 0-preliminaries.md
    │   ├── 10-optimal_control_planning.md
    │   ├── 11-model-based.md
    │   ├── 12-model-based-with-policy.md
    │   ├── 13-exploration_1.md
    │   ├── 14-exploration_2.md
    │   ├── 15-offline-RL_1.md
    │   ├── 16-offline-RL_2.md
    │   ├── 17-RL-theory.md
    │   ├── 18-vae.md
    │   ├── 19-soft-optimality.md
    │   ├── 2-imitation_learning.md
    │   ├── 20-IRL.md
    │   ├── 21-RL-LM.md
    │   ├── 22-transfer-meta.md
    │   ├── 23-challenge.md
    │   ├── 3-pytorch.md
    │   ├── 4-intro2RL.md
    │   ├── 5-policy_grad.md
    │   ├── 6-actor-critic.md
    │   ├── 7-value_func.md
    │   ├── 8-Q_learning.md
    │   ├── 9-advanced_policy_grad.md
    │   ├── CS_285_Fa23_PyTorch_Tutorial.ipynb
    │   ├── assets
    │       ├── 10-1.png
    │       ├── 11-1.png
    │       ├── 12-1.jpeg
    │       ├── 14-1.jpeg
    │       ├── 14-2.png
    │       ├── 14-3.jpeg
    │       ├── 15-1.png
    │       ├── 16-1.png
    │       ├── 17-1.jpeg
    │       ├── 18-1.png
    │       ├── 19-1.png
    │       ├── 19-2.jpeg
    │       ├── 19-3.jpeg
    │       ├── 2-1.png
    │       ├── 21-1.png
    │       ├── 21-2.jpeg
    │       ├── 22-1.png
    │       ├── 22-2.png
    │       ├── 22-3.png
    │       ├── 22-4.png
    │       ├── 23-1.jpeg
    │       ├── 23-2.png
    │       ├── 4-1.png
    │       ├── 5-1.jpeg
    │       ├── 9-1.png
    │       ├── newton.ipynb
    │       └── not_implement.png
    │   ├── change.py
    │   ├── lecture
    │   └── takeaway.md
└── tutorials
    ├── 0-intro.ipynb
    ├── assets
        └── 0-1.png
    ├── install.sh
    ├── requirements.txt
    ├── utils.py
    └── utils_0_intro.py


/.gitignore:
--------------------------------------------------------------------------------
1 | **/__pycache__/
2 | **/no_git/
3 | lecture/PPTs
4 | written_hw
5 | lecture/notes/*.txt
6 | 
7 | tutorials/backups/*
8 | .DS_Store


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "homework_repo"]
2 | 	path = homework_repo
3 | 	url = https://github.com/ZhaoHanHong/CS285_homework.git
4 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ./lecture/notes-zh/0-preliminaries.md
2 | 


--------------------------------------------------------------------------------
/lecture/notes-en/0-Markov-Chain.md:
--------------------------------------------------------------------------------
 1 | ## Markov Chain
 2 | 
 3 | $\mathcal{M}=\{\mathcal{S},\mathcal{A}, \mathcal{T}, r\}$, $r:\mathcal{S}\times\mathcal{A}\rightarrow\mathbb{R}$
 4 | 
 5 | $\mu_{t,j}=p(s_t=j),\xi_{t,k}=p(a_t=k),\mathcal{T}_{i,j,k}=p(s_{t+1}=i|s_t=j,a_t=k)$
 6 | 
 7 | $$
 8 | \mu_{t+1,i}=\sum_{j,k}\mu_{t,j}\xi_{t,k}\mathcal{T}_{i,j,k}
 9 | $$
10 | 
11 | Partially observable Markov decision process
12 | 
13 | $\mathcal{M}=\{\mathcal{S},\mathcal{A},\mathcal{O}, \mathcal{T}, \mathcal{E}, r\}$, $\mathcal{O}$ observation space, $\mathcal{E}$ emission probability $p(o_t|s_t)$
14 | 
15 | $$
16 | p_{\theta}(s_1,a_1,\dots,s_T,a_T)=p(s_1)\prod_{t=1}^T \pi_{\theta}(a_t|s_t)p(s_{t+1}|s_t,a_t)\\
17 | \theta^*=\arg\max_{\theta}\mathbb{E}_{\tau\sim p_{\theta}(\tau)}\left[\sum_{t=1}^T r(s_t,a_t)\right]=\arg\max_{\theta}\sum_{t=1}^T\mathbb{E}_{s_t, a_t\sim p_{\theta}(s_t,a_t)}[r(s_t,a_t)]
18 | $$
19 | 
20 | stationary distribution $\mu=\mathcal{T}\mu$, infinite horizon case
21 | 
22 | $$
23 | \theta^*=\arg\max_{\theta}\frac{1}{T}\sum_{t=1}^T\mathbb{E}_{s_t, a_t\sim p_{\theta}(s_t,a_t)}[r(s_t,a_t)]\rightarrow \arg\max_{\theta}\mathbb{E}_{(s,a)\sim p_{\theta}(s,a)}[r(s,a)]
24 | $$
25 | 


--------------------------------------------------------------------------------
/lecture/notes-en/1-Reinforcement-Learning.md:
--------------------------------------------------------------------------------
 1 | ## Reinforcement learning
 2 | 
 3 | Q-function
 4 | 
 5 | $$
 6 | Q^{\pi}(s_t,a_t)=\sum_{t'=t}^T\mathbb{E}_{\tau\sim p_{\pi}}[r(s_{t'},a_{t'})|s_t,a_t]
 7 | $$
 8 | 
 9 | Value function
10 | 
11 | $$
12 | V^{\pi}(s_t)=\sum_{t'=t}^T\mathbb{E}_{\tau\sim p_{\pi}}[r(s_{t'},a_{t'})|s_t]=\mathbb{E}_{a_t\sim\pi(a_t|s_t)}[Q^{\pi}(s_t,a_t)]
13 | $$
14 | 
15 | Four types of RL algorithms
16 | 
17 | - Policy gradient: differentiate the objective function
18 | 
19 | $$
20 | \mathbb{E}_{(s,a)\sim p_{\theta}(s,a)}[r(s,a)]\quad\theta\leftarrow\theta+\alpha\nabla_{\theta}\mathbb{E}_{(s,a)\sim p_{\theta}(s,a)}[r(s,a)]
21 | $$
22 | 
23 | - Value-based: estimate value function and Q-function $Q^{\pi}(s_t,a_t)$, set $\pi(s)=\arg\max_a Q^{\pi}(s,a)$
24 | - Actor-critic: estimate value function and Q-function, then update policy $\pi(a_t|s_t)$ by $\nabla_{\theta}\mathbb{E}(Q^{\pi}(s_t,a_t))$
25 | - Model-based RL: estimate transition probability $\mathcal{T}$ and reward function $r$, use it for planning and policy optimization (Backpropagation through actions or policy)
26 | 
27 | Off policy: improve policy without generating new samples<br>
28 | On policy: need to generate new samples even if the policy changes a little
29 | 
30 | Common assumptions: full observability, episodic learning (policy gradient methods, model-based RL methods), continuity or smoothness


--------------------------------------------------------------------------------
/lecture/notes-en/10-Control-as-Inference.md:
--------------------------------------------------------------------------------
  1 | ## Control as Inference
  2 | 
  3 | Use $O_t$ as some optimality binary variable, $O_t=1$ if $a_t$ is optimal, $O_t=0$ otherwise. We use $O_t$ to denote $O_t=1$ below
  4 | 
  5 | ![Alt Text](pic/control.png)
  6 | 
  7 | then use $p(O_t|s_t,a_t)\varpropto\exp(r(s_t,a_t))$, then
  8 | 
  9 | $$
 10 | p(\tau|O_{1:T})\varpropto p(\tau)\exp\left(\sum_{t=1}^T r(s_t,a_t)\right)
 11 | $$
 12 | 
 13 | How to do inference ?
 14 | 
 15 | 1. compute backward messages $\beta_t(s_t,a_t)=p(O_{t:T}|s_t,a_t)$
 16 | 2. compute policy $p(a_t|s_t,O_{1:T})$
 17 | 3. compute forward messages $\alpha_t(s_t)=p(s_t|O_{1:t-1})$
 18 | 
 19 | ### Backward messages
 20 | 
 21 | $$
 22 | \beta_t(s_t,a_t)=\int p(O_{t+1:T}|s_{t+1})p(s_{t+1}|s_t,a_t)p(O_t|s_t,a_t)ds_{t+1}\\
 23 | 
 24 | =p(O_t|s_t,a_t)\mathbb{E}_{s_{t+1}\sim p(s_{t+1}|s_t,a_t)}[\beta_{t+1}(s_{t+1})]
 25 | $$
 26 | 
 27 | and under prior $p(a_t|s_t)$
 28 | 
 29 | $$
 30 | \beta_t(s_t)=p(O_{t:T}|s_{t})=\mathbb{E}_{a_t\sim p(a_t|s_t)}[\beta_t(s_t,a_t)]
 31 | $$
 32 | 
 33 | we can solve $\beta_t(s_t,a_t)$ and $\beta_t(s_t)$ in reverse order of $t$
 34 | 
 35 | define $V_t(s_t)=\log\beta_t(s_t)$ and $Q_t(s_t,a_t)=\log\beta_t(s_t,a_t)$, we get
 36 | 
 37 | $$
 38 | V_t(s_t)=\log\mathbb{E}_{a_t\sim p(a_t|s_t)}[\exp(Q_t(s_t,a_t))]
 39 | $$
 40 | 
 41 | $V_t(s_t)\rightarrow\max_{a_t}Q_t(s_t,a_t)$ as $Q_t(s_t,a_t)$ gets larger and
 42 | 
 43 | $$
 44 | Q_t(s_t,a_t)=r(s_t,a_t)+\log\mathbb{E}_{s_{t+1}\sim p(s_{t+1}|s_t,a_t)}[\exp(V_{t+1}(s_{t+1}))]
 45 | $$
 46 | 
 47 | under deterministic transition, the above equation is equivalent to value iteration.
 48 | 
 49 | But under stochastic transitions, this backup is too optimistic: by Jensen's inequality, $\log\mathbb{E}[\exp(V_{t+1}(s_{t+1}))]\geq\mathbb{E}[V_{t+1}(s_{t+1})]$
 50 | 
 51 | If action prior $p(a_t|s_t)$ is not uniform
 52 | 
 53 | $$
 54 | V(s_t)=\log\int\exp\left(Q(s_t,a_t)+\log p(a_t|s_t)\right)da_t
 55 | $$
 56 | 
 57 | define
 58 | 
 59 | $$
 60 | \tilde{Q}(s_t,a_t)=r(s_t,a_t)+\log p(a_t|s_t)+\log\mathbb{E}_{s_{t+1}\sim p(s_{t+1}|s_t,a_t)}[\exp(V_{t+1}(s_{t+1}))]
 61 | $$
 62 | 
 63 | the relation of $\tilde{Q}$ and $V$ under uniform prior is same as $Q$ and $V$ under $p(a_t|s_t)$
 64 | 
 65 | Always fold the action prior into the reward, we can assume $p(a_t|s_t)$ is uniform w.l.o.g
 66 | 
 67 | ### Policy computation
 68 | 
 69 | Assume $p(a_t|s_t)$ is uniform
 70 | 
 71 | for $t=T-1$ to 1:
 72 | 
 73 | 1. $Q_t(s_t,a_t)=r(s_t,a_t)+\log\mathbb{E}_{s_{t+1}\sim p(s_{t+1}|s_t,a_t)}[\exp(V_{t+1}(s_{t+1}))]=\log\beta_t(s_t,a_t)$
 74 | 2. $V_t(s_t)=\log\int\exp(Q_t(s_t,a_t))da_t=\log\beta_t(s_t)$
 75 | 
 76 | use Bayes' rule, $p(a_t|s_t,O_{1:T})=\frac{\beta_t(s_t,a_t)}{\beta_t(s_t)}p(a_t|s_t)$
 77 | 
 78 | $$
 79 | \pi(a_t|s_t)=\frac{\beta_t(s_t,a_t)}{\beta_t(s_t)}=\exp(Q_t(s_t,a_t)-V_t(s_t))
 80 | $$
 81 | 
 82 | which is analogous to Boltzmann exploration
 83 | 
 84 | ### Forward messages
 85 | 
 86 | assume $\alpha_1(s_1)=p(s_1)$ is usually known and prior $p(a_{t}|s_t)$ is uniform
 87 | 
 88 | $$
 89 | \alpha_t(s_t)=\int p(s_t,s_{t-1},a_{t-1}|O_{1:t-1})ds_{t-1}da_{t-1}\\
 90 | 
 91 | =\int p(s_t|s_{t-1},a_{t-1}, O_{1:t-1})p(a_{t-1}|s_{t-1},O_{1:t-1})p(s_{t-1}|O_{1:t-1})ds_{t-1}da_{t-1}\\
 92 | 
 93 | =\int p(s_t|s_{t-1},a_{t-1})p(a_{t-1}|s_{t-1}, O_{t-1})p(s_{t-1}|O_{1:t-1})ds_{t-1}da_{t-1}
 94 | $$
 95 | 
 96 | the final step comes from the property of Markov chain, simplify
 97 | 
 98 | $$
 99 | p(a_{t-1}|s_{t-1},O_{t-1})p(s_{t-1}|O_{1:t-1})=\frac{p(O_{t-1}|s_{t-1},a_{t-1})p(a_{t-1}|s_{t-1})}{p(O_{t-1}|s_{t-1})}\frac{p(s_{t-1}|O_{1:t-2})p(O_{t-1}|s_{t-1})}{p(O_{t-1}|O_{1:t-2})}\\
100 | 
101 | =\frac{p(O_{t-1}|s_{t-1},a_{t-1})p(a_{t-1}|s_{t-1})}{p(O_{t-1}|O_{1:t-2})}\alpha_{t-1}(s_{t-1})
102 | $$
103 | 
104 | hence
105 | 
106 | $$
107 | \alpha_t(s_t)=\int p(s_t|s_{t-1},a_{t-1})\frac{p(O_{t-1}|s_{t-1},a_{t-1})p(a_{t-1}|s_{t-1})}{p(O_{t-1}|O_{1:t-2})}\alpha_{t-1}(s_{t-1})ds_{t-1}da_{t-1}\\
108 | 
109 | \varpropto\int p(s_t|s_{t-1},a_{t-1})\exp(r(s_{t-1},a_{t-1}))\alpha_{t-1}(s_{t-1})ds_{t-1}da_{t-1}
110 | $$
111 | 
112 | similarly
113 | 
114 | $$
115 | p(s_t|O_{1:T})=\frac{p(O_{1:T}|s_t)p(s_t)}{p(O_{1:T})}=\frac{p(O_{t:T}|s_t)p(s_t,O_{1:t-1})}{p(O_{1:T})}\varpropto\beta_t(s_t)\alpha_t(s_t)
116 | $$
117 | 
118 | Intuition: high $\alpha_t(s_t)$ shows state with high probability of being reached from initial state with high reward, high $\beta_t(s_t)$ shows state with high probability of reaching goal state with high reward
119 | 
120 | ### Control as Variational Inference
121 | 
122 | Inference problem: compute $p(\tau|O_{1:T})$, marginalize and condition, we can get $p(a_t|s_t,O_{1:T})$ which is the policy and $p(s_{t+1}|s_t,a_t,O_{1:T})$
123 | 
124 | However, $p(s_{t+1}|s_t,a_t,O_{1:T})\neq p(s_{t+1}|s_t,a_t)$, which is the real transition model.
125 | 
126 | Our goal is to find another trajectory distribution $q(s_{1:T},a_{1:T})$ that is close to $p(s_{1:T},a_{1:T}|O_{1:T})$ but has the true dynamics $p(s_{t+1}|s_t,a_t)$
127 | 
128 | $$
129 | q(s_{1:T},a_{1:T})=p(s_1)\prod_t p(s_{t+1}|s_t,a_t)q(a_t|s_t)
130 | $$
131 | 
132 | using variational lower bound
133 | 
134 | $$
135 | \log p(O_{1:T})\geq\mathbb{E}_{q(s_{1:T},a_{1:T})}[\log p(O_{1:T},s_{1:T},a_{1:T})-\log q(s_{1:T},a_{1:T})]
136 | $$
137 | 
138 | and since
139 | 
140 | $$
141 | p(O_{1:T},s_{1:T},a_{1:T})=p(s_1)\prod_t p(s_{t+1}|s_t,a_t)p(a_t|s_t)p(O_t|s_t,a_t)
142 | $$
143 | 
144 | since we assume $p(a_t|s_t)$ is uniform, we can get
145 | 
146 | $$
147 | \log p(O_{1:T})\geq\mathbb{E}_{q(s_{1:T},a_{1:T})}[\sum_t\log p(O_t|s_t,a_t)-\log q(a_t|s_t)]\\
148 | 
149 | =\sum_t\mathbb{E}_{(s_t,a_t)\sim q(s_t,a_t)}[r(s_t,a_t)+\mathcal{H}(q(a_t|s_t))]
150 | $$
151 | 
152 | our goal is maximize $p(O_{1:T})$, we can maximize the lower bound
153 | 
154 | $$
155 | q(a_T|s_T)=\arg\max_{q(a_T|s_T)} \mathbb{E}_{s_T\sim q(s_T)}[\mathbb{E}_{a_T\sim q(a_T|s_T)}[r(s_T,a_T)-\log q(a_T|s_T)]]
156 | $$
157 | 
158 | optimal when
159 | 
160 | $$
161 | q(a_T|s_T)=\frac{\exp(r(s_T,a_T))}{\int\exp(r(s_T,a_T))da_T}=\exp(Q(s_T,a_T)-V(s_T))\\
162 | 
163 | \mathbb{E}_{s_T\sim q(s_T)}[\mathbb{E}_{a_T\sim q(a_T|s_T)}[r(s_T,a_T)-\log q(a_T|s_T)]]=\mathbb{E}_{s_T\sim q(s_T)}[V(s_T)]
164 | $$
165 | 
166 | use math induction, max $q(a_t|s_t)$ in the reverse order of $t$, we know
167 | 
168 | $$
169 | \max_{q}\sum_{t=t_0}^T \mathbb{E}_{s_t\sim q(s_t)}[\mathbb{E}_{a_t\sim q(a_t|s_t)}[r(s_t,a_t)-\log q(a_t|s_t)]]= \mathbb{E}_{s_{t_0}\sim q(s_{t_0})}[V(s_{t_0})]
170 | $$
171 | 
172 | hence
173 | 
174 | $$
175 | q(a_t|s_t)=\arg\max_{q(a_t|s_t)} \mathbb{E}_{s_t\sim q(s_t)}[\mathbb{E}_{a_t\sim q(a_t|s_t)} [r(s_t,a_t)+\mathbb{E}_{s_{t+1}\sim p(s_{t+1}|s_t,a_t)}[V(s_{t+1})]-\log q(a_t|s_t)]]\\
176 | 
177 | =\exp(Q(s_t,a_t)-V(s_t))
178 | $$
179 | 
180 | where $Q(s_t,a_t)=r(s_t,a_t)+\mathbb{E}_{s_{t+1}\sim p(s_{t+1}|s_t,a_t)}[V(s_{t+1})]$, which is regular Bellman backup, not the one in control as inference
181 | 
182 | ### Algorithm as Inference
183 | 
184 | soft policy gradient with soft optimality: also known as maximum entropy RL
185 | 
186 | $$
187 | J(\pi)=\sum_{t=0}^{T-1}\mathbb{E}_{(s_t,a_t)\sim\rho_\pi}[r(s_t,a_t)+ \mathcal{H}(\pi(\cdot|s_t))]=\mathbb{E}_{(s,a)\sim\rho_{\pi_\theta}}[Q^\pi(s,a)+\mathcal{H}(\pi(\cdot|s))]
188 | $$
189 | 
190 | under the framework of maximum entropy RL, the soft $Q$-function is
191 | 
192 | $$
193 | Q^\pi(s_t,a_t)=r(s_t,a_t)+\sum_{i>t}\gamma^{i-t}\mathbb{E}_{(s_i,a_i)\sim\rho_\pi}[r(s_i,a_i)+\mathcal{H}(\pi(\cdot|s_i))]\\
194 | 
195 | =r(s_t,a_t)+\gamma\mathbb{E}_{s_{t+1}\sim p(\cdot|s_t,a_t)}[\mathbb{E}_{a_{t+1}\sim\pi(\cdot|s_{t+1})}[Q^\pi(s_{t+1},a_{t+1})]+\mathcal{H}(\pi(\cdot|s_{t+1}))]
196 | $$
197 | 
198 | then
199 | 
200 | $$
201 | \nabla_\theta J(\pi_\theta)=\mathbb{E}_{(s,a)\sim\rho_{\pi_\theta}}[(Q^\pi(s,a)-\log\pi_\theta(a|s)-1)\nabla_\theta\log\pi_\theta(a|s)]
202 | $$
203 | 
204 | or soft $Q$-learning
205 | 
206 | $$
207 | Q(s,a)\leftarrow r(s,a)+\gamma\mathbb{E}_{s'\sim p(\cdot|s,a), a'\sim\pi(\cdot|s')}[Q(s',a')-\log\pi(a'|s')]\\
208 | 
209 | \pi_{\text{new}}(a|s)=\arg\min_{\pi'} D_{KL}(\pi'(\cdot|s)||\frac{1}{Z}\exp Q^{\pi_{\text{old}}}(s,\cdot))
210 | $$
211 | 
212 | Benefit of soft optimality: improves exploration, makes the policy easier to finetune, more robust, easy to reduce to hard optimality
213 | 


--------------------------------------------------------------------------------
/lecture/notes-en/11-Inverse-RL.md:
--------------------------------------------------------------------------------
  1 | ## Inverse Reinforcement Learning
  2 | 
  3 | Given state $s\in\mathcal{S}$, action $a\in\mathcal{A}$, transition model $p(s'|s,a)$ (optional), trajectory $\{\tau_i\}$ sampled from expert policy, learn reward function $r_\psi(s,a)$, then use it to learn $\pi^*(a|s)$
  4 | 
  5 | Linear reward function $r_\psi(s,a)=\sum_i\psi_i f_i(s,a)=\psi^T f(s,a)$, where $f_i(s,a)$ is feature function
  6 | 
  7 | If feature is important, we want to match expectation: $\mathbb{E}_{\pi^{r_\psi}}[f(s,a)]=\mathbb{E}_{\pi^*}[f(s,a)]$
  8 | 
  9 | however this is ambiguous, since different reward function can lead to same policy. One way is use maximum margin
 10 | 
 11 | $$
 12 | \max_{\psi,m} m\quad\text{ s.t. }\psi^T\mathbb{E}_{\pi^*}[f(s,a)]\geq\max_{\pi\in\Pi}\psi^T\mathbb{E}_{\pi}[f(s,a)]+m
 13 | $$
 14 | 
 15 | using SVM trick
 16 | 
 17 | $$
 18 | \min_{\psi} \frac{1}{2}||\psi||^2\quad\text{ s.t. }\psi^T\mathbb{E}_{\pi^*}[f(s,a)]\geq\max_{\pi\in\Pi}\left(\psi^T\mathbb{E}_{\pi}[f(s,a)]+D(\pi,\pi^*)\right)
 19 | $$
 20 | 
 21 | for policies that differ more from the expert, we want the margin to be larger, which is what the $D(\pi,\pi^*)$ term enforces
 22 | 
 23 | Issue: no clear model of expert suboptimality, messy constrained optimization
 24 | 
 25 | Learning the optimality variable: use $O_t$ to denote optimality, then $p(\tau|O_{1:T},\psi)\varpropto p(\tau)\exp(\sum_{t=1}^T r_\psi(s_t,a_t))$
 26 | 
 27 | $$
 28 | \max_{\psi}\frac{1}{N}\sum_{i=1}^N \log p(\tau_i|O_{1:T},\psi)=\max_\psi\frac{1}{N}\sum_{i=1}^N r_\psi(\tau_i)-\log Z
 29 | $$
 30 | 
 31 | however $Z=\int p(\tau)\exp(r_\psi(\tau))d\tau$ is intractable, but we can use Monte Carlo to estimate it
 32 | 
 33 | $$
 34 | \nabla_\psi\mathcal{L}=\frac{1}{N}\sum_{i=1}^N\nabla_\psi r_\psi(\tau_i)-\frac{1}{Z}\int p(\tau)\exp(r_\psi(\tau))\nabla_\psi r_\psi(\tau)d\tau\\
 35 | 
 36 | =\mathbb{E}_{\tau\sim\pi^*(\tau)}[\nabla_\psi r_\psi(\tau)]-\mathbb{E}_{\tau\sim p(\tau|O_{1:T},\psi)}[\nabla_\psi r_\psi(\tau)]
 37 | $$
 38 | 
 39 | The first term estimate with expert samples, the second term is soft optimal policy under current reward
 40 | 
 41 | simplify the second term
 42 | 
 43 | $$
 44 | \mathbb{E}_{\tau\sim p(\tau|O_{1:T},\psi)}[\nabla_\psi r_\psi(\tau)]=\sum_{t=1}^T\mathbb{E}_{(s_t,a_t)\sim p(s_t,a_t|O_{1:T},\psi)}[\nabla_\psi r_\psi(s_t,a_t)]\\
 45 | 
 46 | =\sum_{t=1}^T\mathbb{E}_{s_t\sim p(s_t|O_{1:T},\psi), a_t\sim p(a_t|s_t,O_{1:T},\psi)}[\nabla_\psi r_\psi(s_t,a_t)]
 47 | $$
 48 | 
 49 | where
 50 | 
 51 | $$
 52 | p(s_t|O_{1:T},\psi)\varpropto\alpha(s_t)\beta(s_t)\\
 53 | 
 54 | p(a_t|s_t,O_{1:T},\psi)\varpropto\pi(a_t|s_t)=\frac{\beta(s_t,a_t)}{\beta(s_t)}
 55 | $$
 56 | 
 57 | define $\mu_t(s_t,a_t)\varpropto\alpha(s_t)\beta(s_t,a_t)$ as a state-action visitation probability, then
 58 | 
 59 | $$
 60 | \mathbb{E}_{\tau\sim p(\tau|O_{1:T},\psi)}[\nabla_\psi r_\psi(\tau)]=\sum_{t=1}^T\mu_t^T\nabla_\psi r_\psi
 61 | $$
 62 | 
 63 | MaxEnt IRL algorithm:
 64 | 
 65 | 1. Given $\psi$, compute backward message $\beta(s_t,a_t)$ and forward message $\alpha(s_t)$
 66 | 2. Compute $\mu_t(s_t,a_t)\varpropto\alpha(s_t)\beta(s_t,a_t)$, then evaluate $\nabla_\psi\mathcal{L}=\frac{1}{N}\sum_{i=1}^N\nabla_\psi r_\psi(\tau_i)-\sum_{t=1}^T\mu_t^T\nabla_\psi r_\psi$
 67 | 3. Update $\psi\leftarrow\psi+\alpha\nabla_\psi\mathcal{L}$
 68 | 
 69 | In the case of $r_\psi=\psi^Tf$, it is equivalent to solving $\max_\psi\mathcal{H}(\pi^{r_\psi})$ subject to $\mathbb{E}_{\pi^{r_\psi}}[f]=\mathbb{E}_{\pi^*}[f]$
 70 | 
 71 | Idea: learn $p(a_t|s_t,O_{1:T},\psi)$ using max-ent RL algorithm, then use it to sample $\{\tau_j\}$ using learned policy.
 72 | 
 73 | However, it's expensive to fully train the policy each time, so we can use lazy policy optimization
 74 | 
 75 | $$
 76 | \nabla_\psi\mathcal{L}=\frac{1}{N}\sum_{i=1}^N\nabla_\psi r_\psi(\tau_i)-\frac{1}{\sum_j w_j}\sum_{j=1}^M w_j\nabla_\psi r_\psi(\tau_j)
 77 | $$
 78 | 
 79 | where
 80 | 
 81 | $$
 82 | w_j=\frac{p(\tau_j)\exp(r_\psi(\tau_j))}{\pi(\tau_j)}=\frac{p(s_1)\prod_t p(s_{t+1}|s_t,a_t)\exp(r_\psi(s_t,a_t))}{p(s_1)\prod_t p(s_{t+1}|s_t,a_t)\pi(a_t|s_t)}=\prod_t\frac{\exp(r_\psi(s_t,a_t))}{\pi(a_t|s_t)}
 83 | $$
 84 | 
 85 | $\pi$ denotes current policy, importance sampling allows us to use samples from lazy-optimized policy to estimate the gradient of the optimal policy according to the current reward
 86 | 
 87 | At a higher level, it looks like a game: reward optimization makes demos more likely and samples less likely, while policy optimization makes samples harder to distinguish from demos
 88 | 
 89 | Reward $r_\psi$, policy $\pi_\theta$, we can use alternating optimization
 90 | 
 91 | $$
 92 | \nabla_\psi\mathcal{L}=\frac{1}{N}\sum_{i=1}^N\nabla_\psi r_\psi(\tau_i)-\frac{1}{\sum_j w_j}\sum_{j=1}^M w_j\nabla_\psi r_\psi(\tau_j)\\
 93 | 
 94 | \nabla_\theta\mathcal{L}=\frac{1}{M}\sum_{j=1}^M\nabla_\theta\log\pi_\theta(\tau_j)r_\psi(\tau_j)
 95 | $$
 96 | 
 97 | Like GAN with generator $p_\theta$ and discriminator $p_\psi$
 98 | 
 99 | $$
100 | \psi=\arg\max_\psi\frac{1}{N}\sum_{x\sim p^*}\log p_\psi(x)+\frac{1}{M}\sum_{x\sim p_\theta}\log(1-p_\psi(x))\\
101 | 
102 | \theta=\arg\max_\theta\frac{1}{M}\sum_{x\sim p_\theta}\log p_\psi(x)
103 | $$
104 | 
105 | best discriminator $p_\psi$ is $p_\psi(x)=\frac{p^*(x)}{p^*(x)+p_\theta(x)}$
106 | 
107 | We can treat inverse RL as a GAN, optimal policy approach $\pi_\theta(\tau)\varpropto p(\tau)\exp(r_\psi(\tau))$
108 | 
109 | $$
110 | D_\psi(\tau)=\frac{\frac{1}{Z}p(\tau)\exp(r(\tau))}{p_\theta(\tau)+\frac{1}{Z}p(\tau)\exp(r(\tau))}=\frac{\frac{1}{Z}\exp(r(\tau))}{\prod_t\pi_\theta(a_t|s_t)+\frac{1}{Z}\exp(r(\tau))}
111 | $$
112 | 
113 | then optimize $\psi$ by
114 | 
115 | $$
116 | \psi\leftarrow\arg\max_\psi\mathbb{E}_{\tau\sim p^*}[\log D_\psi(\tau)]+\mathbb{E}_{\tau\sim\pi_\theta}[\log(1-D_\psi(\tau))]
117 | $$
118 | 
119 | we don't need importance sampling, since the ratio is already contained in $Z$. But $Z$ is intractable, we can optimize $Z$ w.r.t same objective (see [https://arxiv.org/pdf/1611.03852](https://arxiv.org/pdf/1611.03852))
120 | 


--------------------------------------------------------------------------------
/lecture/notes-en/12-RL-wtih-Sequence-Model.md:
--------------------------------------------------------------------------------
 1 | ## RL with Sequence Model
 2 | 
 3 | Which methods handle partial observability ?
 4 | 
 5 | Policy gradient:
 6 | 
 7 | $$
 8 | \nabla_\theta J(\theta)=\frac{1}{N}\sum_{i=1}^N\left(\sum_{t=1}^T \nabla_\theta\log\pi_\theta(a_{i,t}|o_{i,t})\right)\left(\sum_{t=1}^T r(o_{i,t},a_{i,t})\right)
 9 | $$
10 | 
11 | But using $r_{i,t}+\gamma\hat{V}(o_{i,t+1})-\hat{V}(o_{i,t})$ as the advantage is incorrect, since $o$ loses the Markov property
12 | 
13 | Also, $Q(o,a)\leftarrow r(o,a)+\gamma\max_{a'}Q(o',a')$ is also incorrect, since $o$ is not Markov. Because past observations matter for future observations
14 | 
15 | State space models: learn a Markovian state space
16 | 
17 | $$
18 | p(z)=p(z_1)\prod_t p(z_{t+1}|z_t,a_t)\\
19 | 
20 | p_\theta(o|z)=\prod_t p(o_t|z_t)\quad q_\phi(z|o)=\prod_t q_\phi(z_t|o_{1:t})
21 | $$
22 | 
23 | This works quite well, $Q(z,a)\leftarrow r(z,a)+\gamma\max_{a'}Q(z',a')$ is correct.
24 | 
25 | Since $q_\phi(z|o_{1:t})$ depends on all the previous observations, we use RNN to model it, which means state is a function of history. We can adjust
26 | 
27 | $$
28 | Q(o_{1:t},a)\leftarrow r(o_t,a)+\gamma\max_{a'}Q(o_{1:t+1},a')
29 | $$
30 | 
31 | Deep Recurrent Q-Network (DRQN): use RNN to model $q_\phi(z|o_{1:t})$
32 | 
33 | 1. Collect $(o_t,a_t,r_t,o_{t+1})$, get history by concatenating $o_{1:t-1}$ and add to replay buffer
34 | 2. Sample batch $\{(o_{1:t,i},a_{t,i},o_{t+1,i})\}$ from replay buffer
35 | 3. Update $Q$ function on batch
36 | 
37 | ### Language Model
38 | 
39 | Language models: $\pi_\theta(a|s)$ where $a$ is completion and $s$ is context/prompt/prefix. Use policy gradients
40 | 
41 | $$
42 | \nabla_\theta\mathbb{E}_{\pi_\theta(a|s)}[r(s,a)]=\frac{1}{N}\sum_{i=1}^N\frac{\pi_\theta(a_i|s)}{\bar{\pi}(a_i|s)}\nabla_\theta\log\pi_\theta(a_i|s)r(s,a_i)
43 | $$
44 | 
45 | where $r(s,a)$ is the reward of completion. We can use RNN to model $\pi_\theta(a|s)$
46 | 
47 | how to define reward ? Use reward from preference
48 | 
49 | $$
50 | p(a_1>a_2)=\frac{\exp(r(s,a_1))}{\exp(r(s,a_1))+\exp(r(s,a_2))}
51 | $$
52 | 
53 | 1. Run supervised learning to get initial $\pi_\theta(a|s)$
54 | 2. For each $s$ sample $K$ answers from $\pi_\theta(a|s)$, add to dataset $\mathcal{D}=\{(s_i,a_{i,1},\dots,a_{i,K})\}$
55 | 3. Get human to label which $a_{i,k}$ they prefer, train $r$ on labeled dataset
56 | 4. Update $\pi_\theta$ using RL with reward $r(s,a)$
57 | 
58 | Issues: human preferences are expensive, overoptimization, reward model needs to be very good
59 | 
60 | <div style="float:right;">
61 |     <img src="./pic/LM.png" alt="attention" width="300" height="200">
62 | </div>
63 | 
64 | Multi-step RL with language model:
65 | 
66 | action: bot says, observation: human says, state: the history, reward: dialogue outcome
67 | 
68 | Time step: per utterance: short horizon but huge action space, per token: simple action but long horizon
69 | 
70 | Compute value using pretrained language model
71 | 


--------------------------------------------------------------------------------
/lecture/notes-en/13-Transfer-Learning.md:
--------------------------------------------------------------------------------
  1 | ## Transfer Learning
  2 | 
  3 | Transfer learning: using experience from one set of tasks for faster learning and better performance on a new task
  4 | 
  5 | Issues: Domain shift, Difference in MDP, Fine-tuning issues
  6 | 
  7 | ### Domain adaptation
  8 | 
  9 | Same network may work in simulation but not in real world -> Invariance assumption: everything that is different between domains is irrelevant
 10 | 
 11 | Formally: $p(x)$ is different between domains, but exists $z=f(x)$ such that $p(y|z)=p(y|x)$ and $p(z)$ is the same between domains
 12 | 
 13 | Use domain classifier: $D_\phi(z)$ to distinguish between domains, use adversarial loss
 14 | 
 15 | When dynamics don't match, invariance is not enough because we don't want to ignore functionally relevant differences, define $\Delta_r$ as
 16 | 
 17 | $$
 18 | \Delta_r(s_t,a_t,s_{t+1})=\left(\log p(\text{target}|s_t,a_t,s_{t+1})-\log p(\text{source}|s_t,a_t,s_{t+1})\right)\\
 19 | 
 20 | -\left(\log p(\text{target}|s_t,a_t)-\log p(\text{source}|s_t,a_t)\right)
 21 | $$
 22 | 
 23 | which answers the question: for the task of predicting whether a transition came from the source or target domain, how much better can you perform after observing $s_{t+1}$.
 24 | 
 25 | Define
 26 | 
 27 | $$
 28 | \tilde{r}(s_t,a_t,s_{t+1})=r(s_t,a_t)+\lambda\Delta_r(s_t,a_t,s_{t+1})
 29 | $$
 30 | 
 31 | then perform RL using $\tilde{r}$
 32 | 
 33 | Intuition: the more varied the training domain is, the more likely we are to generalize in zero shot to a slightly different domain
 34 | 
 35 | Randomization: randomizing physical parameters (EPOpt)
 36 | 
 37 | Contextual policies: $\pi_\theta(a|s,\omega)$ where $\omega$ is context, define augmented state space $\tilde{\mathcal{S}}=\mathcal{S}\times\Omega$
 38 | 
 39 | Goal-conditioned policies: $\pi_\theta(a|s,g)$, reward as $r(s,a,g)=\delta(s=g)$ in discrete case and $r(s,a,g)=\delta(||s-g||\leq\epsilon)$ in continuous case
 40 | 
 41 | ### Meta Learning
 42 | 
 43 | Meta learning: learn to learn, learn a policy that can quickly adapt to new tasks
 44 | 
 45 | supervised learning: $f(x)\rightarrow y$, meta learning: $f(\mathcal{D},x)\rightarrow y$
 46 | 
 47 | $$
 48 | \theta^*=\arg\min_\theta\sum_{i=1}^n\mathcal{L}(f_\theta(\mathcal{D}^{\text{train}}_i),\mathcal{D}^{\text{test}}_i)
 49 | $$
 50 | 
 51 | Meta RL:
 52 | 
 53 | $$
 54 | \theta^*=\arg\max_\theta\sum_{i=1}^n\mathbb{E}_{\pi_{\phi_i}(\tau)}[R(\tau)]
 55 | $$
 56 | 
 57 | where $\phi_i=f_\theta(\mathcal{M}_i)$, $\mathcal{M}_i$ is MDP for task $i$. Assume $\mathcal{M}_i\sim p(\mathcal{M})$
 58 | 
 59 | Meta test: sample $\mathcal{M}_{\text{test}}\sim p(\mathcal{M})$, get $\phi_{\text{test}}=f_\theta(\mathcal{M}_{\text{test}})$, then get policy $\pi_{\phi_{\text{test}}}$
 60 | 
 61 | <div style="float:right;">
 62 |     <img src="./pic/RNN.png" alt="attention" width="300" height="170">
 63 | </div>
 64 | 
 65 | Meta-RL with recurrent policy: what should $f_\theta(\mathcal{M}_i)$ do
 66 | 
 67 | 1. improve policy with experience from $\mathcal{M}_i$
 68 | 2. choose how to interact, how to explore
 69 | 
 70 | Gradient-based meta-RL: $f_\theta(\mathcal{M}_i)$ can be an RL algorithm, such as
 71 | 
 72 | $$
 73 | f_\theta(\mathcal{M}_i)=\theta+\alpha\nabla_\theta J_i(\theta)
 74 | $$
 75 | 
 76 | where $J_i$ requires interacting with $\mathcal{M}_i$ to estimate $\nabla_\theta\mathbb{E}_{\pi_\theta}[R(\tau)]$ e.g.
 77 | 
 78 | $$
 79 | \theta'_i=\theta-\alpha\sum_{(x,y)\in\mathcal{D}_i^{\text{train}}}\nabla_\theta\mathcal{L}_i(f_\theta(x),y)\\
 80 | 
 81 | \theta\leftarrow\theta-\beta\nabla_\theta\sum_{i}\mathcal{L}_i(f_{\theta'_i}(\mathcal{D}_i^{\text{test}}),\mathcal{D}_i^{\text{test}})
 82 | $$
 83 | 
 84 | final output network $f_{\text{MAML}}(\mathcal{D}^{\text{train}},x)=f_{\theta'}(x)$
 85 | 
 86 | Meta-RL as partially observed RL: $\tilde{\mathcal{M}}=\{\tilde{\mathcal{S}},\mathcal{A},\tilde{\mathcal{O}},\tilde{\mathcal{P}},\mathcal{E},r\}$ where $\tilde{\mathcal{S}}=\mathcal{S}\times\mathcal{Z}$, $\tilde{\mathcal{O}}=\mathcal{S}$.
 87 | 
 88 | $z$ encapsulates the information the policy needs to solve the current task. Learning a task equals inferring $z$
 89 | 
 90 | Solving POMDP $\tilde{\mathcal{M}}$ is equivalent to solving meta-learning.
 91 | 
 92 | Exploring via posterior sampling with latent context
 93 | 
 94 | 1. sample $z\sim\hat{p}(z_t|s_{1:t},a_{1:t},r_{1:t})$, use variational inference to approximate $\hat{p}$ with $q_\phi$
 95 | 2. act according to $\pi(a|s,z)$ to collect more data
 96 | 
 97 | $$
 98 | (\theta,\phi)=\arg\max_{\theta,\phi}\frac{1}{N}\sum_{i=1}^N\mathbb{E}_{z\sim q_\phi, \tau\sim\pi_\theta}[R(\tau)-D_{KL}(q(z|s_{1:t},a_{1:t},r_{1:t})||p(z))]
 99 | $$
100 | 
101 | similar to RNN meta-RL, but with stochastic $z$ which enables exploration
102 | 
103 | Conclusion: three perspectives of meta-RL
104 | 
105 | 1. just RNN: conceptually simple, easy to apply, but vulnerable to overfitting and challenging to optimize
106 | 2. bi-level optimization: good consistency, conceptually elegant, but requires many samples
107 | 3. inference: simple exploration, elegant reduction to POMDP, but vulnerable to overfitting and challenging to optimize
108 | 


--------------------------------------------------------------------------------
/lecture/notes-en/2-Policy-Gradient.md:
--------------------------------------------------------------------------------
  1 | ## Policy gradient
  2 | 
  3 | learning objective
  4 | 
  5 | $$
  6 | J(\theta)=\mathbb{E}_{(s,a)\sim p_{\theta}(s,a)}[r(s,a)]=\int p_{\theta}(\tau)r(\tau)d\tau\\
  7 | 
  8 | \nabla_{\theta}J(\theta)=\int\nabla_{\theta}p_{\theta}(\tau)r(\tau)d\tau=\mathbb{E}_{\tau\sim p_{\theta}}[\nabla_{\theta}\log p_{\theta}(\tau)r(\tau)]\\
  9 | 
 10 | =\mathbb{E}_{\tau\sim p_{\theta}}[\left(\sum_{t=1}^T\nabla_{\theta}\log\pi_{\theta}(a_t|s_t)\right)\left(\sum_{t=1}^T r(s_{t},a_{t})\right)]
 11 | $$
 12 | 
 13 | compute gradients by sampling
 14 | 
 15 | To reduce variance, subtract a baseline $b$ from the reward (this doesn't affect the expectation because $\mathbb{E}_{\tau\sim p_{\theta}}[\nabla_{\theta}\log\pi_{\theta}(a_t|s_t)]=0$). A simple choice is the expected reward; the variance-minimizing baseline is
 16 | 
 17 | $$
 18 | b=\frac{\mathbb{E}_{\tau\sim p_{\theta}}[\left(\sum_{t=1}^T r(s_{t},a_{t})\right)\left(\sum_{t=1}^T\nabla_{\theta}\log\pi_{\theta}(a_t|s_t)\right)^2]}{\mathbb{E}_{\tau\sim p_{\theta}}[\left(\sum_{t=1}^T\nabla_{\theta}\log\pi_{\theta}(a_t|s_t)\right)^2]}
 19 | $$
 20 | 
 21 | However, the baseline is hard to compute, most of time we don't use such way
 22 | 
 23 | Causality: the policy at time $t'$ can't affect the reward at time $t<t'$, so $\nabla_{\theta}\log\pi_{\theta}(a_t|s_t)$ shouldn't be weighted by $r(s_{t'},a_{t'})$ for $t'<t$
 24 | 
 25 | $$
 26 | \nabla_{\theta}J(\theta)=\mathbb{E}_{\tau\sim p_{\theta}}[\left(\sum_{t=1}^T\nabla_{\theta}\log\pi_{\theta}(a_t|s_t)\left(\sum_{t'=t}^T r(s_{t'},a_{t'})\right)\right)]
 27 | $$
 28 | 
 29 | this is the same as the original equation because
 30 | 
 31 | $$
 32 | \mathbb{E}_{\tau\sim p_{\theta}}[\sum_{t=1}^T\nabla_{\theta}\log\pi_{\theta}(a_t|s_t)\left(\sum_{t'<t}r(s_{t'},a_{t'})\right)]=0
 33 | $$
 34 | 
 35 | we only use causality to reduce variance
 36 | 
 37 | Off-policy with importance sampling:
 38 | 
 39 | $$
 40 | \nabla_{\theta'}J(\theta')=\mathbb{E}_{\tau\sim p_{\theta}}[\frac{p_{\theta'}(\tau)}{p_{\theta}(\tau)}\left(\sum_{t=1}^T\nabla_{\theta'}\log\pi_{\theta'}(a_t|s_t)\left(\sum_{t'=t}^T r(s_{t'},a_{t'})\right)\right)]\\
 41 | 
 42 | =\mathbb{E}_{\tau\sim p_{\theta}}\left[\left(\prod_{t'=1}^T\frac{\pi_{\theta'}(a_{t'}|s_{t'})}{\pi_{\theta}(a_{t'}|s_{t'})}\right)\sum_{t=1}^T\nabla_{\theta'}\log\pi_{\theta'}(a_t|s_t)\left(\sum_{t'=t}^T r(s_{t'},a_{t'})\right)\right]
 43 | $$
 44 | 
 45 | ignore reward at time $t'$ when computing gradient at time $t>t'$
 46 | 
 47 | $$
 48 | \nabla_{\theta'}J(\theta')=\mathbb{E}_{\tau\sim p_{\theta}}\left[\sum_{t=1}^T\nabla_{\theta'}\log\pi_{\theta'}(a_t|s_t)\left(\prod_{t'=1}^t\frac{\pi_{\theta'}(a_{t'}|s_{t'})}{\pi_{\theta}(a_{t'}|s_{t'})}\right)\left(\sum_{t'=t}^T r(s_{t'},a_{t'})\prod_{t''=t}^{t'}\frac{\pi_{\theta'}(a_{t''}|s_{t''})}{\pi_{\theta}(a_{t''}|s_{t''})}\right)\right]
 49 | $$
 50 | 
 51 | this equation is not trivial to compute
 52 | 
 53 | If we ignore the last term $\prod_{t''=t}^{t'}\frac{\pi_{\theta'}(a_{t''}|s_{t''})}{\pi_{\theta}(a_{t''}|s_{t''})}$, we get a policy iteration algorithm. However, the remaining product $\prod_{t'=1}^t\frac{\pi_{\theta'}(a_{t'}|s_{t'})}{\pi_{\theta}(a_{t'}|s_{t'})}$ might be exponentially small in $t$: for actions sampled from $\pi_{\theta}$, $\pi_{\theta}(a_{t'}|s_{t'})$ is likely to be large, so each ratio tends to be smaller than 1.
 54 | 
 55 | First-order approximation of IS: sample $(s_{i,t},a_{i,t})$ from $\pi_{\theta}(s_t,a_t)$, then
 56 | 
 57 | $$
 58 | \nabla_{\theta'}J(\theta')\approx\frac{1}{N}\sum_{i=1}^N\sum_{t=1}^T\frac{\pi_{\theta'}(s_{i,t},a_{i,t})}{\pi_\theta (s_{i,t},a_{i,t})}\nabla_{\theta'}\log\pi_{\theta'}(a_{i,t}|s_{i,t})\hat{Q}_{i,t}\\
 59 | 
 60 | \approx\frac{1}{N}\sum_{i=1}^N\sum_{t=1}^T\frac{\pi_{\theta'}(a_{i,t}|s_{i,t})}{\pi_\theta (a_{i,t}|s_{i,t})}\nabla_{\theta'}\log\pi_{\theta'}(a_{i,t}|s_{i,t})\hat{Q}_{i,t}
 61 | $$
 62 | 
 63 | You can see the intuition [here](https://github.com/Hidden-Hyperparameter/RL-notes/blob/master/lecture/notes-zh/5-policy_grad.md#with-first-order-approximation)
 64 | 
 65 | Rescale the gradient to avoid large jump
 66 | 
 67 | $$
 68 | \theta'\leftarrow\arg\max_{\theta'}(\theta'-\theta)^T\nabla_{\theta}J(\theta)\quad\text{s.t.}\quad D_{KL}(\pi_{\theta'},\pi_{\theta})\leq\epsilon
 69 | $$
 70 | 
 71 | Define the Fisher information matrix $F(\theta)=\mathbb{E}_{\pi_{\theta}}[\nabla_{\theta}\log\pi_{\theta}(a|s)\nabla_{\theta}\log\pi_{\theta}(a|s)^T]$; to second order, $D_{KL}(\pi_{\theta'},\pi_{\theta})\approx\frac{1}{2}(\theta'-\theta)^TF(\theta)(\theta'-\theta)$. Using a Lagrange multiplier, we get
 72 | 
 73 | $$
 74 | \theta'\leftarrow\theta+\alpha^j \sqrt{\frac{2\epsilon}{\nabla_\theta J(\theta)^TF(\theta)^{-1}\nabla_\theta J(\theta)}} F(\theta)^{-1}\nabla_{\theta}J(\theta)
 75 | $$
 76 | 
 77 | where $\alpha\in(0,1)$ and $j$ is chosen by backtracking line search: it is the smallest non-negative integer for which the update satisfies the constraint and improves the objective function
 78 | 
 79 | #### Trust region policy optimization (TRPO):
 80 | 
 81 | define $\eta(\tilde{\pi})=\mathbb{E}_{\tau\sim\tilde{\pi}}[\sum_{t=0}^{\infty}\gamma^t r(s_t,a_t)]$, which we want to maximize, then define
 82 | 
 83 | $$
 84 | A_{\pi}(s,a)=Q_{\pi}(s,a)-V_{\pi}(s)=\mathbb{E}_{s'\sim\mathcal{T}(s'|s,a)}[r(s,a)+\gamma V_{\pi}(s')-V_{\pi}(s)]
 85 | $$
 86 | 
 87 | which represents the advantage of taking action $a$ at state $s$ under policy $\pi$, then
 88 | 
 89 | $$
 90 | \mathbb{E}_{s,a\sim\tilde{\pi}}[\sum_{t=0}^{\infty}\gamma^t A_{\pi}(s_t,a_t)]=\mathbb{E}_{s,a\sim\tilde{\pi}}[\sum_{t=0}^{\infty}\gamma^t r(s_t,a_t)]-\mathbb{E}_{s_0\sim p(s_0)}[V_{\pi}(s_0)]\\
 91 | 
 92 | =\eta(\tilde{\pi})-\eta(\pi)
 93 | $$
 94 | 
 95 | so
 96 | 
 97 | $$
 98 | \eta(\tilde{\pi})=\eta(\pi)+\mathbb{E}_{s,a\sim\tilde{\pi}}[\sum_{t=0}^{\infty}\gamma^t A_{\pi}(s_t,a_t)]\\
 99 | 
100 | =\eta(\pi)+\sum_{t=0}^{\infty}\sum_s\gamma^t P(s_t=s|\tilde{\pi})\sum_a\tilde{\pi}(a|s)A_{\pi}(s,a)
101 | $$
102 | 
103 | define the discounted (unnormalized) state visitation frequency $\rho_{\tilde\pi}(s)=\sum_{t=0}^{\infty}\gamma^t P(s_t=s|\tilde{\pi})$, i.e. a weighted sum of state probabilities, then
104 | 
105 | $$
106 | \eta(\tilde{\pi})=\eta(\pi)+\sum_s\rho_{\tilde\pi}(s)\sum_a\tilde{\pi}(a|s)A_{\pi}(s,a)\\
107 | 
108 | =\eta(\pi)+\frac{1}{1-\gamma}\mathbb{E}_{s\sim\rho_{\tilde\pi},a\sim\tilde{\pi}}[A_{\pi}(s,a)]
109 | $$
110 | 
111 | Two tricks:
112 | 
113 | 1. estimate $\rho_{\tilde\pi}(s)$ using $\rho_{\pi}(s)$, because the difference between $\pi$ and $\tilde{\pi}$ is small
114 | 2. importance sampling (first order approximation)
115 | 
116 | $$
117 | \eta(\tilde{\pi})\approx\eta(\pi)+\frac{1}{1-\gamma}\mathbb{E}_{s\sim\rho_{\pi},a\sim\pi}\left[\frac{\tilde{\pi}_\theta(a|s)}{\pi_\theta (a|s)}A_{\pi}(s,a)\right]=L_{\pi}(\tilde{\pi})
118 | $$
119 | 
120 | Note that $L_\pi(\tilde{\pi})$ and $\eta(\tilde{\pi})$ have the same value and the same gradient at $\tilde{\pi}=\pi$. Define $\epsilon=\max_{s,a}|A_{\pi}(s,a)|$, then
121 | 
122 | $$
123 | \eta(\tilde{\pi})\geq L_{\pi}(\tilde{\pi})-\frac{4\epsilon\gamma}{(1-\gamma)^2}D^{\max}_{KL}(\pi,\tilde{\pi})
124 | $$
125 | 
126 | where the max is over all states $s$, i.e. $D^{\max}_{KL}(\pi,\tilde{\pi})=\max_s D_{KL}(\pi(\cdot|s),\tilde{\pi}(\cdot|s))$. One can prove that maximizing this lower bound guarantees monotonic improvement of the original objective $\eta$, and the KL-divergence is simply used to measure the policy change. Our optimization problem becomes
127 | 
128 | $$
129 | \max_{\tilde{\pi}}\left[\mathbb{E}_{s\sim\rho_{\pi},a\sim\pi}\left[\frac{\tilde{\pi}_\theta(a|s)}{\pi_\theta (a|s)}A_{\pi}(s,a)\right]-\frac{4\epsilon\gamma}{1-\gamma}D^{\max}_{KL}(\pi,\tilde{\pi})\right]
130 | $$
131 | 
132 | if we use the penalty coefficient $\frac{4\epsilon\gamma}{1-\gamma}$, the resulting step size is very small, so instead we solve a constrained optimization problem
133 | 
134 | $$
135 | \max_{\tilde{\pi}}\mathbb{E}_{s\sim\rho_{\pi},a\sim\pi}\left[\frac{\tilde{\pi}_\theta(a|s)}{\pi_\theta (a|s)}A_{\pi}(s,a)\right]\quad\text{s.t.}\quad D^{\max}_{KL}(\pi,\tilde{\pi})\leq\delta
136 | $$
137 | 
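138 | but since $D^{\max}_{KL}(\pi,\tilde{\pi})=\max_s D_{KL}(\pi(\cdot|s),\tilde{\pi}(\cdot|s))$, this imposes one constraint per state (infinitely many in general). We relax it to the average KL-divergence over states drawn from $\rho_\pi$, $D^{\rho_\pi}_{KL}(\pi,\tilde{\pi})=\mathbb{E}_{s\sim\rho_\pi}[D_{KL}(\pi(\cdot|s),\tilde{\pi}(\cdot|s))]$: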
139 | 
140 | $$
141 | \max_{\tilde{\pi}}\mathbb{E}_{s\sim\rho_{\pi},a\sim\pi}\left[\frac{\tilde{\pi}_\theta(a|s)}{\pi_\theta (a|s)}A_{\pi}(s,a)\right]\quad\text{s.t.}\quad D^{\rho_\pi}_{KL}(\pi,\tilde{\pi})\leq\delta
142 | $$
143 | 
144 | then see the optimization step above to get the estimate of $D^{\rho_\pi}_{KL}(\pi,\tilde{\pi})$, or maximize
145 | 
146 | $$
147 | \mathcal{L}(\theta',\lambda)=\mathbb{E}_{s\sim\rho_{\pi},a\sim\pi}\left[\frac{\tilde{\pi}_\theta(a|s)}{\pi_\theta (a|s)}A_{\pi}(s,a)\right]-\lambda(D^{\rho_\pi}_{KL}(\pi,\tilde{\pi})-\delta)
148 | $$
149 | 
150 | we maximize $\mathcal{L}(\theta',\lambda)$ w.r.t. $\theta'$ and then update $\lambda\leftarrow\lambda+\alpha(D_{KL}(\pi,\tilde{\pi})-\delta)$. After that, we update value function, and repeat the process.
151 | 
152 | Intuition: raise $\lambda$ if constraint is violated too much, else lower it
153 | 
154 | #### Proximal policy optimization (PPO):
155 | 
156 | Clip the ratio of new policy and old policy
157 | 
158 | $$
159 | L_{\text{clip}}(\theta)=\mathbb{E}_{s,a\sim\pi_{\theta}}\left[\min\left(\frac{\pi_{\theta'}(a|s)}{\pi_{\theta}(a|s)}A_{\pi_{\theta}}(s,a),\text{clip}(\frac{\pi_{\theta'}(a|s)}{\pi_{\theta}(a|s)},1-\epsilon,1+\epsilon)A_{\pi_{\theta}}(s,a)\right)\right]
160 | $$
161 | 
162 | Adaptive KL Penalty Coefficient: $D_{KL}$ too small, reduce $\beta$, $D_{KL}$ too large, increase $\beta$
163 | 
164 | $$
165 | L_{\text{penalty}}(\theta)=\mathbb{E}_{s,a\sim\pi_{\theta}}\left[\frac{\pi_{\theta'}(a|s)}{\pi_{\theta}(a|s)}A_{\pi_{\theta}}(s,a)-\beta D_{KL}(\pi_{\theta},\pi_{\theta'})\right]
166 | $$
167 | 


--------------------------------------------------------------------------------
/lecture/notes-en/3-Actor-Critic.md:
--------------------------------------------------------------------------------
 1 | ## Actor-Critic
 2 | 
 3 | lower variance due to critic, but not unbiased if critic is not perfect
 4 | 
 5 | $$
 6 | \nabla_{\theta}J(\theta)=\mathbb{E}_{\tau\sim p_{\theta}}[\left(\sum_{t=1}^T\nabla_{\theta}\log\pi_{\theta}(a_t|s_t)A^{\pi}(s_{t},a_{t})\right)]
 7 | $$
 8 | 
 9 | just fit $V^{\pi}(s)$ to estimate $A_{\pi}(s,a)=Q_{\pi}(s,a)-V_{\pi}(s)=\mathbb{E}_{s'\sim\mathcal{T}(s,a)}[r(s,a)+\gamma V_{\pi}(s')-V_{\pi}(s)]$ with parameters $\phi$, with supervised learning
10 | 
11 | $$
12 | \mathcal{L}(\phi)=\mathbb{E}_{s\sim\pi_{\theta}}[(\hat{V}_\phi^\pi(s_i)-y_i)^2]
13 | $$
14 | 
15 | where $y_i$ is reward from state $s_i$ to the end of the trajectory
16 | 
17 | batch actor-critic algorithm:
18 | 
19 | 1. sample a batch of trajectories $\tau=\{s_i,a_i\}$ from $\pi_{\theta}(a|s)$
20 | 2. use the batch to update $\hat{V}_\phi^\pi(s)$ by $\nabla_\phi\mathcal{L}(\phi)$
21 | 3. evaluate $\hat{A}^{\pi}(s,a)=r(s,a)+\gamma\hat{V}_\phi^\pi(s')-\hat{V}_\phi^\pi(s)$
22 | 4. update $\theta$ by $\nabla_{\theta}J(\theta)$
23 | 
24 | To simplify calculation $y_i\approx r(s_i,a_i)+\gamma\hat{V}_\phi^\pi(s_{i+1})$
25 | 
26 | online actor-critic algorithm, update $\phi$ and $\theta$ after each sampled trajectory:
27 | 
28 | 1. sample a trajectory $\tau=\{s_i,a_i,s',r\}$ from $\pi_{\theta}(a|s)$
29 | 2. update $\hat{V}_\phi^\pi(s)$ using target $r+\gamma\hat{V}_\phi^\pi(s')$
30 | 3. evaluate $\hat{A}^{\pi}(s,a)=r(s,a)+\gamma\hat{V}_\phi^\pi(s')-\hat{V}_\phi^\pi(s)$
31 | 4. update $\theta$ by $\nabla_{\theta}J(\theta)=\nabla_{\theta}\log\pi_{\theta}(a|s)\hat{A}^{\pi}(s,a)$
32 | 
33 | off-policy actor-critic algorithm:
34 | 
35 | 1. take action $a\sim\pi_{\theta}(a|s)$, get $(s,a,s',r)$ store in $\mathcal{R}$
36 | 2. sample a batch of trajectories $\tau=\{s_i,a_i,s'_i,r_i\}$ from $\mathcal{R}$
37 | 3. update $\hat{Q}_\phi^\pi(s_i,a_i)$ using target $r+\gamma\hat{Q}_\phi^\pi(s'_i,a'_i)$ for each $s_i,a_i$, $a'_i\sim\pi_{\theta}(a'_i|s'_i)$
38 | 4. update $\theta$ by $\nabla_{\theta}J(\theta)=\frac{1}{N}\sum_{i=1}^N\nabla_{\theta}\log\pi_{\theta}(a_i^{\pi}|s_i)\hat{Q}_\phi^\pi(s_i,a_i^{\pi})$ where $a_i^{\pi}\sim\pi_\theta(a|s_i)$
39 | 
40 | Changes:
41 | 
42 | 1. The reason why we use $\hat{Q}_\phi^\pi(s_i,a_i)$ instead of $\hat{V}_\phi^\pi(s_i)$ is estimating $V$ using target $r+\gamma\hat{V}_\phi^\pi(s')$ is not accurate, since $s_i'$ is not the state after taking action $a_i^{\pi}$, but the state after taking action $a_i^{\pi_{\theta_{\text{old}}}}$
43 | 2. Advantage function here is $\hat{A}^{\pi}(s,a)=\hat{Q}_\phi^\pi(s_i,a_i)-\mathbb{E}_{a\sim\pi_{\theta}}[\hat{Q}_\phi^\pi(s_i,a)]$, but computing the expectation is hard, so we use $\hat{Q}_\phi^\pi(s_i,a_i)$ instead. (use more data to reduce the variance instead of baseline)
44 | 3. It's worth noting that step 4 here uses samples from $\mathcal{R}$, not from $\pi_{\theta}$. The policy we get is therefore an optimal solution on a broader distribution, not on the distribution of $\pi_{\theta}$, but we trust the generalization ability of the neural network
45 | 
46 | To keep estimator unbiased, pick **state-dependent baseline** $V_\phi^\pi(s)$
47 | 
48 | $$
49 | \nabla_{\theta}J(\theta)=\frac{1}{N}\sum_{i=1}^N\sum_{t=1}^T\nabla_{\theta}\log\pi_{\theta}(a_{i,t}|s_{i,t})\left(\left(\sum_{t'=t}^T\gamma^{t'-t}r(s_{i,t'},a_{i,t'})\right)-V_\phi^\pi(s_{i,t})\right)
50 | $$
51 | 
52 | or **action-dependent baseline** $Q_\phi^\pi(s,a)$, provide second term to keep unbiased and reduce variance (since we can sample from $\pi_{\theta}$ efficiently), we only need a few samples to compute the first term.
53 | 
54 | $$
55 | \nabla_{\theta}J(\theta)=\frac{1}{N}\sum_{i=1}^N\sum_{t=1}^T\nabla_{\theta}\log\pi_{\theta}(a_{i,t}|s_{i,t})\left(\hat{Q}_{i,t}-Q_\phi^\pi(s_{i,t},a_{i,t})\right)+\frac{1}{N}\sum_{i=1}^N\sum_{t=1}^T\nabla_{\theta}\mathbb{E}_{a\sim\pi_{\theta}(\cdot|s_{i,t})}[Q_\phi^\pi(s_{i,t},a)]
56 | $$
57 | 
58 | n-step returns: control bias and variance trade-off
59 | 
60 | $$
61 | \hat{A}_n^{\pi}(s_t,a_t)=\sum_{t'=t}^{t+n-1}\gamma^{t'-t}r(s_{t'},a_{t'})+\gamma^n V_\phi^\pi(s_{t+n})-V_\phi^\pi(s_t)
62 | $$
63 | 
64 | Generalized Advantage estimation: cut everywhere all at once, use weighted combination of n-step returns, pick weight $w_n=(1-\lambda)\lambda^{n-1}$
65 | 
66 | $$
67 | \hat{A}_{\text{GAE}}^{\pi}(s_t,a_t)=\sum_{n=1}^{\infty}w_n\hat{A}_n^{\pi}(s_t,a_t)=(1-\lambda)\sum_{n=1}^{\infty}\lambda^{n-1}\left(\sum_{t'=t}^{t+n-1}\gamma^{t'-t}r(s_{t'},a_{t'})+\gamma^n V_\phi^\pi(s_{t+n})-V_\phi^\pi(s_t)\right)\\
68 | 
69 | =(1-\lambda)\sum_{t'=t}^\infty\gamma^{t'-t}r(s_{t'},a_{t'})\sum_{n=t'-t+1}^\infty\lambda^{n-1}+\sum_{t'=t}^\infty(\gamma\lambda)^{t'-t}(\gamma V_\phi^\pi(s_{t'+1})-V_\phi^\pi(s_{t'}))\\
70 | 
71 | =\sum_{t'=t}^\infty(\gamma\lambda)^{t'-t}(r(s_{t'},a_{t'})+\gamma V_\phi^\pi(s_{t'+1})-V_\phi^\pi(s_{t'}))=\sum_{t'=t}^\infty(\gamma\lambda)^{t'-t}A^{\pi}(s_{t'},a_{t'})
72 | $$
73 | 
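74 | which has a nice interpretation: it is an exponentially weighted sum of one-step advantage estimates (TD errors), where $\lambda$ controls the bias-variance trade-off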
75 | 


--------------------------------------------------------------------------------
/lecture/notes-en/4-Value-Function-Methods.md:
--------------------------------------------------------------------------------
  1 | ## Value Function Methods
  2 | 
  3 | $A^{\pi}(s_t,a_t)$ measures how much better $a_t$ is than the average action at state $s_t$.
  4 | 
  5 | Forget policies and just do $\arg\max_a A^\pi(s,a)$ !
  6 | 
  7 | Bootstrap update: update value function using value function itself, then set $\pi'$ to be greedy policy w.r.t. $V^{\pi}_{\phi}$
  8 | 
  9 | $$
 10 | V^{\pi}_{\phi}(s)\leftarrow r(s,\pi(s))+\gamma\mathbb{E}_{s'\sim p(s'|s,\pi(s))}[V^{\pi}_{\phi}(s')]
 11 | $$
 12 | 
 13 | Even simpler, use Q-function
 14 | 
 15 | $$
 16 | Q^{\pi}(s,a)=r(s,a)+\gamma\mathbb{E}_{s'\sim p(s'|s,a)}[V^{\pi}(s')]
 17 | $$
 18 | 
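 19 | and set $V^{\pi}(s)=\mathbb{E}_{a\sim\pi(a|s)}[Q^{\pi}(s,a)]=\max_a Q^{\pi}(s,a)$ (the last equality holds because $\pi$ is the greedy policy). However, for large state spaces it's impossible to represent $V(s)$ using a big table, so instead we use a function approximator with parameters $\phi$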
 20 | 
 21 | fitted value iteration:
 22 | 
 23 | $$
 24 | y_i\leftarrow\max_{a_i} (r(s_i,a_i)+\gamma\mathbb{E}_{s'_i\sim p(s'|s_i,a_i)}V_{\phi}(s_i'))\\
 25 | 
 26 | \phi\leftarrow\arg\min_{\phi}\sum_i(y_i-V_{\phi}(s_i))^2
 27 | $$
 28 | 
 29 | The problem is that computing $y_i$ requires the outcome of every action from state $s_i$, i.e. many environment interactions (or a known model), so we estimate the $Q$ function instead
 30 | 
 31 | full fitted Q-iteration: $Q_{\phi}(s,a)=r(s,a)+\gamma\mathbb{E}_{s'\sim p(s'|s,a)}\max_{a'}Q_{\phi}(s',a')$, take max before expectation to improve sample efficiency
 32 | 
 33 | 1. collect a batch of transitions $\tau=\{s_i,a_i,s_i',r_i\}$ using some policy
 34 | 
 35 |    iterate $K$ times:
 36 | 2. set $y_i\leftarrow r(s_i,a_i)+\gamma\max_{a'_i}Q_{\phi}(s_i',a'_i)$
 37 | 3. set $\phi\leftarrow\arg\min_{\phi}\sum_i(y_i-Q_{\phi}(s_i,a_i))^2$
 38 | 
 39 | equivalent to find a function that satisfies Bellman equation
 40 | 
 41 | Advantage: off-policy, since $r(s_i,a_i)$ and $\max_{a'_i}Q_{\phi}(s_i',a'_i)$ is independent of the policy
 42 | 
 43 | Online Q-iteration:
 44 | 
 45 | 1. take action $a_i$ and observe $(s_i,a_i,s_i',r_i)$
 46 | 2. $y_i\leftarrow r(s_i,a_i)+\gamma\max_{a'_i}Q_{\phi}(s_i',a'_i)$
 47 | 3. $\phi\leftarrow\phi - \alpha\frac{\text{d}Q_{\phi}}{\text{d}\phi}(Q_{\phi}(s_i,a_i)-y_i)$
 48 | 
 49 | if always use $a_t=\arg\max_a Q_{\phi}(s_t,a)$, then it's easy to get stuck in local minimum, hence use $\epsilon$-greedy policy
 50 | 
 51 | $$
 52 | \pi(a|s)=\begin{cases}
 53 | 
 54 | 1-\epsilon & \text{if } a=\arg\max_a Q_{\phi}(s,a)\\
 55 | 
 56 | \frac{\epsilon}{|\mathcal{A}|-1} & \text{otherwise}
 57 | 
 58 | \end{cases}
 59 | $$
 60 | 
 61 | or Boltzmann policy
 62 | 
 63 | $$
 64 | \pi(a|s)=\frac{\exp(Q_{\phi}(s,a)/\tau)}{\sum_{a'}\exp(Q_{\phi}(s,a')/\tau)}
 65 | $$
 66 | 
 67 | but the target inside our objective $|Q_{\phi}(s,a)-r(s,a)-\gamma\max_{a'}Q_{\phi}(s',a')|^2$ is not fixed (it moves whenever $\phi$ changes), so we may not converge to the optimal solution
 68 | 
 69 | Another solution: use replay buffer, sample a batch of transitions from the buffer, then update $Q_{\phi}$
 70 | 
 71 | ### Value function Theory
 72 | 
 73 | define operator $\mathcal{B}$
 74 | 
 75 | $$
 76 | \mathcal{B}V=\max_a\left(r(s,a)+\gamma\mathbb{E}_{s'\sim p(s'|s,a)}[V(s')]\right)
 77 | $$
 78 | 
 79 | then $V^*$ is a fixed point of $\mathcal{B}$, which always exists and is unique, and $\mathcal{B}$ is a contraction mapping, which means
 80 | 
 81 | $$
 82 | ||\mathcal{B}V-\mathcal{B}V'||_\infty\leq\gamma||V-V'||_\infty
 83 | $$
 84 | 
 85 | choose $V'=V^*$, then we know $||\mathcal{B}V-V^*||_\infty\leq\gamma||V-V^*||_\infty$
 86 | 
 87 | value iteration: $V\leftarrow\mathcal{B}V$
 88 | 
 89 | fitted value iteration: $V\leftarrow\Pi\mathcal{B}V$
 90 | 
 91 | where $\Pi$ is a projection operator, $\Pi V=\arg\min_{V'\in\Omega}||V'-V||_2$, which projects $V$ to the space of function that can be represented by $\phi$
 92 | 
 93 | $\mathcal{B}$ is a contraction mapping w.r.t. $||\cdot||_\infty$, $\Pi$ is a contraction mapping w.r.t. $||\cdot||_2$, but $\Pi\mathcal{B}$ is not a contraction mapping w.r.t. any norm. It often doesn't converge to $V^*$ in practice
 94 | 
 95 | $Q\leftarrow\Pi\mathcal{B}Q$ is not a contraction mapping either, and $Q$-learning is not gradient descent, since we don't backpropagate through $y_i$
 96 | 
 97 | But we can make it work in practice !
 98 | 
 99 | ### Target Networks
100 | 
101 | Q-learning algorithm with buffer and target network:
102 | 
103 | 1. update $\phi'\leftarrow\tau\phi+(1-\tau)\phi'$
104 | 
105 |    iterate $N$ times:
106 | 2. collect $M$ transitions $\tau=\{s_i,a_i,s_i',r_i\}$ using some policy, add to $\mathcal{B}$
107 | 
108 |    iterate $K$ times:
109 | 3. sample a minibatch $\{s_i,a_i,s_i',r_i\}$ from $\mathcal{B}$
110 | 4. $\phi\leftarrow\phi-\alpha\sum_i\frac{\text{d}Q_{\phi}}{\text{d}\phi}(Q_{\phi}(s_i,a_i)-(r(s_i,a_i)+\gamma\max_{a'}Q_{\phi'}(s_i',a')))$
111 | 
112 | DQN: pick $N=1$, $K=1$
113 | 
114 | 1. take action $a_i$ and observe $(s_i,a_i,s_i',r_i)$, add it to $\mathcal{B}$
115 | 2. sample minibatch $\{s_i,a_i,s_i',r_i\}$ from $\mathcal{B}$
116 | 3. set $y_i=r_i+\gamma\max_{a'}Q_{\phi'}(s_i',a')$ using target network $Q_{\phi'}$
117 | 4. $\phi\leftarrow\phi-\alpha\frac{\text{d}Q_{\phi}}{\text{d}\phi}(Q_{\phi}(s_i,a_i)-y_i)$
118 | 5. update $\phi'\leftarrow\tau\phi+(1-\tau)\phi'$ ($\tau=0.001$ works well)
119 | 
120 | ![Alt Text](pic/Qlearning.png)
121 | 
122 | Online $Q$-learning: process 1, 2, 3 running at same speed
123 | 
124 | DQN: process 1, 3 at same speed, process 2 at slower speed
125 | 
126 | Fitted Q-iteration: process 3 faster than process 2, faster than process 1 (offline)
127 | 
128 | However, $Q$ is often overestimated, since $Q_{\phi'}(s',a')$ is noisy and taking the max over noisy estimates is biased upward, so we use double Q-learning: use the current network to select the action and the target network to evaluate it
129 | 
130 | $$
131 | y_i=r_i+\gamma Q_{\phi'}(s_i',\arg\max_{a'}Q_{\phi}(s_i',a'))
132 | $$
133 | 
134 | $Q$-learning with $N$-step returns: define target as
135 | 
136 | $$
137 | y_{j,t}=\sum_{t'=t}^{t+N-1}\gamma^{t'-t}r_{j,t'}+\gamma^N\max_{a_{j,t+N}}Q_{\phi'}(s_{j,t+N},a_{j,t+N})
138 | $$
139 | 
140 | less biased target values when $Q$ is inaccurate, faster learning in early stage
141 | 
142 | Only actually correct when learning on-policy, we need $s_{j,t'},a_{j,t'}$ to be sampled from $\pi_{\theta}$
143 | 
144 | ### Continuous Action Spaces
145 | 
146 | Ways to pick $a=\arg\max_a Q_{\phi}(s,a)$:
147 | 
148 | 1. gradient ascent on $Q_{\phi}(s,a)$ w.r.t. $a$ / choose the best of $N$ sampled actions
149 | 2. use Normalized Advantage Functions: $Q_{\phi}(s,a)=-\frac{1}{2}(a-\mu_{\phi}(s))^TP_\phi(s)(a-\mu_{\phi}(s))+V_{\phi}(s)$, then $\mu_{\phi}(s)=\arg\max_a Q_{\phi}(s,a)$
150 | 3. train another network $\mu_\theta(s)$ s.t. $\mu_\theta(s)=\arg\max_a Q_{\phi}(s,a)$, then new target $y_i=r_i+\gamma Q_{\phi'}(s_i',\mu_{\theta}(s_i'))$, update $\theta$ using $\nabla_{\theta}J(\theta)=\mathbb{E}(\nabla_\theta\mu_\theta(s)\nabla_a Q_\phi(s,\mu_\theta(s)))$
151 | 
152 | DDPG:
153 | 
154 | 1. take action $a_i$ and observe $(s_i,a_i,s_i',r_i)$, add it to $\mathcal{B}$
155 | 2. sample minibatch $\{s_i,a_i,s_i',r_i\}$ from $\mathcal{B}$
156 | 3. compute target $y_i=r_i+\gamma Q_{\phi'}(s_i',\mu_{\theta'}(s_i'))$
157 | 4. update $\phi\leftarrow\phi-\alpha\frac{\text{d}Q_{\phi}}{\text{d}\phi}(Q_{\phi}(s_i,a_i)-y_i)$
158 | 5. update $\theta\leftarrow\theta+\beta\frac{\text{d}\mu_{\theta}}{\text{d}\theta}\nabla_a Q_{\phi}(s_i,a_i)|_{a_i=\mu_{\theta}(s_i)}$
159 | 6. soft update $\phi'\leftarrow\tau\phi+(1-\tau)\phi'$, $\theta'\leftarrow\tau\theta+(1-\tau)\theta'$
160 | 
161 | Tips: Large replay buffer, start with high exploration, clip gradients, double Q-learning, run multiple random seeds
162 | 


--------------------------------------------------------------------------------
/lecture/notes-en/5-Model-Based-RL.md:
--------------------------------------------------------------------------------
  1 | ## Model-based RL
  2 | 
  3 | Learn the transition dynamics, then figure out how to choose actions
  4 | 
  5 | If we know the transition dynamics, how can we use it to optimize the policy ?
  6 | 
  7 | $$
  8 | a^*=\arg\max_a J(s,a)\quad s.t.\quad s_{t+1}=f(s_t,a_t)
  9 | $$
 10 | 
 11 | Discrete case: Monte Carlo Tree Search (MCTS), just the random version of dynamic programming
 12 | 
 13 | ### Trajectory optimization
 14 | 
 15 | we want to find a sequence of actions that maximize the reward, which is equivalent to solve
 16 | 
 17 | $$
 18 | \min_{u_1,\dots,u_T}\sum_{t=1}^T c(x_t,u_t)\quad s.t.\quad x_{t+1}=f(x_t,u_t)\\
 19 | 
 20 | =\min_{u_1,\dots,u_T}c(x_1,u_1)+c(f(x_1,u_1),u_2)+\dots+c(f(f(\dots),\dots),u_T)
 21 | $$
 22 | 
 23 | Linear case: LQR, suppose
 24 | 
 25 | $$
 26 | c(x_t,u_t)=\frac{1}{2}\begin{bmatrix}x_t\\u_t\end{bmatrix}^TC_t\begin{bmatrix}x_t\\u_t\end{bmatrix}+\begin{bmatrix}x_t\\ u_t \end{bmatrix}^T c_t\quad C_t=\begin{bmatrix}C_{x_t,x_t} & C_{x_t,u_t}\\ C_{u_t,x_t} & C_{u_t,u_t}\end{bmatrix},c_t=\begin{bmatrix}c_{x_t}\\ c_{u_t}\end{bmatrix}\\
 27 | 
 28 | f(x_t,u_t)=F_t\begin{bmatrix}x_t\\ u_t\end{bmatrix}+f_t
 29 | $$
 30 | 
 31 | If we solve for the last action $u_T$ only, we get $u_T=-C_{u_T,u_T}^{-1}(C_{u_T,x_T}x_T+c_{u_T})=K_Tx_T+k_T$, which is a linear function of $x_T$. Substituting it back gives the cost-to-go as a function of $x_T$ alone
 32 | 
 33 | $$
 34 | V(x_T)=const+\frac{1}{2}\begin{bmatrix}x_T\\ K_Tx_T+k_T\end{bmatrix}^TC_T\begin{bmatrix}x_T\\ K_Tx_T+k_T\end{bmatrix}+\begin{bmatrix}x_T\\ K_Tx_T+k_T\end{bmatrix}^Tc_T
 35 | $$
 36 | 
 37 | repeating the process backwards, we can get the optimal action sequence. We can also approximate a nonlinear system as a linear-quadratic system using Taylor expansion
 38 | 
 39 | ### Learning dynamics
 40 | 
 41 | 1. run base policy $\pi_0(a_t|s_t)$, collect data $\mathcal{D}=\{(s,a,s')_i\}$
 42 | 
 43 |    repeat $N$ times:
 44 | 2. learn dynamics model $f_{\theta}(s,a)$ by minimizing $\sum_i||f_{\theta}(s_i,a_i)-s'_i||^2$
 45 | 3. plan through $f(s,a)$ to choose actions
 46 | 4. execute the first planned action, observe resulting state $s'$
 47 | 5. add a batch of $(s,a,s')$ to $\mathcal{D}$
 48 | 
 49 | First take uncertain actions, then become more certain through learning. However, this is suboptimal, we only observe $s_1$ in open-loop case, we can use closed-loop control to improve the performance
 50 | 
 51 | $$
 52 | \pi=\arg\max_{\pi}\mathbb{E}_{\tau\sim p(\tau)}[\sum_t r(s_t,a_t)]
 53 | $$
 54 | 
 55 | where $p(\tau)=p(s_1)\prod_{t=1}^Tp(s_{t+1}|s_t,a_t)p(a_t|s_t)$, we just backpropagate directly into the policy
 56 | 
 57 | 1. run base policy $\pi_0(a_t|s_t)$, collect data $\mathcal{D}=\{(s,a,s')_i\}$
 58 | 2. learn dynamics model $f_{\theta}(s,a)$ by minimizing $\sum_i||f_{\theta}(s_i,a_i)-s'_i||^2$
 59 | 3. pick $s_i$ from $\mathcal{D}$, use $f_{\theta}(s,a)$ to make short rollouts from them
 60 | 4. use both real data and simulated data to improve $\pi_{\theta}$ with off-policy RL
 61 | 5. run $\pi_{\theta}$, collect data $(s,a,s')$ and add to $\mathcal{D}$, go to 2 until convergence
 62 | 
 63 | Problem: when optimizing, we neither optimize over the real data nor the simulated data, we optimize over the mixture of them, which is not guaranteed to converge
 64 | 
 65 | ### Dyna-Style algorithms
 66 | 
 67 | online Q-learning that performs model-free RL with a model
 68 | 
 69 | 1. given state $s$, pick action $a$ using exploration policy
 70 | 2. observe reward $r$ and next state $s'$, get transition $(s,a,s',r)$
 71 | 3. update model $\hat{p}(s'|s,a)$ and $\hat{r}(s,a)$ using $(s,a,s',r)$
 72 | 4. $Q(s,a)\leftarrow Q(s,a)+\alpha\mathbb{E}_{s',r\sim\mathcal{D}}(r+\gamma\max_{a'}Q(s',a')-Q(s,a))$
 73 | 
 74 |    repeat $K$ times:
 75 | 5. sample $(s,a)$ from $\mathcal{B}$, simulate $s'$ using $\hat{p}(s'|s,a)$ and $\hat{r}(s,a)$
 76 | 6. $Q(s,a)\leftarrow Q(s,a)+\alpha\mathbb{E}_{s'\sim\hat{p}(s'|s,a),r\sim\hat{r}(s,a)}(r+\gamma\max_{a'}Q(s',a')-Q(s,a))$
 77 | 
 78 | MBA, MVE, MBPO: similar as Dyna-Style algorithms, but totally use model to generate data for policy optimization
 79 | 
 80 | ### Successor Representations
 81 | 
 82 | Suppose the reward is action-independent, i.e. $r(s)$ depends only on the state (this is reasonable in some sense: define $\mathcal{S'}=\mathcal{S}\times\mathcal{A}$ and let the reward be returned one step later), then
 83 | 
 84 | $$
 85 | V^\pi(s_t)=\sum_{t'=t}^\infty\gamma^{t'-t}\mathbb{E}_{p(s_{t'}|s_t)}[r(s_{t'})]=\sum_s\sum_{t'=t}^\infty\gamma^{t'-t}p(s_{t'}=s|s_t)r(s)
 86 | $$
 87 | 
 88 | define $\mu_{s_i}^\pi(s_t)=(1-\gamma)\sum_{t'=t}^\infty\gamma^{t'-t}p(s_{t'}=s_i|s_t)$, which is the probability of reaching state $s_i$ from state $s_t$ under policy $\pi$, then
 89 | 
 90 | $$
 91 | V^\pi(s_t)=\frac{1}{1-\gamma}\sum_{s_i}\mu_{s_i}^\pi(s_t)r(s_i)=\frac{1}{1-\gamma}\mu^{\pi}(s_t)^T r
 92 | $$
 93 | 
 94 | this is called the successor representation. Similar to the Bellman equation, it satisfies the recursion
 95 | 
 96 | $$
 97 | \mu^{\pi}_{s_i}(s_t)=(1-\gamma)\delta(s_t=s_i)+\gamma\mathbb{E}_{s_{t+1}\sim p(s_{t+1}|s_t,a_t),a_t\sim\pi(a_t|s_t)}[\mu^{\pi}_{s_i}(s_{t+1})]
 98 | $$
 99 | 
100 | However, if number of states is much larger, it's hard to compute $\mu^{\pi}$, we can use Successor features: $\psi^{\pi}_j(s_t)=\sum_s\mu^{\pi}_{s}(s_t)\phi_j(s)$, then if $r(s)=\sum_j\phi_j(s)w_j=\phi(s)^T w$
101 | 
102 | $$
103 | V^\pi(s_t)=\frac{1}{1-\gamma}\mu^{\pi}(s_t)^T r=\frac{1}{1-\gamma}\psi^\pi(s_t)^T w
104 | $$
105 | 
106 | if number of features is much less than the number of states, learning them is much easier. From definition
107 | 
108 | $$
109 | \psi_j^{\pi}(s_t)=(1-\gamma)\phi_j(s_t)+\gamma\mathbb{E}_{s_{t+1}\sim p(s_{t+1}|s_t,a_t),a_t\sim\pi(a_t|s_t)}[\psi_j^{\pi}(s_{t+1})]
110 | $$
111 | 
112 | also construct a $Q$-function like version:
113 | 
114 | $$
115 | \psi^{\pi}_j(s_t,a_t)=\phi_j(s_t)+\gamma\mathbb{E}_{s_{t+1}\sim p(s_{t+1}|s_t,a_t),a_{t+1}\sim\pi(a_{t+1}|s_{t+1})}[\psi_j^{\pi}(s_{t+1},a_{t+1})]\\
116 | 
117 | Q^{\pi}(s_t,a_t)=\sum_j\psi_j^{\pi}(s_t,a_t)w_j=\psi^{\pi}(s_t,a_t)^T w
118 | $$
119 | 
120 | Using successor features to recover many $Q$-functions
121 | 
122 | 1. Train $\psi^{\pi_k}(s_t,a_t)$ for policies $\pi_k$ via Bellman equation
123 | 2. Get reward samples $\{s_i,r_i\}$, then solve $\arg\min_w\sum_i||\phi(s_i)^T w-r_i||^2$
124 | 3. Recover $Q^{\pi_k}(s_t,a_t)=\psi^{\pi_k}(s_t,a_t)^T w$
125 | 
126 | ### Uncertainty-Aware Neural Net Models:
127 | 
128 | Two types of uncertainty: aleatoric uncertainty (inherent noise in the system) and epistemic uncertainty (uncertainty in the model)
129 | 
130 | We can use output entropy to measure aleatoric uncertainty, use multiple models to measure epistemic uncertainty
131 | 
132 | Bayesian Neural Networks:
133 | 
134 | For standard neural networks, we optimize
135 | 
136 | $$
137 | L(D,w)=\sum_{x_i,y_i\in D}||y_i-f(x_i,w)||^2+\lambda\sum_d w_d^2\\
138 | 
139 | \log p(D,w)=\sum_{x_i,y_i\in D}\log\mathcal{N}(y_i|f(x_i,w),I)+\sum_d\log\mathcal{N}(w_d|0,\frac{1}{\sqrt{\lambda}})
140 | $$
141 | 
142 | which overfits the data when $p(D_{\text{data}})$ is different from $p(D_{\text{test}})$. However, Bayesian Inference computes the posterior distribution of $w$ given $D$ from prior distribution $p(w)$
143 | 
144 | $$
145 | p(w|D)=\frac{p(D|w)p(w)}{p(D)}=\frac{p(D|w)p(w)}{\int p(D|w')p(w')\text{d}w'}
146 | $$
147 | 
148 | using the posterior distribution, we can compute the predictive distribution
149 | 
150 | $$
151 | p(\hat{y}(x)|D)=\int p(\hat{y}(x)|w)p(w|D)\text{d}w=\mathbb{E}_{p(w|D)}[p(\hat{y}(x)|w)]
152 | $$
153 | 
154 | which means sampling networks from $p(w|D)$, we can quantify our uncertainty about these things, e.g., by computing their variance.
155 | 
156 | The difficulty is computing the integral, we can use Monte Carlo sampling
157 | 
158 | $$
159 | p(D)=\mathbb{E}_{p(w)}(p(D|w))\approx\frac{1}{N}\sum_{i=1}^Np(D|w_i)\\
160 | 
161 | p(\hat{y}(x)|D)=\mathbb{E}_{p(w)}[p(\hat{y}(x)|w)\frac{p(D|w)}{p(D)}]\approx\frac{1}{N}\sum_{i=1}^Np(\hat{y}(x)|w_i)\frac{p(D|w_i)}{p(D)}
162 | $$
163 | 
164 | but since $w$ is high-dimensional, sample with low variance is difficult. We can use Markov Chain Monte Carlo (MCMC) to sample: use the Markov chain to generate candidate samples and then stochastically accept them with probability
165 | 
166 | $$
167 | a(w'|w_t)=\min\left(1,\frac{p(w',D)}{p(w_t,D)}\right)
168 | $$
169 | 
170 | This is the Metropolis-Hastings algorithm; for the proposal distribution that generates the candidate $w'$ we can choose the symmetric $q(w'|w_t)=\mathcal{N}(w_t,\sigma^2)$
171 | 
172 | Variational Inference: approximate the posterior distribution with a simpler distribution $q_\phi(w)$, then minimize the KL-divergence between $q_\phi(w)$ and $p(w|D)$
173 | 
174 | $$
175 | D_{KL}(q_\phi(w)||p(w|D))=\mathbb{E}_{q_\phi(w)}[\log\frac{q_\phi(w)}{p(w|D)}]=\mathbb{E}_{q_\phi(w)}[\log q_\phi(w)-\log p(w,D)]+\log p(D)
176 | $$
177 | 
178 | we can minimize the KL-divergence by maximizing the evidence lower bound (ELBO)
179 | 
180 | $$
181 | \mathcal{L}(\phi)=\mathbb{E}_{q_\phi(w)}[\log p(w,D)-\log q_\phi(w)]=\mathbb{E}_{x,y\in D}\mathbb{E}_{q_\phi(w)}[\log p(\hat{y}(x)=y|w)+\log p(w)-\log q_\phi(w)]
182 | $$
183 | 
184 | we pick $\mathcal{N}(\mu,\sigma)$ as $q_\phi(w)$, then we can estimate the gradient of the ELBO using the reparameterization $w=\mu+\sigma\epsilon$ with $\epsilon\sim\mathcal{N}(0,I)$
185 | 
186 | $$
187 | \nabla_{\mu,\sigma}\mathcal{L}(\phi)=\frac{1}{N}\sum_{i=1}^N\nabla_{\mu,\sigma}\mathbb{E}_{w\sim q_{\mu,\sigma}}[\log p(\hat{y}(x_i)=y_i|w)+\log p(w)-\log q_{\mu,\sigma}(w)]\\
188 | 
189 | \approx\frac{1}{N}\sum_{i=1}^N\frac{1}{S}\sum_{j=1}^S \nabla_{\mu,\sigma}\left[\log p(\hat{y}(x_i)=y_i|\mu+\sigma\epsilon_j)+\log p(\mu+\sigma\epsilon_j)-\log q_{\mu,\sigma}(\mu+\sigma\epsilon_j)\right]
190 | $$
191 | 
192 | after computing $\phi^*$, we compute $p(\hat{y}(x)|D)$ by $\mathbb{E}_{q_{\phi^*}(w)}[p(\hat{y}(x)|w)]$
193 | 
194 | ### Latent Space Models
195 | 
196 | Observation model: $p(o_t|s_t)$, high-dimensional, not dynamic
197 | 
198 | dynamics model: $p(s_{t+1}|s_t,a_t)$, low-dimensional, dynamic
199 | 
200 | reward model: $p(r_t|s_t,a_t)$
201 | 
202 | To build a latent space model
203 | 
204 | $$
205 | \max_\phi\frac{1}{N}\sum_{i=1}^N\sum_{t=1}^T\mathbb{E}_{(s_t,s_{t+1})\sim p(s_t,s_{t+1}|o_{1:T},a_{1:T})}[\log p_\phi(s_{t+1,i}|s_{t,i},a_{t,i})+\log p_\phi(o_{t,i}|s_{t,i})]
208 | $$
209 | 
210 | learn approximate posterior $q_\psi(s_{t}|o_{1:t},a_{1:t})$ as encoder ($q_\psi(s_t,s_{t+1}|o_{1:T},a_{1:T})$ is most accurate but hard to learn, $q_\psi(s_t|o_t)$ is simplest but less accurate)
211 | 
212 | For simplicity, consider single-step encoder $q_\psi(s_t|o_t)$, our goal is
213 | 
214 | $$
215 | \max_{\phi,\psi}\frac{1}{N}\sum_{i=1}^N\sum_{t=1}^T\mathbb{E}_{s_t\sim q_\psi(s_t|o_t),s_{t+1}\sim q_\psi(s_{t+1}|o_{t+1})}[\log p_\phi(s_{t+1,i}|s_{t,i},a_{t,i})+\log p_\phi(o_{t,i}|s_{t,i})]
216 | $$
217 | 
218 | if $q_\psi(s_t|o_t)$ is deterministic, which means $q_\psi(s_t|o_t)=\delta(s_t=g_\psi(o_t))$
219 | 
220 | $$
221 | \max_{\phi,\psi}\frac{1}{N}\sum_{i=1}^N\sum_{t=1}^T[\log p_\phi(g_\psi(o_{t+1,i})|g_\psi(o_{t,i}),a_{t,i})+\log p_\phi(o_{t,i}|g_\psi(o_{t,i}))]
222 | $$
223 | 
224 | backpropagate to train, we can also add reward model to the objective
225 | 
226 | $$
227 | \max_{\phi,\psi}\frac{1}{N}\sum_{i=1}^N\sum_{t=1}^T[\log p_\phi(g_\psi(o_{t+1,i})|g_\psi(o_{t,i}),a_{t,i})+\log p_\phi(o_{t,i}|g_\psi(o_{t,i}))+\log p_\phi(r_{t,i}|g_\psi(o_{t,i}),a_{t,i})]
228 | $$
229 | 
230 | learn from $p(o_{t+1}|o_t,a_t)$ directly is also a good choice
231 | 


--------------------------------------------------------------------------------
/lecture/notes-en/6-Exploration.md:
--------------------------------------------------------------------------------
  1 | ## Exploration
  2 | 
  3 | Regret: $\text{Reg}(T)=TE(r(a^*))-\sum_{t=1}^T r(a_t)$, where $a^*$ is the optimal action
  4 | 
  5 | ### Optimistic Exploration
  6 | 
  7 | use upper confidence bound (UCB) to choose actions
  8 | 
  9 | $$
 10 | a_t=\arg\max_a\left(Q(s_t,a)+c\sqrt{\frac{\log t}{N(s_t,a)}}\right)
 11 | $$
 12 | 
 13 | where $N(s_t,a)$ is the number of times action $a$ has been selected in state $s_t$. The intuition is that we should try actions that we are uncertain about or have high $Q$-values.
 14 | 
 15 | Trouble with counts is in high-dimensional continuous action spaces, we never visit the same state-action pair twice.
 16 | 
 17 | Idea: fit density model $p_\theta(s)$, when observing a new state $s_i$, fit new model $p_{\theta'}(s)$ using $\mathcal{D}\cup\{s_i\}$, then we get pseudo-counts
 18 | 
 19 | $$
 20 | p_{\theta}(s_i)=\frac{\hat{N}(s_i)}{\hat{n}}\quad p_{\theta'}(s_i)=\frac{\hat{N}(s_i)+1}{\hat{n}+1}\\
 21 | 
 22 | \hat{N}(s_i)=\frac{1-p_{\theta'}(s_i)}{p_{\theta'}(s_i)-p_\theta(s_i)}p_\theta(s_i)
 23 | $$
 24 | 
 25 | another way: counting with hashes: compress $s$ into a $k$-bit code via $\phi(s)$, use $N(\phi(s))$ to count the number of times we have visited state $s$. Improve by learning a compression
 26 | 
 27 | ### Posterior Sampling
 28 | 
 29 | assume the reward of each action is Bernoulli with an unknown mean $x$, and maintain a belief over $x$ given by a Beta distribution
 30 | 
 31 | $$
 32 | f(x;\alpha,\beta)=\frac{x^{\alpha-1}(1-x)^{\beta-1}}{B(\alpha,\beta)}
 33 | $$
 34 | 
 35 | update $\alpha$ and $\beta$ using the reward, $\alpha_k\leftarrow\alpha_k+r_t,\beta_k\leftarrow\beta_k+1-r_t$ if action $k$ is selected
 36 | 
 37 | 1. sample $Q$ from $p(Q)$
 38 | 2. act according to $Q$ for one episode
 39 | 3. update $p(Q)$
 40 | 
 41 | how to represent a distribution over functions? Sample from dataset, train ensemble of models, then choose randomly. To avoid training $N$ models, we can share the model except the last layer
 42 | 
 43 | ### Information Gain ?
 44 | 
 45 | define $\mathcal{H}(\hat{p}(z))$ be the current entropy of $z$ estimate, $\mathcal{H}(\hat{p}(z|y))$ be the entropy of $z$ after observing $y$, then the information gain is
 46 | 
 47 | $$
 48 | IG(z,y|a)=\mathbb{E}_y[\mathcal{H}(\hat{p}(z))-\mathcal{H}(\hat{p}(z|y))|a]
 49 | $$
 50 | 
 51 | which defines how much we learn about $z$ from taking action $a$ after observing $y$. Also, define expected suboptimality of $a$ as $\delta(a)=\mathbb{E}[r(a^*)-r(a)]$, choose $a$ according to
 52 | 
 53 | $$
 54 | \arg\min_a\frac{\delta^2(a)}{IG(z,y|a)}
 55 | $$
 56 | 
 57 | how to pick $y$ ? We can use variational inference:
 58 | 
 59 | IG can be written as $D_{KL}(p(z|y)||p(z))$, we want to learn about transitions $p_\theta(s_{t+1}|s_t,a_t)$, so pick $z=\theta,y=h,a=(s_t,a_t,s_{t+1})$, where $h$ is history of all prior transitions
 60 | 
 61 | $$
 62 | IG=D_{KL}(p(\theta|h,s_t,a_t,s_{t+1})||p(\theta|h))
 63 | $$
 64 | 
 65 | Intuition: a transition is informative if it changes our beliefs about the transition model. Use variational inference to approximate the posterior distribution $q(\theta|\phi)\approx p(\theta|h)$, e.g. a product of independent Gaussian parameter distributions
 66 | 
 67 | $$
 68 | \min_{\phi} D_{KL}(q(\theta|\phi)||p(h|\theta)p(\theta))
 69 | $$
 70 | 
 71 | given a new transition $(s_t,a_t,s_{t+1})$, update $\phi$ to get $\phi'$, then
 72 | 
 73 | $$
 74 | IG=D_{KL}(q(\theta|\phi')||q(\theta|\phi))
 75 | $$
 76 | 
 77 | ### Learn without Rewards
 78 | 
 79 | Define mutual information
 80 | 
 81 | $$
 82 | \mathcal{I}(x;y)=D_{KL}(p(x,y)||p(x)p(y))=\mathcal{H}(p(y))-\mathcal{H}(p(y|x))
 83 | $$
 84 | 
 85 | define $z$ as vectors in the latent space, $x$ as the current state, we can learn without any rewards
 86 | 
 87 | 1. Propose goal: $z_g\sim p(z),x_g\sim p_\theta(x_g|z_g)$
 88 | 2. Attempt to reach goal using $\pi(a|x,x_g)$, reach $\hat{x}$
 89 | 3. use data to update $\pi$, $p_\theta(x_g|z_g),q_\phi(z_g|x_g)$
 90 | 
 91 | To get diverse goals, update $p_\theta,q_\phi$ using
 92 | 
 93 | $$
 94 | \theta,\phi=\arg\max_{\theta,\phi}\mathbb{E}(w(\hat{x})\log p(\hat{x}))
 95 | $$
 96 | 
 97 | where $w(\hat{x})=p_\theta(\hat{x})^\alpha$, the key result is for any $\alpha\in[-1,0)$, $\mathcal{H}(p_\theta(x))$ increases
 98 | 
 99 | Update $\pi$: we want to train $\pi(a|S,G)$ to reach goal $G$. As $\pi$ gets better, final state $S$ gets close to $G$, which means $p(G|S)$ becomes more deterministic, $\mathcal{H}(p(G|S))$ decreases. To have a good exploration, we need to increase $\mathcal{H}(p(G))$, the objective is
100 | 
101 | $$
102 | \max\mathcal{H}(p(G))-\mathcal{H}(p(G|S))=\max\mathcal{I}(S;G)
103 | $$
104 | 
105 | ### Random Network Distillation
106 | 
107 | Three possible factors cause next-state prediction error:
108 | 
109 | 1. Prediction error is high where the predictor fails to generalize from previously seen examples. Novel experience then corresponds to high prediction error.
110 | 2. Prediction target is stochastic.
111 | 3. Information necessary for the prediction is missing, or the model class of predictors is too limited to fit the complexity of the target function.
112 | 
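113 | To explore, we want to emphasize the first factor. Fix a randomly initialized target network $f_\theta^*(s)$, then train another network $\hat{f}_\phi(s)$ to predict its output on visited states; the target is deterministic and lies in the predictor's model class, which removes factors 2 and 3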
114 | 
115 | $$
116 | \phi^*=\arg\min_\phi\mathbb{E}_{s,a,s'\sim\mathcal{D}}||\hat{f}_\phi(s)-f_\theta^*(s)||
117 | $$
118 | 
119 | for states we have already seen, $\hat{f}_\phi(s)$ is close to $f_\theta^*(s)$, for states we haven't seen, $||\hat{f}_\phi(s)-f_\theta^*(s)||$ is expected to be high
120 | 
121 | Add a bonus to the reward function $\tilde{r}(s)=r(s)+\lambda||\hat{f}_\phi(s)-f_\theta^*(s)||^2$, then train $\pi$ to maximize $E_{\pi}(\tilde{r}(s))$ using $Q$-learning
122 | 
123 | ### Others
124 | 
125 | Intrinsic motivation: add a reward bonus to the reward function $\tilde{r}(s,a)=r(s,a)-\log p_\pi(s)$.
126 | 
127 | Repeating update $\pi(a|s)$ to maximize $E_{\pi}(\tilde{r}(s,a))$, then update $p_\pi(s)$ to fit the new state distribution
128 | 
129 | State Marginal Matching: Suppose we want to learn $\pi(a|s)$ so as to minimize $D_{KL}(p_\pi(s)||p^*(s))$. In iteration $k$
130 | 
131 | 1. Learn $\pi^k(a|s)$ to maximize $\mathbb{E}_{\pi^k}(\tilde{r}(s,a))$, where $\tilde{r}(s,a)=\log p^*(s)-\log p_\pi(s)$
132 | 2. update $p_{\pi^k}(s)$ to fit all states seen so far
133 | 
134 | Finally, return the uniform mixture of the iterates $\pi^*(a|s)=\frac{1}{K}\sum_{k=1}^K\pi^k(a|s)$, which is a Nash equilibrium of game between $\pi$ and $p_\pi(s)$
135 | 
136 | Learning diverse skills: different skills should visit different state-space regions, we want to learn $\pi(a|s,z)$ where $z$ is task.
137 | 
138 | $$
139 | \pi(a|s,z)=\arg\max_\pi\sum_z\mathbb{E}_{s\sim\pi(s|z)}(r(s,z))
140 | $$
141 | 
142 | where $r(s,z)=\log p_\phi(z|s)-\log p(z)$, $p_\phi$ is a discriminator that tries to predict the task $z$ from state $s$. Iteratively update $\pi$ and $p_\phi$. It's equivalent to learn a policy that maximizes the mutual information $\mathcal{I}(s;z)$
143 | 


--------------------------------------------------------------------------------
/lecture/notes-en/7-Offline-RL.md:
--------------------------------------------------------------------------------
  1 | ## Offline RL
  2 | 
  3 | Offline RL problems:
  4 | 
  5 | off-policy evaluation(OPE): given $\mathcal{D}$, estimate $J(\pi)=\mathbb{E}_\pi(\sum_{t=1}^Tr(s_t,a_t))$
  6 | 
  7 | offline RL: given $\mathcal{D}$, learn best possible policy $\pi_\theta$
  8 | 
  9 | ### Batch RL
 10 | 
 11 | #### Via policy gradient
 12 | 
 13 | RL objective
 14 | 
 15 | $$
 16 | \max_\pi\sum_{t=0}^T\mathbb{E}_{s_t\sim d^\pi(s),a_t\sim\pi(a|s)}[\gamma^t r(s_t,a_t)]
 17 | $$
 18 | 
 19 | $d^\pi(s)$ is the state distribution chosen from data $\mathcal{D}$ under policy $\pi$, use importance sampling
 20 | 
 21 | $$
 22 | \nabla_\theta J(\theta)=\frac{1}{N}\sum_{i=1}^N\frac{\pi_\theta(\tau_i)}{\pi_\beta(\tau_i)}\sum_{t=0}^T\nabla_\theta\gamma^t\log\pi_\theta(a_{t,i}|s_{t,i})\hat{Q}(s_{i,t},a_{i,t})\\
 23 | 
 24 | \approx\frac{1}{N}\sum_{i=1}^N\sum_{t=0}^T\left(\prod_{t'=0}^{t-1}\frac{\pi_\theta(a_{t',i}|s_{t',i})}{\pi_\beta(a_{t',i}|s_{t',i})}\right)\nabla_\theta\log\pi_\theta(a_{t,i}|s_{t,i})\left(\prod_{t'=t}^{T}\frac{\pi_\theta(a_{t',i}|s_{t',i})}{\pi_\beta(a_{t',i}|s_{t',i})}\right)\hat{Q}(s_{i,t},a_{i,t})
 25 | $$
 26 | 
 27 | the first term accounts for difference in probability of landing in $s_{t,i}$, the second term accounts for having different actions in the future. If the difference between $\pi_\theta$ and $\pi_\beta$ is not too large, we can throw away first term
 28 | 
 29 | $$
 30 | \nabla_\theta J(\theta)\approx\frac{1}{N}\sum_{i=1}^N\sum_{t=0}^T\nabla_\theta\log\pi_\theta(a_{t,i}|s_{t,i})\left(\prod_{t'=t}^{T}\frac{\pi_\theta(a_{t',i}|s_{t',i})}{\pi_\beta(a_{t',i}|s_{t',i})}\right)\hat{Q}(s_{i,t},a_{i,t})
 31 | $$
 32 | 
 33 | we know $\hat{Q}(s_{i,t},a_{i,t})=\sum_{t'=t}^{t'=T}\gamma^{t'-t}r(s_{i,t'},a_{i,t'})$, and later action doesn't affect current reward, hence
 34 | 
 35 | $$
 36 | \nabla_\theta J(\theta)\approx\frac{1}{N}\sum_{i=1}^N\sum_{t=0}^T\nabla_\theta\log\pi_\theta(a_{t,i}|s_{t,i})\sum_{t'=t}^{T}\left(\prod_{t''=t}^{t'}\frac{\pi_\theta(a_{t'',i}|s_{t'',i})}{\pi_\beta(a_{t'',i}|s_{t'',i})}\right)\gamma^{t'-t}r(s_{i,t'},a_{i,t'})
 37 | $$
 38 | 
 39 | still exponentially exploding importance weights, must use value function estimation
 40 | 
 41 | Doubly Robust Estimator:
 42 | 
 43 | $$
 44 | V^{\pi_\theta}(s_t)\approx\sum_{t'=t}^{T}\left(\prod_{t''=t}^{t'}\frac{\pi_\theta(a_{t'',i}|s_{t'',i})}{\pi_\beta(a_{t'',i}|s_{t'',i})}\right)\gamma^{t'-t}r(s_{i,t'},a_{i,t'})=\sum_{t'=t}^T\left(\prod_{t''=t}^{t'}\rho_{t''}\right)\gamma^{t'-t}r(s_{i,t'},a_{i,t'})
 45 | $$
 46 | 
 47 | the recursive formula is
 48 | 
 49 | $$
 50 | V^{\pi_\theta}(s_t)=\rho_t(r(s_t,a_t)+\gamma V^{\pi_\theta}(s_{t+1}))
 51 | $$
 52 | 
 53 | use Doubly Robust Estimation:
 54 | 
 55 | $$
 56 | V_{DR}(s)=\hat{V}(s)+\rho(s,a)(r_{s,a}-\hat{Q}(s,a))=\mathbb{E}_{a\sim\pi_\beta(a|s)}[\rho(s,a)\hat{Q}(s,a)]+\rho(s,a)(r_{s,a}-\hat{Q}(s,a))
 57 | $$
 58 | 
 59 | where $\hat{V}(s)$ and $\hat{Q}(s,a)$ are learned models; if $\hat{Q}(s,a)$ and $\rho$ are independent, then $V_{DR}(s)$ is unbiased, with a smaller variance. Hence
 60 | 
 61 | $$
 62 | V_{DR}^{\pi_\theta}(s_t)=\hat{V}(s_t)+\rho_t(r(s_t,a_t)+\gamma V_{DR}^{\pi_\theta}(s_{t+1})-\hat{Q}(s_t,a_t))
 63 | $$
 64 | 
 65 | we can use this to learn $V_{DR}$, then estimate $\nabla_\theta J(\theta)$
 66 | 
 67 | Marginalized Importance Sampling: instead of using $\frac{\pi_\theta(\tau_i)}{\pi_\beta(\tau_i)}$, we estimate
 68 | 
 69 | $$
 70 | w(s,a)=\frac{d^{\pi_\theta}(s,a)}{d^{\pi_\beta}(s,a)}\qquad J(\theta)=\frac{1}{N}\sum_i w(s_i,a_i)r_i
 71 | $$
 72 | 
 73 | we can estimate $w(s,a)$ using
 74 | 
 75 | $$
 76 | d^{\pi_\beta}(s,a)w(s,a)=(1-\gamma)p_0(s)\pi_\theta(a|s)+\gamma\sum_{s',a'}\pi_\theta(a|s)p(s|s',a')d^{\pi_\beta}(s',a')w(s',a')
 77 | $$
 78 | 
 79 | the first term is probability of starting in $(s,a)$, the second part is the probability of reaching $s$ from $(s',a')$ and then taking action $a$. Solving $w(s,a)$ is equivalent to solving some fixed point
 80 | 
 81 | #### Via Linear Fitted Value Functions
 82 | 
 83 | suppose we have a linear model with feature matrix $\phi, |\mathcal{S}|\times K$, from previous discussion about `Successor Representations`, we know $r(s)=\phi(s)^T w$, then the reward model and transition model can be written as
 84 | 
 85 | $$
 86 | \phi w_r=r\quad\phi P_\phi=P^{\pi}\phi
 87 | $$
 88 | 
 89 | where $P^{\pi}$ is the transition matrix under policy $\pi$, $P_\phi$ is estimated feature-space transition matrix. Use least square
 90 | 
 91 | $$
 92 | w_r=(\phi^T\phi)^{-1}\phi^Tr\quad P_\phi=(\phi^T\phi)^{-1}\phi^TP^{\pi}\phi
 93 | $$
 94 | 
 95 | then we can estimate $V^{\pi}\approx\phi w_V$ where $V^\pi=r+\gamma P^{\pi}V^{\pi}$, so
 96 | 
 97 | $$
 98 | w_V=(\phi^T\phi-\gamma\phi^TP^{\pi}\phi)^{-1}\phi^Tr
 99 | $$
100 | 
101 | we can replace $\phi$ with a $|\mathcal{D}|\times K$ matrix, for each $\{s_i,a_i,r_i,s_i'\}$ in $\mathcal{D}$, we have $\phi_i=\phi(s_i),\phi_i'=\phi(s_i'),r_i=r(s_i,a_i)$, then
102 | 
103 | $$
104 | w_V=(\phi^T\phi-\gamma\phi^T\phi')^{-1}\phi^Tr
105 | $$
106 | 
107 | However, $\phi'=P^{\pi}\phi$, is sampling from $\pi$, this is not possible in offline RL. Solution: use feature matrix for both states and actions, $\phi, |\mathcal{S}||\mathcal{A}|\times K$, then
108 | 
109 | $$
110 | w_Q=(\phi^T\phi-\gamma\phi^T\phi')^{-1}\phi^Tr
111 | $$
112 | 
113 | where $\phi'_i=\phi(s_i',\pi(s_i'))$, our algorithm is
114 | 
115 | 1. compute $w_Q$ for $\pi_k$
116 | 2. $\pi_{k+1}(s)=\arg\max_a\phi(s,a)^T w_Q$
117 | 3. set $\phi'_i=\phi(s_i',\pi_{k+1}(s_i'))$, repeat until convergence
118 | 
119 | however, it doesn't work well in practice, due to action distributional shift
120 | 
121 | ### Policy Constraint Optimization
122 | 
123 | #### Advantage Weighted Actor Critic
124 | 
125 | One way to avoid distributional shift is to constrain $D_{KL}(\pi_{\theta}||\pi_{\beta})$, since
126 | 
127 | $$
128 | D_{KL}(\pi||\pi_\beta)=-\mathbb{E}_{\pi}(\log\pi_\beta(a|s))-\mathcal{H}(\pi)
129 | $$
130 | 
131 | use Lagrange multiplier, we can get
132 | 
133 | $$
134 | \theta=\arg\max_\theta\mathbb{E}_{s\sim\mathcal{D}}(\mathbb{E}_{a\sim\pi_\theta(a|s)}[A(s,a)+\lambda\log\pi_\beta(a|s)]+\lambda\mathcal{H}(\pi_\theta(a|s)))
135 | $$
136 | 
137 | solve
138 | 
139 | $$
140 | \pi^*(a|s)=\frac{1}{Z(s)}\pi_\beta(a|s)\exp(\frac{1}{\lambda}A(s,a))
141 | $$
142 | 
143 | we can approximate this via weighted maximum likelihood
144 | 
145 | $$
146 | \pi(a|s)=\arg\max_\pi\mathbb{E}_{(s,a)\sim\pi_\beta}[\log\pi(a|s)\frac{1}{Z(s)}\exp(\frac{1}{\lambda}A_\phi(s,a))]
147 | $$
148 | 
149 | we can iteratively update $\pi$ and $A_\phi$ until convergence.
150 | 
151 | #### Implicit Q-learning
152 | 
153 | if using $Q$-learning, we often overestimate $\max_a Q(s,a)$ with some unseen actions, so use four different functions: target $Q$ function $Q_{\hat{\theta}}$, $Q$ function $Q_\theta$, and value function $V_\phi$, and target value function $V_{\hat{\phi}}$. Using $V_\phi$ significantly reduces the overestimation of $Q$-values
154 | 
155 | Repeat: 1. update Value function w.r.t $L_V(\phi)=\mathbb{E}_{(s,a)\sim\mathcal{D}}[L_2^\tau(Q_\theta(s,a)-V_\phi(s))]$ where $L_2^\tau(x)=\begin{cases}\tau x^2 & x>0\\ (1-\tau)x^2 & \text{else}\end{cases}$
156 | 
157 | 2. update $Q$ function w.r.t $L_Q(\theta)=\mathbb{E}_{(s,a,s')\sim\mathcal{D}}[(Q_\theta(s,a)-(r(s,a)+\gamma V_{\hat{\phi}}(s')))^2]$
158 | 3. update target $Q$ function $Q_{\hat{\theta}}\leftarrow\alpha Q_{\hat{\theta}}+(1-\alpha)Q_\theta$
159 | 4. update target value function $V_{\hat{\phi}}\leftarrow\alpha V_{\hat{\phi}}+(1-\alpha)V_\phi$
160 | 
161 | end repeat
162 | 
163 | 5. Update policy using objective
164 | 
165 | $$
166 | L_\pi(\psi)=-\mathbb{E}_{(s,a)\sim\mathcal{D}}[\log\pi_\psi(a|s)\exp(\frac{1}{\lambda}(Q_\theta(s,a)-V_\phi(s)))]
167 | $$
168 | 
169 | when using big $\tau$, we are more likely to choose $\pi(a|s)=\delta(a=\arg\max_{a'\in\mathcal{A},\pi_\beta(a'|s)>0}Q(s,a'))$, without risking out-of-distribution actions.
170 | 
171 | ### Conservative Q-learning
172 | 
173 | use conservative Q-learning to avoid overestimation, we can use a conservative estimate of $Q$-value
174 | 
175 | $$
176 | \hat{Q}^{\pi}=\arg\min_Q\max_\mu\alpha\mathbb{E}_{s\sim D,a\sim\mu(a|s)}[Q(s,a)]+\mathbb{E}_{(s,a,s')\sim\mathcal{D}}[(Q(s,a)-(r(s,a)+\gamma\mathbb{E}_\pi[Q(s',a')]))^2]
177 | $$
178 | 
179 | where $\mu$ is a distribution over actions, $\alpha$ is a hyperparameter. The first term pushes down $Q$-values, the second term is the Bellman error. A better bound is
180 | 
181 | $$
182 | \hat{Q}^{\pi}=\arg\min_Q\max_\mu\mathcal{L}_{\text{CQL}}(Q,\mu)=\\
183 | 
184 | \arg\min_Q\max_\mu\alpha\mathbb{E}_{s\sim D,a\sim\mu(a|s)}[Q(s,a)]-\alpha\mathbb{E}_{(s,a)\sim\mathcal{D}}[Q(s,a)]+\mathbb{E}_{(s,a,s')\sim\mathcal{D}}[(Q(s,a)-(r(s,a)+\gamma\mathbb{E}_\pi[Q(s',a')]))^2]
185 | $$
186 | 
187 | Intuition: with term $\mathbb{E}_{s\sim D,a\sim\mu(a|s)}[Q(s,a)]-\mathbb{E}_{(s,a)\sim\mathcal{D}}[Q(s,a)]$, it's easy for us to decrease $Q$-values for actions that are not taken in the data, which also decrease the probability that $\mu$ will choose these actions
188 | 
189 | no longer guarantee $\hat{Q}^{\pi}(s,a)\leq Q^{\pi}(s,a)$ for all $(s,a)$, but guarantee $\mathbb{E}_{\pi(a|s)}(\hat{Q}^\pi(s,a))\leq\mathbb{E}_{\pi(a|s)}(Q^\pi(s,a))$ for all $s\in\mathcal{D}$. It's more stable
190 | 
191 | 1. Update $\hat{Q}^{\pi}$ using $\mathcal{D}$ w.r.t $\max_\mu\mathcal{L}_{\text{CQL}}(Q,\mu)$
192 | 2. Update policy $\pi_\theta$ using $\theta\leftarrow\theta+\alpha\nabla_\theta\sum_i\mathbb{E}_{a\sim\pi_\theta(a|s_i)}[\hat{Q}^\pi(s_i,a)]$ if actions are continuous, use $\pi(a|s)=\delta(a=\arg\max_{a'}\hat{Q}^\pi(s,a'))$ if actions are discrete
193 | 3. repeat until convergence
194 | 
195 | Improvement: add regularization term to $\mathcal{L}_{\text{CQL}}$
196 | 
197 | $$
198 | \mathcal{L}_{\text{CQL}}(Q,\mu)=\alpha\mathbb{E}_{s\sim D,a\sim\mu(a|s)}[Q(s,a)]-\alpha\mathbb{E}_{(s,a)\sim\mathcal{D}}[Q(s,a)]+\mathcal{R}(\mu)+\mathbb{E}_{(s,a,s')\sim\mathcal{D}}[(Q(s,a)-(r(s,a)+\gamma\mathbb{E}_\pi[Q(s',a')]))^2]
199 | $$
200 | 
201 | a common choice is $\mathcal{R}(\mu)=\beta\mathbb{E}_{s\sim\mathcal{D}}[\mathcal{H}(\mu(\cdot|s))]$
202 | 
203 | ### Model Based Offline RL
204 | 
205 | Since data is fixed, we must punish the policy for exploiting. $\tilde{r}(s,a)=r(s,a)-\lambda u(s,a)$, where $u(s,a)$ is the uncertainty penalty
206 | 
207 | Conservative Model-Based RL(COMBO):
208 | 
209 | $$
210 | \hat{Q}^{k+1}\leftarrow\arg\min_Q\beta\mathbb{E}_{s,a\sim\rho(s,a)}[Q(s,a)]-\beta\mathbb{E}_{(s,a)\sim\mathcal{D}}[Q(s,a)]+\frac{1}{2}\mathbb{E}_{(s,a,s')\sim\mathcal{D}}[(Q(s,a)-\hat{B}^\pi\hat{Q}^k(s,a))^2]
211 | $$
212 | 
213 | where $\hat{B}^\pi$ is the Bellman operator, $\rho(s,a)$ is state-action distribution from the model
214 | 
215 | Intuition: if the model produces something that looks clearly different from real data, it's easy for the $Q$-learning to make it look bad
216 | 
217 | Trajectory Transformer: train a joint state-action model $p_\beta(\tau)=p_\beta(s_1,a_1,...,s_T,a_T)$, then use a big expressive model (Transformer)
218 | 
219 | ![Alt Text](pic/Transformer.png)
220 | 
221 | use beam search w.r.t $\sum_t r(s_t,a_t)$ to get best trajectory
222 | 


--------------------------------------------------------------------------------
/lecture/notes-en/8-RL-Theory.md:
--------------------------------------------------------------------------------
  1 | ## RL Theory
  2 | 
  3 | RL Theory problems: If I use this algorithm with $N$ samples, $k$ iterations, how is the result ? If I use this exploration strategy, how high is my regret ?
  4 | 
  5 | ### Model based
  6 | 
  7 | Oracle exploration: for every $(s,a)$, sample $s'\sim P(s'|s,a)$ $N$ times, $\hat{P}(s'|s,a)=\frac{\# (s,a,s')}{N}$. Given a strategy $\pi$, use $\hat{P}$ to estimate $\hat{Q}^\pi$:
  8 | 
  9 | 1. how close is $\hat{Q}^\pi$ to $Q^\pi$ ? $||\hat{Q}^\pi(s,a)-Q^\pi(s,a)||_\infty\leq\epsilon$
 10 | 2. how close is $\hat{Q}^*$ if we learn it using $\hat{P}$ ? $||Q^*(s,a)-\hat{Q}^*(s,a)||_\infty\leq\epsilon$
 11 | 3. how good is the resulting policy $\hat{\pi}$ ? $||Q^*(s,a)-Q^{\hat{\pi}}(s,a)||_\infty\leq\epsilon$
 12 | 
 13 | Using Hoeffding's inequality, we can get the sample complexity
 14 | 
 15 | $$
 16 | ||\hat{P}(s'|s,a)-P(s'|s,a)||_1\leq c\sqrt{\frac{|S|\log(1/\delta)}{N}}
 17 | $$
 18 | 
 19 | with probability $1-\delta$
 20 | 
 21 | Let $P$ denote the transition matrix $P(s'|s,a)$ and $\Pi$ denote the policy matrix $\pi(a|s)$ (the distribution over actions in each state), hence
 22 | 
 23 | $$
 24 | Q^\pi=r+\gamma PV^\pi\\
 25 | 
 26 | V^\pi=\Pi Q^\pi
 27 | $$
 28 | 
 29 | hence denote $P^\pi=P\Pi$, and
 30 | 
 31 | $$
 32 | Q^\pi-\hat{Q}^\pi=(I-\gamma P^\pi)^{-1}r-(I-\gamma\hat{P}^\pi)^{-1}r\\
 33 | 
 34 | =\gamma(I-\gamma\hat{P}^\pi)^{-1}(P^\pi-\hat{P}^\pi)Q^\pi=\gamma(I-\gamma\hat{P}^\pi)^{-1}(P-\hat{P})V^\pi
 35 | $$
 36 | 
 37 | where $(I-\gamma\hat{P}^\pi)^{-1}$ is evaluation operator, $(P-\hat{P})$ is the estimation error, and $V^\pi$ is the true value
 38 | 
 39 | Lemma: given $P^\pi$ and any vector $v\in\mathbb{R}^{|S||A|}$, since $P^\pi$ is a stochastic matrix, we have
 40 | 
 41 | $$
 42 | ||(I-\gamma P^\pi)^{-1}v||_\infty\leq\frac{||v||_\infty}{1-\gamma}
 43 | $$
 44 | 
 45 | simplify above equation
 46 | 
 47 | $$
 48 | ||Q^\pi-\hat{Q}^\pi||_\infty\leq\frac{\gamma}{1-\gamma}||(P-\hat{P})V^\pi||_\infty\leq\frac{\gamma}{1-\gamma}\left(\max_{s,a}||P(\cdot|s,a)-\hat{P}(\cdot|s,a)||_1\right)||V^\pi||_\infty
 49 | $$
 50 | 
 51 | w.l.o.g suppose $R_{\max}=1$, then $||V^\pi||_\infty\leq\frac{1}{1-\gamma}$, use union bound
 52 | 
 53 | $$
 54 | \Pr\{\max_{s,a}||P(\cdot|s,a)-\hat{P}(\cdot|s,a)||_1\leq c\sqrt{\frac{|S|\log(1/\delta)}{N}}\}\geq1-|S||A|\delta
 55 | $$
 56 | 
 57 | hence with probability $1-\delta$
 58 | 
 59 | $$
 60 | ||Q^\pi-\hat{Q}^\pi||_\infty\leq c\frac{\gamma}{(1-\gamma)^2}\sqrt{\frac{|S|\log(|S||A|/\delta)}{N}}
 61 | $$
 62 | 
 63 | Note: this holds for any fixed policy $\pi$, since the uncertainty of $\hat{P}$ is independent of $\pi$
 64 | 
 65 | Return to the problems above
 66 | 
 67 | $$
 68 | ||Q^*(s,a)-\hat{Q}^*(s,a)||_\infty\leq ||\sup_{\pi}Q^\pi-\sup_{\pi}\hat{Q}^\pi||_\infty\leq\sup ||Q^\pi-\hat{Q}^\pi||_\infty\leq c\frac{\gamma}{(1-\gamma)^2}\sqrt{\frac{|S|\log(|S||A|/\delta)}{N}}\\
 69 | 
 70 | ||Q^*(s,a)-Q^{\hat{\pi}}(s,a)||_\infty\leq ||Q^*(s,a)-\hat{Q}^{\hat{\pi}}||_\infty+||\hat{Q}^{\hat{\pi}}-Q^{\hat{\pi}}||_\infty\leq2c\frac{\gamma}{(1-\gamma)^2}\sqrt{\frac{|S|\log(|S||A|/\delta)}{N}}
 71 | $$
 72 | 
 73 | ### Model Free
 74 | 
 75 | Analyzing fitted $Q$-iteration: define $TQ=r+\gamma P\max_a Q$ as the exact Bellman operator and $\hat{T}Q=\hat{r}+\gamma\hat{P}\max_a Q$ as its empirical (sampled) approximation
 76 | 
 77 | $$
 78 | \hat{Q}_{k+1}\leftarrow\arg\min_{\hat{Q}}||\hat{Q}-\hat{T}\hat{Q}_k||_\infty
 79 | $$
 80 | 
 81 | no convergence guarantee if using $||\cdot||_2$
 82 | 
 83 | Errors come from sampling error $T\neq\hat{T}$ and approximation error $\hat{Q}_{k+1}\neq\hat{T}\hat{Q}_k$
 84 | 
 85 | $$
 86 | |\hat{T}Q(s,a)-TQ(s,a)|=\left|\hat{r}(s,a)-r(s,a)+\gamma\left(\mathbb{E}_{\hat{P}(s'|s,a)}[\max_{a'}Q(s',a')]-\mathbb{E}_{P(s'|s,a)}[\max_{a'}Q(s',a')]\right)\right|\\
 87 | 
 88 | \leq |\hat{r}(s,a)-r(s,a)|+\gamma||\hat{P}(\cdot|s,a)-P(\cdot|s,a)||_1||Q||_\infty\\
 89 | 
 90 | \leq c_1 R_{\max}\sqrt{\frac{\log1/\delta}{2N}}+c_2 ||Q||_\infty\sqrt{\frac{\log1/\delta}{N}}
 91 | $$
 92 | 
 93 | Obviously, $||Q||_\infty=O(\frac{1}{1-\gamma}R_{\max})$. Using union bound
 94 | 
 95 | $$
 96 | \Pr\left\{||\hat{T}Q-TQ||_\infty\leq c\frac{R_{\max}}{1-\gamma}\sqrt{\frac{\log |S||A|/\delta}{N}} \right\} \geq1-\delta
 97 | $$
 98 | 
 99 | Approximation error
100 | 
101 | $$
102 | ||\hat{Q}_{k}-Q^*||_\infty\leq ||\hat{Q}_{k}-T\hat{Q}_{k-1}||_\infty+||T\hat{Q}_{k-1}-TQ^*||_\infty\\
103 | 
104 | \leq\epsilon_{k-1}+\gamma ||\hat{Q}_{k-1}-Q^*||_\infty
105 | $$
106 | 
107 | using fact that $T$ is a $\gamma$-contraction mapping, then
108 | 
109 | $$
110 | \lim_{k\to\infty}||\hat{Q}_{k}-Q^*||_\infty\leq\lim_{k\to\infty}\left(\sum_{i=0}^{k-1}\gamma^i\epsilon_{k-i-1}+\gamma^k||\hat{Q}_0-Q^*||_\infty\right)\leq\frac{1}{1-\gamma}\max_k\epsilon_k
111 | $$
112 | 
113 | $\epsilon_k$ measures how much $\hat{Q}_{k+1}$ deviates from $T\hat{Q}_k$
114 | 
115 | $$
116 | ||\hat{Q}_{k+1}-T\hat{Q}_k||_\infty\leq ||\hat{Q}_{k+1}-\hat{T}\hat{Q}_k||_\infty+||\hat{T}\hat{Q}_k-T\hat{Q}_k||_\infty
117 | $$
118 | 
119 | first term depends on learning process, second term can be estimated using previous result
120 | 


--------------------------------------------------------------------------------
/lecture/notes-en/9-Generative-Model.md:
--------------------------------------------------------------------------------
 1 | ## Generative Model
 2 | 
 3 | Latent variable models: generate using $p_\theta(x|z)\sim\mathcal{N}(f(z;\theta),I)$
 4 | 
 5 | $p(x)=\int p(x|z)p(z)dz$, where $p(z)$ is a prior. Using maximum likelihood
 6 | 
 7 | $$
 8 | \theta\leftarrow\arg\max_\theta\sum_i\log p_\theta(x_i)=\arg\max_\theta\sum_i\log\int p_\theta(x_i|z)p(z)dz
 9 | $$
10 | 
11 | however, the integral is intractable, one way to estimate is
12 | 
13 | $$
14 | \theta\leftarrow\arg\max_\theta\sum_i\mathbb{E}_{z\sim p(z|x_i)}\left[\log p_\theta(x_i,z)\right]
15 | $$
16 | 
17 | intuition: we choose a $z$ with highest probability given $x_i$, then use $z$ to estimate the integral. However, $z$ is high-dimensional, we sample from $p(z|x_i)$
18 | 
19 | Left problem: how to sample from $p(z|x_i)$ ?
20 | 
21 | Variational Inference:
22 | 
23 | $$
24 | \log p(x_i)=\log\int p_\theta(x_i|z)p(z)dz=\log\mathbb{E}_{z\sim q_i(z)}\left[\frac{p_\theta(x_i|z)p(z)}{q_i(z)}\right]\\
25 | 
26 | =\mathbb{E}_{z\sim q_i(z)}\left[\log p_\theta(x_i|z)+\log p(z)\right]+\mathcal{H}(q_i)+D_{KL}(q_i(z)||p_\theta(z|x_i))\\
27 | 
28 | =\mathcal{L}(p_\theta, q_i)+D_{KL}(q_i(z)||p_\theta(z|x_i))
29 | $$
30 | 
31 | we want to use $\mathbb{E}_{z\sim q_i(z)}\log\left[\frac{p_\theta(x_i|z)p(z)}{q_i(z)}\right]$ to estimate $\log p(x_i)$, the difference is KL-divergence. We want to minimize the difference
32 | 
33 | Minimizing $D_{KL}(q_i(z)||p(z|x_i))$ is equivalent to maximizing $\mathbb{E}_{z\sim q_i(z)}\log\left[\frac{p(x_i|z)p(z)}{q_i(z)}\right]$
34 | 
35 | Choose $q_i(z)$ to be a Gaussian $\mathcal{N}(\mu_i,\Sigma_i)$, our algorithm work as
36 | 
37 | 1. sample $z\sim q_i(z)$, update $\theta$ by $\nabla_\theta\mathcal{L}(p_\theta,q_i)\approx\nabla_\theta\log p_\theta(x_i|z)$
38 | 2. update $q_i$ to maximize $\mathcal{L}(p_\theta,q_i)$
39 | 
40 | Amortized Variational Inference: Instead of learning $q_i(z)$ for each $x_i$, we learn $q_\phi(z|x)=\mathcal{N}(\mu_\phi(x),\Sigma_\phi(x))$ to approximate $p(z|x)$
41 | 
42 | Representation Learning:
43 | 
44 | 1. Train VAE on states in replay buffer $\mathcal{R}$
45 | 2. Run RL, using $z$ as the state instead of $s$
46 | 
47 | Conditional Models: use decoder $p_\theta(x|y,z)$ and encoder $q_\phi(z|x,y)$, where $y_i$ is the label
48 | 
49 | $$
50 | \mathcal{L}_i=\mathbb{E}_{z\sim q_\phi(z|x_i,y_i)}[\log p_\theta(x_i|y_i,z)+\log p(z|y_i)]+\mathcal{H}(q_\phi(z|x_i,y_i))
51 | $$
52 | 


--------------------------------------------------------------------------------
/lecture/notes-en/pic/LM.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hidden-Hyperparameter/RL-notes/2329d7ae317c27c5f321bf2a7d9b352997c71a54/lecture/notes-en/pic/LM.png


--------------------------------------------------------------------------------
/lecture/notes-en/pic/Qlearning.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hidden-Hyperparameter/RL-notes/2329d7ae317c27c5f321bf2a7d9b352997c71a54/lecture/notes-en/pic/Qlearning.png


--------------------------------------------------------------------------------
/lecture/notes-en/pic/RNN.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hidden-Hyperparameter/RL-notes/2329d7ae317c27c5f321bf2a7d9b352997c71a54/lecture/notes-en/pic/RNN.png


--------------------------------------------------------------------------------
/lecture/notes-en/pic/Transformer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hidden-Hyperparameter/RL-notes/2329d7ae317c27c5f321bf2a7d9b352997c71a54/lecture/notes-en/pic/Transformer.png


--------------------------------------------------------------------------------
/lecture/notes-en/pic/control.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hidden-Hyperparameter/RL-notes/2329d7ae317c27c5f321bf2a7d9b352997c71a54/lecture/notes-en/pic/control.png


--------------------------------------------------------------------------------
/lecture/notes-zh/0-preliminaries.md:
--------------------------------------------------------------------------------
  1 | # General
  2 | 
  3 | 本仓库是我学习Berkeley的课程[CS285(DRL)](http://rail.eecs.berkeley.edu/deeprlcourse/)时候记录总结的自学笔记。
  4 | 
  5 | 在全部笔记完工一个月后，我再次阅读之前的笔记时，发现存在着很多地方逻辑转折不是十分通顺。在修改原来笔记的同时，我也正在尝试建立一个更加自然的体系，这一体系放在[tutorials](/tutorials)文件夹下。如果感兴趣也可以参考一下。
  6 | 
  7 | 我的目标是，建立两个平行（虽然内容可能有很大公共部分，但是阅读起来完全独立）的两个体系：
  8 | - （很早就建立的）[笔记](/lecture/notes)
  9 |     - 特点是紧贴CS285课程的内容，以CS285的一讲为一个单位；
 10 | - 最近实验性质，处于开发阶段的[tutorial](/tutorials)
 11 |     - 特点是更加自然的体系，更加适合初学者理解。同时，包含大量的代码实现。
 12 | 
 13 | # Tutorial
 14 | 
 15 | Tutorial由若干个ipynb组成。同时，这里的介绍顺序按照逻辑重新整理，因此可能和CS285的课程顺序有所不同。如果阅读Tutorial，则不需要阅读本文件最下面的[Table of Contents](#table-of-contents)和之后的内容。
 16 | 
 17 | # Notes
 18 | 
 19 | “笔记”部分基本可以说包含了PPT和讲课上的大部分内容，也成一个体系。我个人所希望的也是，经过我们大家一起合作，仅参考“笔记”就可以完成CS285的学习，比听课更加快速，又比单纯看PPT更为轻松。（我自己听了课，因此教授讲的而PPT上面没有的一些内容我也记录上来了）
 20 | 
 21 | 笔记文件可以在[这里](/lecture/notes)找到（为了最好的阅读体验，建议下载仓库并用VS code阅读。如果在Github上面阅读，请不要点击这个link，而是点击下面Table of Contents的link）。
 22 | 
 23 | 本笔记的架构：每一讲有一个单独的笔记。此外，[takeaway](/lecture/notes/takeaway.md)总结了所有讲的要点，类似于一个“cheatsheet”。在每一讲的最后，可能会有一个"Reference Papers"板块，它介绍了一些在笔记内容之外的，更加advanced的话题的参考来源。每篇论文后有一个简述，但因为我~~事实上都没读过~~不太了解，所以不一定靠谱，仅供参考。
 24 | 
 25 | 此外，如果有些笔记引用了作业的内容，你可以在[这里](https://github.com/Hidden-Hyperparameter/CS285_homework)找到。这就是CS285这一课程本身的作业。
 26 | 
 27 | # For Contributors
 28 | 
 29 | 首先，欢迎加入tutorial的开发！
 30 | 
 31 | 本笔记基本由我一个人完成，因此肯定有大量的错误。同时，我只是RL领域的初学者，还未入门，因此对很多算法的个人理解也很可能有偏差。如果你发现了错误，或者有任何建议，欢迎提出issue或者PR！
 32 | 
 33 | 如果这个仓库逐渐有了公开的性质（而不是只有我自己看），可能会考虑推出一个英文版本。（Not Implemented）
 34 | 
 35 | # Table Of Contents
 36 | 
 37 | 这里是笔记的目录，和Tutorial无关。
 38 | 
 39 | Takeaway/Cheatsheet: [Here](/lecture/notes/takeaway.md)
 40 | 
 41 | 0. Preliminaries (just this file)
 42 | 1. What is RL (not implemented)
 43 | 2. Imitation Learning [Here](/lecture/notes/2-imitation_learning.md)
 44 | 3. Pytorch Basics [(Not complete)](/lecture/notes/3-pytorch.md)
 45 | 4. Introduction to RL [Here](/lecture/notes/4-intro2RL.md)
 46 | 5. Policy Gradients [Here](/lecture/notes/5-policy_grad.md)
 47 | 6. Actor Critic Algorithms [Here](/lecture/notes/6-actor-critic.md)
 48 | 7. Value Function Methods [Here](/lecture/notes/7-value_func.md)
 49 | 8. Q Learning (advanced) [Here](/lecture/notes/8-Q_learning.md)
 50 | 9. Advanced Policy Gradients [Here](/lecture/notes/9-advanced_policy_grad.md)
 51 | 10. Optimal Control and Planning [Here](/lecture/notes/10-optimal_control_planning.md)
 52 | 11. Model-based RL [Here](/lecture/notes/11-model-based.md)
 53 | 12. Model-based RL with a Policy [Here](/lecture/notes/12-model-based-with-policy.md)
 54 | 13. Exploration (1) [Here](/lecture/notes/13-exploration_1.md)
 55 | 14. Exploration (2) [Here](/lecture/notes/14-exploration_2.md)
 56 | 15. Offline RL (1) [Here](/lecture/notes/15-offline-RL_1.md)
 57 | 16. Offline RL (2) [Here](/lecture/notes/16-offline-RL_2.md)
 58 | 17. RL Theory [Here](/lecture/notes/17-RL-theory.md)
 59 | 18. Variational AutoEncoder [Here](/lecture/notes/18-vae.md)
 60 | 19. Soft Optimality [Here](/lecture/notes/19-soft-optimality.md)
 61 | 20. Inverse RL [Here](/lecture/notes/20-IRL.md)
 62 | 21. RL and Language Models [Here](/lecture/notes/21-RL-LM.md)
 63 | 22. Transfer Learning and Meta Learning [Here](/lecture/notes/22-transfer-meta.md)
 64 | 23. Challenges & Open Problems [Here](/lecture/notes/23-challenge.md)
 65 | 
 66 | 
 67 | # Preliminaries
 68 | 
 69 | 学习RL，我们需要什么？
 70 | 
 71 | 1. 一些DL的基本知识。 这个介绍DL的[repo](https://github.com/szjzc2018/dl)是一个非常好的仓库，欢迎给它点star。
 72 | 
 73 | 2. 做好**记号混乱**的心理准备。如果学习过DL，就应该发现以下的场景是十分常见的：
 74 |     - 一个符号有多个完全不同的含义；
 75 |     - 多个符号代表完全相同的含义；
 76 |     - 前一页的符号在后一页变了；
 77 |     - 期待值不说明从哪个分布采样；
 78 |     - 多个不同的概率分布全部用记号 $p$ 表示；
 79 |     - 还有最常见的：公式存在重要的typo。比如，在某些地方， $p^\star$ 和 $p_\star$ 代表两个完全不同的意思，但又在某一处一不小心写反了。
 80 | 
 81 | 我们会**尽量避免**这些现象发生，但必须先打好这一预防针。这也不是任何人的问题——很多试图把这些记号变得清楚整洁的尝试都会大概率因为发现公式变得长得离谱以至于令人无法忍受而告终。所以，在混乱的记号中理解它们的“深意”，这一能力也是RL即将为我们培训的，十分重要的技能之一:)
 82 | 
 83 | tl;dr: 接下来，是几个十分基础的介绍或问题，旨在介绍RL的基本概念，建立起一个“形象”。如果您已经了解RL的基本任务，完全可以跳过。
 84 | 
 85 | # What is Reinforcement Learning?
 86 | 
 87 | > Reinforcement learning (RL) is an **interdisciplinary area** of **machine learning** and **optimal control** concerned with how an intelligent agent **ought to take actions** in a **dynamic environment** in order to **maximize the cumulative reward**. Reinforcement learning is one of three basic machine learning paradigms, alongside supervised learning and unsupervised learning. (Wikipedia)
 88 | 
 89 | Wikipedia给出的对RL的定义十分直观。我们来总结一下，一个一般的RL问题由**agent**（或者称为**policy**）和环境**environment**构成：
 90 | - agent能做的是在某个给定的状态（**state**）下面给出操作（**action**）；
 91 | - 每作出一个action，环境会给出一个反应，也就是环境会进入下一个状态；同时，我们认为agent会得到一个环境赋予的**reward**；
 92 | - agent的目标是最大化无限轮这样的操作之后的**cumulative reward**（当然，实际中可能会在若干轮后停止）。这一reward通常由环境给出。
 93 | - 环境的特性是如何根据state,action给出下一个state和reward。这称为环境的动力学（**dynamics**）。
 94 | 
 95 | 我们为什么要定义这样的一个问题？当然是因为，这一抽象很好地描述了生活中的许多和人们所认同的“智能”相关的现象。
 96 | - 人们可以训练动物，使得它们做出一些特定的动作。如何训练呢？比如，小狗站起来，就喂它一把狗粮吃，否则就吓唬它一下；久而久之，小狗就会学会站起来。人们会说，“这个小狗真聪明！”——为什么？因为，它其实处理了一个很复杂的RL问题：
 97 |     - 它的action是十分复杂的，也许可以建模为大脑中成千上万神经元的刺激信号；
 98 |     - 它的dynamics更是复杂。从神经元如何将信号传输到一根根神经上，进一步控制肌肉的收缩；这再进一步和复杂的物理世界交互，由弹力、摩擦力、重力等等控制的一个极度耦合、非线性的偏微分方程进行时间演化；最终，它得到的reward只是0或1；
 99 |     - 在失败一次之后它不会放弃，因为它最大化的是累积的reward，它学会利用把历史的失败教训存在它的“参数”里，作出更好的决策，最终达到目标。
100 | - 人类自己就更是如此。比如说，在高中，我们会根据考试的成绩或排名（某种“reward”）反思，进一步改进自己的学习方法这一policy；在人际交往中，我们也会根据对方做的事情来调整自己的行为，等等。
101 | 
102 | 甚至对于新生的AI，和“环境”的交互也是“智能”最终的体现。
103 | 
104 | > 在OpenAI推出GPT3的时候，研究者们看了看它，说：“这个模型真不错”；在推出ChatGPT的时候，连你的奶奶都跑过来跟你说：“这个模型真不错”！你也看出来了——这之间是有很大的差距的。这一差距是什么？就是RL。正是RL让GPT3从一个普通的大语言模型变成一个可以和人对话、交互并了解人的喜好的"chatbot"。 ——Eric Mitchell, on the guest lecture of CS285
105 | 
106 | 因此，我们更加清楚了RL这一领域的重要性。实际上，它的发展也主要集中在两个领域：
107 | - RL作为一个算法问题，如何解决？
108 | - 我们是否可以用RL的思想来解释人类或动物的行为？
109 | 
110 | 当然，作为学习计算机的人，我们更关心的是第一个问题。本笔记就会从最基本的思想出发，带着大家一起解锁各个算法，建立一个解决RL问题的较全面的框架。但在开始之前，我们先来看几个常见的问题。
111 | 
112 | # We already have Physics. Why do we need RL (for control) anyway?
113 | 
114 | > Q: 你搞个RL模型下下棋玩玩游戏差不多得了，我不懂不多作评价。连开车、发射火箭，你也要用你那个什么算法来？你模型几亿个参数我不懂，但物理规律在哪里呢？你发射火箭的时候，大到牛顿定律、拉格朗日方程，小到空气阻力系数甚至火箭的质量，在你的一堆只有线性函数和relu的模型里哪一个体现出来了？我凭什么相信你的甚至要指明随机数种子的算法，而不相信我几千个方程式严格联立求解出来的纯物理的控制方法呢？
115 | >
116 | > A: 物理的方法当然合理，因为它们相当于人类完成了RL的"pretraining"。我们也无法否认物理方法在各种领域应用广泛，比如发射火箭，就可以完全通过物理的控制来完成。
117 | >
118 | > 但是，物理方法主要有两个问题。第一，你能完全保证一切都在物理的控制范围之内吗？比如，对于自动驾驶，当你获得一张在驾驶座前拍的照片，单纯的物理学并不能帮你把你前方的人、车、交通标志、道路都分清楚，确定下来。所以，这里就必须要有ML的方法介入了。
119 | >
120 | > 第二，就算对于完全完美的物理系统，纯粹物理的方法也不一定可行。比如，你为一只机器狗装了无数的传感器，它现在可以精准地定位它脚下沙滩上的每一颗沙子的位置、形状、大小和摩擦系数。再假设你经过研究可以高效地求解这个复杂的物理系统。但是假如突然脚下出现了一块小石头，你的系统就完全错误了。发射火箭能成功，是因为在高空意外很少见；而一旦意外出现，出一个错，动力学系统的混沌效应就会让物理的控制方法很难恢复。对于同样的问题，相反，对于大部分的RL算法，它把环境抽象为一个黑盒，发生意外情况的时候，agent会感知，并且恢复。这就是RL的优势所在。
121 | 
122 | 物理和RL的关系，就像图片识别中专家方法和卷积网络的关系一样。作为二十一世纪的现代化公民，我们应该逐渐意识到神经网络的参数里蕴含的无穷能力了。
123 | 
124 | # What's the difference between RL and DL (in the root)?
125 | 
126 | RL和DL有什么区别呢？当然，我们知道，RL和环境有关，也有随机性；但它毕竟也是一个对确定的objective进行优化的过程。那么，RL和我们之前接触的DL（或者说，supervised learning和unsupervised learning）在**实质性**上真的有不同之处吗？
127 | 
128 | ## From the results
129 | 
130 | 在具体详细地分析之前，我们先从结果上来看，用一句话来引入：
131 | 
132 | > Generative models are impressive because the images look like something **a person might draw**; RL models are impressive because **no person had thought of their strategies before**. ——[Sergey Levine](https://scholar.google.com/citations?user=8R35rCwAAAAJ&hl=en&oi=ao), the instructor of CS285
133 | 
134 | 这句话很好地概括了RL和DL（在目标上）的区别。
135 | - DL所做的是**模仿**，这件事情**有标准答案**。比如，对于generative model，它的最终任务就是学会数据集分布 $p(x)$ 。人们夸赞它，夸赞它模仿得惟妙惟肖，就像是人一样。
136 | - 而RL所做的是**决策**，这件事情**没有标准答案**。我们也已经看到，agent的目的是最大化cumulative reward。如何能最大化这一reward呢？每一步又该take什么action呢？这不仅没有答案，甚至在理论上都不一定是唯一的。人们夸赞RL agent，夸赞它们作出的举动是如此的“新颖”，与普通人的方法完全不同，甚至于超越了人类的认知。
137 |     - 比如，在当年AlphaGo战胜李世石的时候，agent给出了著名的"Move 37"，当时所有的围棋专家都无法理解这一步的意义，这就是RL的魅力所在。
138 | 
139 | 当然，从目标上来看，还有一个更为重要的区别：**generalization**。我们在这里暂且不描述，当我们去实践RL的具体任务的时候，我们就会真正体会到它们的差异。（如果你很好奇，可以直接去看看最后一讲的[总结](./23-challenge.md#generalization)）
140 | 
141 | ## From the process
142 | 
143 | 从（训练）过程上，RL和DL也有很多细节上的差异。这些差异或多或少是重要的，你也可以发现，很多RL“算法”就是为了减轻这些差异带来的问题而设计的。
144 | 
145 | - **数据来源**：
146 |     - 在DL中，我们被给定训练数据集，我们在上面训练我们的模型；
147 |     - 在RL中，开始我们一无所有。我们需要自己想如何和环境交互，获得数据。我们还要平衡获得新数据的过程和在老数据上面做类似DL的训练的过程，即平衡**exploration**和**exploitation**。
148 | - **数据的关联性**：即使数据都采集完成了，训练也并不相同：
149 |     - 在DL中，我们认为数据是i.i.d.的。可以发现很多DL算法实际上隐式地依赖（或默认了）这一点；
150 |     - 但在RL中，数据之间是**相关**的。因为我们和环境的交互方式是，每一次环境给我们一个next state，我们就在那一个state上面继续活动。也就是说，我们采集到的数据是一条轨迹（**trajectory**）。在这样的数据里面，前后肯定是有关联的。
151 | - **额外的和环境交互的代价**：
152 |     - 你之后跑RL训练的时候，你会发现你的神经网络出奇的小（比如，就几千个参数），但训练又出奇的慢。为什么？实际上很大的时间花费在了`env.step(actions)`这一函数上。这一函数的作用就是，输入action，环境计算并给你next state和reward。如果你使用`gym`这个包，那么恭喜你，这一计算必须在CPU上使用`np.ndarray`进行；你还总是需要把`np.ndarray`和`torch.Tensor`相互转化，这都是你的GPU利用率很低的原因。
153 |     - 这其实已经算好了，因为我们至少还只是用simulator在电脑上操作；如果要训练在实际世界里的机器人，那一秒就是货真价实的一秒！
154 |     - 因此，在RL，必须注意一件事叫做**sample efficiency**，也就是和环境交互的次数不应该过多。这又会进一步和sample的质量形成tradeoff……
155 | 
156 | 至此，应该可以看出DL和RL的多方面区别了。其实，这里列举的还不是全部，我们可以在具体的算法中进一步思考它们的区别。
157 | 
158 | # Wait... So what do you mean by "Deep" RL?
159 | 
160 | 的确——RL实际上具有比你的想象更加悠久的历史，远远早于神经网络的发现。但是，当时人们的模型一般都是线性的。就是最近，人们才想到**把RL和DL结合起来**，用神经网络来表示policy。这就是Deep RL。
161 | 
162 | 直观上，把RL搞得Deep一些肯定是有益无害；但事实上有很subtle的事情会发生。我们会在后面的笔记中进一步讨论这一点。
163 | 
164 | # Anyway... Let's get [started](/lecture/notes/2-imitation_learning.md)!


--------------------------------------------------------------------------------
/lecture/notes-zh/11-model-based.md:
--------------------------------------------------------------------------------
  1 | # Model-Based RL
  2 | 
  3 | 在上一讲，我们已经介绍了如果我们有一个环境的模型，我们可以使用十分强大的算法来计算最优策略。因此，很多时候我们希望**建立一个模型**模拟环境的动力学，进而使用前面的方法。
  4 | 
  5 | 一个最直接的思路：
  6 | 
  7 | 1. 运行某种baseline policy $\pi_0$ （比如随机采样）, 收集数据集 $D=\{(s,a,s')\}$ ；
  8 | 2. 在 $D$ 上学习一个动力学模型 $f(s,a)$ ，使得 $f(s,a)\approx s'$ ；
  9 | 3. 使用前面介绍的方法（比如LQR）来计算最优策略 $\pi^\star$ 。
 10 | 
 11 | 但这个方法有一定的问题：前面的baseline policy $\pi_0$ 因为没有任何策略，很可能不能探索到全部的state；但 $\pi^\star$ 因为是更优秀的，很可能到达 $\pi_0$ 没有到达的地方，因此有可能会不知所措。
 12 | 
 13 | > 比如说，一个游戏上来先有一些小兵，击败它们后会出现一个boss。如果我们的baseline policy只是随机走动，那么用它收集出来的数据训练出来的模型 $f$ 很可能只能学会小兵的动力学。这样，用 $f$ 计算出来的 $\pi^\star$ 虽然可以击败小兵，但是完全不可能知道如何击败boss。
 14 | 
 15 | 这个问题被称为**distribution mismatch**。当然，我们很容易就可以给前面的方法做一个改进来减缓这个问题。想象上面的例子，只要我们每一次plan得到新的策略之后重新采集数据，就可以覆盖更多的state。这样，我们就得到了以下的方法：
 16 | 
 17 | 1. 初始化策略为某种baseline policy $\pi_0$ （比如随机采样）。开始时，数据集 $D=\emptyset$ ；
 18 | 2. 重复：
 19 |     1. 运行当前的最新策略 $\pi$ , 收集数据集 $D=D\cup \{(s,a,s')\}$ ；
 20 |     2. 在 $D$ 上学习一个动力学模型 $f(s,a)$ ，使得 $f(s,a)\approx s'$ ；
 21 |     3. 使用前面介绍的方法（比如LQR）来更新最优策略 $\pi$ 。
 22 | 
 23 | 这个方法在理想情况下感觉可以跑的很好。但再仔细一想，如果学习环境的动力学模型出现错误，那么使用这个算法很可能会浪费很多数据。具体地，我们的LQR方法是基于模型的；如果模型出现了一定的误差，那么计算得到的 $\pi$ 的第一步差得还不多，但越走就会差得越远。
 24 | 
 25 | > 比如说，假设我们的模型要模拟开车的动力学。模型本身很可能只是有一个很小的误差（比如说，它认为车轮向左偏1度的时候才会让车直着走）。但是用这个模型计算最优策略（比如说目标就是让车直着走），那么这个策略走出来的轨迹就会越来越偏离目标。
 26 | 
 27 | 这样，用存在很小误差的动力学走很多步，会把误差放大，采集出来的数据都很偏离目标，因此意义不大。
 28 | 
 29 | 很容易想到，我们为了改变这个情况，需要给模型及时更正自己错误的机会。这样，我们就终于给出了Model-Based RL的一个基本思路：
 30 | 
 31 | > **Model-Based RL Algorithm** (a.k.a. Model Predictive Control,MPC)
 32 | 
 33 | 1. 用某种baseline policy $\pi_0$ （比如随机采样）获得一个数据集 $D=\{(s,a,s')\}$ ；
 34 | 2. 重复：
 35 |     1. 在 $D$ 上学习一个动力学模型 $f(s,a)$ ，使得 $f(s,a)\approx s'$ ；
 36 |     2. 使用某种planning的方法来更新最优策略 $\pi$ 。
 37 |     3. 运行当前的最新策略 $\pi$ **仅一步**, 收集 **一组（不是一个）** 数据，加入数据集： $D=D\cup \{(s,a,s')\}$ ；
 38 | 
 39 | 这里值得一提的是一个有趣的细节：我们第2步的planning该怎么做？如果最保守，我们应该运行一个完整的LQR；但实际上我们可以稍微在这一步上放松，也就是取一个小一点的horizon，而不是完整的horizon。更进一步，LQR甚至都不是必须的，使用之前最“笨”的random shooting都是可以的。
 40 | 
 41 | > Q: 为什么这里planning的准确性并不被特别要求？
 42 | >
 43 | > A: 因为和之前已知环境演化的设定不同，这里我们的模型还没有完全学会环境的动力学。因此，与其在这个不完全正确的动力学上大花功夫，不如节约一些时间，多运行这个loop几次，从而从环境中采集更多的、方向更正确的数据。同时，上一讲的最后介绍[MPC](./10-optimal_control_planning.md#example-nonlinear-model-predictive-control)的时候也提到过，越是动力学不准确，horizon比较小的算法越是有优势。
 44 | 
 45 | ## Uncertainty
 46 | 
 47 | > Q: Is that the end of the story?
 48 | > 
 49 | > A: No! There are a lot of issues that we still need to consider.
 50 | 
 51 | 我们首先来考虑一个实际的问题：我们如何选取这个 $f$ 呢？当然，基于物理定律的模型（加上几个拟合的参数）确实可以，但不够一般，也不一定准确。因此，我们肯定希望能把 $f$ 选取为一个神经网络。
 52 | 
 53 | 但这个时候就出现了一个不易察觉的问题（当然，对于非神经网络也有类似的问题，但是在神经网络的地方它体现的尤其严重）。我们的数据集虽然在不断拓展，但一开始的时候我们所有（或大部分）数据都还是来自那个最原始的base policy，也就是基本是随机选取的。在这种情况下，我们的神经网络**很容易overfit**。
 54 | 
 55 | > 就比如，我们还是希望神经网络能既学会小兵如何反应，也学会boss如何反应。但现在的情况是，我们有500组 $(s,a,s')$ ，它们都来自小兵；而我们只有10组 $(s,a,s')$ 来自boss。这样，神经网络很可能学会一个基本是小兵的动力学的模型，只是在boss的那10组数据上突然overfit一下。比如下面的图形，我们本来加入了一个数据点（红点），但莫名其妙出现了一个不该出现的“假”极值点（五角星标记的点）。
 56 | 
 57 | ![](./assets/11-1.png)
 58 | 
 59 | 你也许会说，这并没有太大的问题呀——毕竟随着策略的提升，boss的数据会越来越多，一切都会好起来的。但问题的关键在于**planner的策略和模型形成一个正反馈**。在一开始的时候，如果描述环境动力学的网络overfit，就像是游戏规则有漏洞，会导致我们的planner（比如LQR）会利用这个漏洞，带着策略走向错误的方向；而错误的策略又进一步使得模型不能把能力集中在真正reward大的地方。这样，迭代次数越多，表现越差。
 60 | 
 61 | 那么，问题的根源在于何处？容易看出，关键在于我们的planner现在**过度相信**我们的模型给出的动力学，完全按照这个动力学来作出决策。但实际上的事实是：我们的模型本身从环境获得的数据很少，预测的动力学很容易不准确。因此，我们需要对模型的**不确定性**有所考虑。
 62 | 
 63 | ### Example: Why Uncertainty Matters
 64 | 
 65 | 我们先来考虑一个简单的例子，从直觉上阐述不确定度给了我们什么样的信息。
 66 | 
 67 | 假设现在我们的目标是到达悬崖的边上。离悬崖的边缘越近，reward就越高；但是如果掉下悬崖，就会有很多负的reward。我们先假设我们完全不考虑模型的不确定性，也就是使用之前的算法。那么，类似我们之前所说的那样，模型很容易学会向悬崖的方向行走，并且在接近悬崖边缘的时候我们也能采集到很多掉下去的数据和保持在上面的数据，因此能意识到悬崖边缘的存在性。但问题是，我们的模型根据这些数据拟合出来的结果（悬崖的边界）不一定完全精确。这就可能会导致我们可能会走向一个本来模型认为在悬崖上面的位置，但实际上掉下悬崖。
 68 | 
 69 | 但设想我们现在对于每一个 $(s,a)$ 对都估计一下**模型给出的预测 $s'$ 的不确定度**。这样，在悬崖的边缘，不确定度应该很大，我们的planner也就不会贸然前进了。因此，通过这样地引入不确定性，我们就可以避免走向悬崖。
 70 | 
 71 | 接下来，我们分两个部分进行讨论：第一个部分里，我们考虑如何估计模型的不确定性；第二个部分里，我们考虑如何利用这个不确定性来改进我们的planner。
 72 | 
 73 | ### Estimating Uncertainty
 74 | 
 75 | 如何估计模型的不确定性？一个直观的想法是，模型本身的输出是一个 $s'$ 的分布，那么我们就可以估计一下模型自己对输出的confidence好了。比如说，如果模型的任务是一个十分类，我们就输出softmax之后的 $s'$ 概率。然后，我们用这个概率来进行后面的策略安排（比如说，如果49%的概率掉下悬崖，51%的概率到达目标，我们大概还是不要这样做）。
 76 | 
 77 | 但必须注意，这个直观的想法**并不正确**！在DL中，人们早已发现模型具有一种[overconfidence](https://proceedings.mlr.press/v70/guo17a/guo17a.pdf)的倾向。因此，我们必须分清两种不确定性：
 78 | 
 79 | - 模型**输出**显示出来的不确定性（statistical uncertainty）：这是因为环境本身的动力学存在不确定性，从而造成模型的置信概率(confidence)不高。比如，如果给CIFAR上面的分类器扔一张狗站在车上的照片，那么它的输出很可能是“狗”和“车”两个类别的概率接近。换句话说，这是数据本身造成的问题；
 80 | - 模型**本身**的不确定性（model uncertainty）：这才是我们之前提到的，因为**模型训练的不确定性**造成的模型的bias。比如说，在DL训练分类器的时候，在CIFAR上面训练出来的模型有的就是分辨猫非常准，有的就是分辨狗非常准。这种不确定性是训练过程造成的，模型本身的问题。这样的不确定性显然是不能从模型的logits中看出来的。
 81 | 
 82 | 一种数学上的表述是，第一种不确定性是
 83 | 
 84 | $$
 85 | p(y|x,\theta)
 86 | $$
 87 | 
 88 | 也就是给定**当前的**模型，输出label $y$ 的概率（模型自己的confidence）有多大；而第二种不确定性是
 89 | 
 90 | $$
 91 | \mathbb{E}_{\theta \sim p(\theta|D)}[p(y|x,\theta)]
 92 | $$
 93 | 
 94 | 也就是给定整个数据集，训练出来的模型不见得是固定的，因此预测的结果也不一定相同。我们要设法估计第二种不确定性，就必须知道这个分布的细节。
 95 | 
 96 | 一个解决这个问题的方法就是**Bayesian Neural Networks(BNN)**。在这样的神经网络中，每一个weight不再是一个数字，而是一个分布（代表训练出来的神经网络）！同时，它近似地认为各个weight是独立的，并且遵循高斯分布。这样，我们就可以强行计算出 $p(\theta|D)$ 。这个topic比较复杂，我们会在之后再详细介绍。
 97 | 
 98 | 这里，我们来搞一个“耍赖”的方法：**Ensemble**。这个方法的思路很简单：我们训练出来**多个**模型。然后我们把每一个模型都跑一遍，分别计算一下 $s'$ 的分布。这样，我们就可以得到一个模型的不确定性的估计。也就是说，我们近似
 99 | 
100 | $$
101 | \mathbb{E}_{\theta \sim p(\theta|D)}[p(y|x,\theta)]\approx \frac{1}{N}\sum_{i=1}^N p(y|x,\theta_i)
102 | $$
103 | 
104 | 可以认为，这还是合理的，因为每一个 $\theta_i$ 都是训练出来的，因此都是从 $p(\theta|D)$ 中取样得到的。而得到这些 $\theta_i$ 也是相对容易的：我们只需要训练出来多个模型就可以了。
105 | 
106 | > Q：我们从同样的数据集 $D$ 使用同样的方法训练多个模型，难道得到的结果不会都一样吗？
107 | >
108 | > A: 当然不是。首先，我们的optimizer（比如，SGD）本身就具有一定的随机性。其次，我们每一次训练模型之前，模型里的参数也都是随机初始化的。最后，就算没有随机性，也有一些处理办法。比如，在古老的时候，人们先搞一个数据集 $D$ ，然后每一个训练 $\theta_i$ 的 $D_i$ 都是从 $D$ 中随机（可重复）取样得到的。当然，现在我们早已不采用这样的方法，因为SGD的随机性已经足够了。
109 | 
110 | 实验中，一般取 $N=10$ 。你可能又要抱怨：这样方差也太大了吧。但实际上它work！
111 | 
112 | ### Exploiting Uncertainty
113 | 
114 | 我们现在已经有了一个方法来估计模型的不确定性。接下来，我们要考虑如何利用这个不确定性来改进我们的planner。
115 | 
116 | 我们首先注意到，假设我们已经有了 $p(\theta|D)$ 或者其近似，那么计算目标函数 $J$ 对 $\theta$ 的**平均值**就足够了。因为，之前指出的问题全部来自于从 $p(\theta|D)$ 取样出某一个 $\theta$ 造成的偏见，这已经被这一方法消去了。（在之前悬崖的例子里，我们如果把多个模型得到的不同边界线平均一下，很可能就会得到一个比较精确的边界线）
117 | 
118 | 因此，我们从现在开始只需要考虑如何计算 $J$ 的平均值。原来，我们的planner的目标是
119 | 
120 | $$
121 | J=\sum_{t=0}^T r(s_t,a_t), \quad \text{s.t. } s_{t+1}=f(s_t,a_t)
122 | $$
123 | 
124 | 但现在相当于决定性的 $f$ 不再存在，而是变成了
125 | 
126 | $$
127 | p(s_{t+1}|s_t,a_t)=\mathbb{E}_{\theta\sim p(\theta|D)}[p(s_{t+1}|s_t,a_t;\theta)]
128 | $$
129 | 
130 | 直观地想，给定 action sequence $a_1,a_2,\cdots$, 我们计算 $J$ 的方法应该是
131 | 
132 | 1. 重复多次：
133 |     1. 对 $t=1,2,\cdots,T-1$ :
134 |         1. 从ensemble（或者更高级地，从 $p(\theta|D)$ ）随机采样一个 $\theta$ ；
135 |         2. 用这个 $\theta$ 计算 $s_{t+1}$ ；
136 |     2. 计算 $J$ 。
137 | 2. 计算各个得到的 $J$ 的平均值。
138 | 
139 | 但实际上，我们**并不这样做**，而是采用以下的方法：
140 | 
141 | > **Objective with Ensemble**
142 | 
143 | 1. 对每一个ensemble中的模型 $\theta$ ：
144 |     1. 对 $t=1,2,\cdots,T-1$ ，**不断用这一个** $\theta$ 计算 $s_{t+1}$ ；
145 |     2. 计算 $J$ 。
146 | 2. 计算各个得到的 $J$ 的平均值。
147 | 
148 | 乍一看，这个方法完全不对——根据前面的理论，每个timestep $t$ 对应的 $\theta$ 必须是独立取样的，因为原先的
149 | 
150 | $$
151 | r(s_1,a_1)+\mathbb{E}_{s_2\sim p(s_2|s_1,a_1)}\left[r(s_2,a_2)+\mathbb{E}_{s_3\sim p(s_3|s_2,a_2)}[r(s_3,a_3)+\cdots]\right]
152 | $$
153 | 
154 | 应该被展开为
155 | 
156 | $$
157 | r(s_1,a_1)+\mathbb{E}_{\theta_1\sim p(\theta|D)}\mathbb{E}_{s_2\sim p(s_2|s_1,a_1;\theta_1)}\left[r(s_2,a_2)+\mathbb{E}_{\theta_2\sim p(\theta|D)}\mathbb{E}_{s_3\sim p(s_3|s_2,a_2;\theta_2)}[r(s_3,a_3)+\cdots]\right]
158 | $$
159 | 
160 | 而按照现在的这个算法，不同时间的 $\theta$ 完全是一样的。但实际上，这个方法work，而且表现的比我们最直观的思想给出的方法更好一点（可以参见hw4中的[实验数据](../../homework_repo/hw4/report.md)）！
161 | 
162 | > Q: 为什么会这样？
163 | > 
164 | > A: 因为我们的模型可能会学会某种**相关性**。直观上讲，我们原先的“直觉的”方法相当于在每一步做平均；而现在的方法相当于让ensemble中的 $N$ 个专家分别独立计算，然后做某种“voting”。
165 | >
166 | > 这样，如果这 $N$ 个模型都是专家，那么后者显然更好——假设现在我们请一群编程专家一起写代码。前一种方法相当于每一次轮流选取一个专家写一行代码，这样很可能大家什么都做不出来；而后面一种方法则是相当于让每一位专家独立写一份代码，最后把这些代码的运行结果进行投票。这样成功的可能性就会更高。
167 | >
168 | > 本质上，每个单独的专家预测的 $s_t$ 会有自己独特的 bias。假如让所有专家平均决策，就会产生单个专家没有见过的另外一种 bias，从而降低预测准确度。
169 | 
170 | # When Not Fully-observable
171 | 
172 | 我们最后来讨论一个topic：当环境不是完全可观测的时候，我们该怎么办？之前，我们总是没有区分state（可以完全描述整个系统）和observation（由state唯一决定，但不一定可以完全描述整个系统）。现在，就让我们来考虑一下这个问题。
173 | 
174 | 如果只有observation，会产生很多困难。比如说，如果observation是图片，那么它维度高，信息量却很低（比如，Inverted Pendulum环境中state只有4维，但画成图片却可以有几千维度）。此外，observation也不直接关联到dynamic，因此很难在observation上直接建模。
175 | 
176 | 处理这一问题有两种思路。第一种思路是我们见招拆招，除了动力学模型之外，我们再学习一个**Observation Model** $p(o_t|s_t)$ 。然后，我们就需要训练
177 | 
178 | $$
179 | \max_\phi \sum_t \mathbb{E}_{s_t,s_{t+1}\sim p(s_t,s_{t+1})}\left[\log p_\phi(s_{t+1}|s_t,a_t)+\log p_\phi(o_t|s_t)\right]
180 | $$
181 | 
182 | 但现在问题在于，我们不知道 $s_t$ ，也不知道 $s_t,s_{t+1}$ 应该从哪个分布取样（这个分布应该是state的分布，但我们并不知道这个分布）。
183 | 
184 | 这时，我们突然想到这一场景很类似：在DL中我们学习过Latent Variable Model。这里的**state就像latent variable**：latent variable是对高维度低信息量图片的一个“压缩”；我们不知道latent variable的分布，但可以对其作出假设；我们不知道latent variable对于每一个输入具体是多少，因此我们需要一个**encoder** $q_\psi(s_t|o_t)$ 。这样，我们就可以给出新的目标：
185 | 
186 | $$
187 | \max_{\phi,\psi} \sum_t \mathbb{E}_{o_t\sim p(o_t),o_{t+1}\sim p(o_{t+1})}\mathbb{E}_{s_t\sim q_\psi(s_t|o_t),s_{t+1}\sim q_\psi(s_{t+1}|o_{t+1})}\left[\log p_\phi(s_{t+1}|s_t,a_t)+\log p_\phi(o_t|s_t)\right]
188 | $$
189 | 
190 | 但是这里实际上有点tricky：为什么我们敢假设 $s_t$ 只依赖于 $o_t$ 的信息呢？实际上，很可能完全不是这样的：一般来说 $s_t$ 依赖于 $o_1,\cdots,o_{t-1},a_1,\cdots,a_{t-1}$ 都是可能的。因此，我们必须根据对于环境的基本假设给出encoder的形式。
191 | 
192 | > 比如说之前说到的Inverted Pendulum环境，我们的observation是一个图片。这个图片只包含角位置，但不包含角速度。而对于action的依赖性，我们举个例子：假设在某游戏里可以选择“开挂”这一action。操作后没有任何图片上的变化，只是攻击力变成原来的两倍。这样，就算给出所有observation也无法获得这一信息。
193 | 
194 | 当然，我们还有第二种方法。这种方法就是——我们直接不管三七二十一，就学
195 | 
196 | $$
197 | p(o_{t+1}|o_t,a_t)
198 | $$
199 | 
200 | 这样算法上是爽了，但相对应的就需要神经网络处理 $o_t$ 的水平很高。比如，可能给定 $o_t$ 之后进行多层卷积，还可能需要attention。这一方法也有成功的案例，但我们就不做过多介绍。
201 | 
202 | # Reference Papers
203 | 
204 | 1. [Neural Network Dynamics for Model-Based Deep Reinforcement Learning with Model-Free Fine-Tuning](https://arxiv.org/abs/1708.02596)（介绍model-based和model-free的结合）
205 | 2. [Deep Reinforcement Learning in a Handful of Trials using Probabilistic Dynamics Models](https://arxiv.org/abs/1805.12114)（非常sample-efficient的model-based RL方法）
206 | 3. [Sample-Efficient Reinforcement Learning with Stochastic Ensemble Value Expansion](https://arxiv.org/abs/1807.01675)（介绍了model-based RL中的ensemble方法）
207 | 4. [SOLAR: Deep Structured Representations for Model-Based Reinforcement Learning](https://arxiv.org/abs/1808.09105)（介绍了 partially observed model-based RL中的latent variable方法）


--------------------------------------------------------------------------------
/lecture/notes-zh/13-exploration_1.md:
--------------------------------------------------------------------------------
  1 | # Exploration: Why we need that?
  2 | 
  3 | 在前面的几讲中，我们已经给出了很多常见的RL算法。实验上也发现，这些算法在许多简单的任务和环境上面表现都已经十分优秀。但是，当我们把这些算法应用到一些复杂的任务上时，我们会发现它们的表现并不尽如人意。
  4 | 
  5 | 比如考虑一个简单的例子：在一个游戏中，开始你和一群小兵战斗，但当打死这些小兵之后，你会遇到一个很强大的boss。而如果你被小兵击败，你就会重新复活，回到游戏起点。对于我们的模型来说，其最好的策略可能和我们想象的不太一样——如果打败了小兵，遇到boss，模型会发现很难获胜，因此reward很小；但是如果总是被小兵打死，那么总是可以复活，从而可以不断积累这个比较小的reward。这样的话，模型可能会选择总是被小兵打死，而不去挑战boss。换句话说，我们的模型总是喜欢“摆烂”，不思进取。而为什么我们自己在玩这种游戏的时候不会轻易“摆烂”呢？其实是因为我们清楚这个游戏中各种规则的实际含义（“what these sprites mean”）。
  6 | 
  7 | 那么，如何避免我们的模型出现这种情况呢？我们需要让模型去探索环境。比如，模型也许通过某种巧合探索到了击败boss之后会有巨大的reward，它就会学会和boss战斗的技巧。这样，前面的问题就在理论上可以被解决。
  8 | 
  9 | 但exploration和exploitation（“剥削”，也就是“不思进取”，只在已有的知识上最大化reward）的平衡是困难的。可以想象，如果过分地explore，很可能陷入混乱；而只exploit更容易陷入局部最优。在理论上，在某些简化的情况下，可以推导出最优的exploration策略；但实际上对于我们这些复杂的任务，这些理论方法往往太复杂以至于无法实现。这就是为何我们要研究exploration。
 10 | 
 11 | 你可能还记得之前我们在介绍Q learning中提到的exploration 策略（比如 $\epsilon$ -greedy和Boltzmann）。但那些只是最简单的exploration策略。我们这一讲就来介绍一些理论背景更强硬的，更复杂的exploration策略。
 12 | 
 13 | # Toy Model: Multi-arm Bandit
 14 | 
 15 | “多手臂土匪”模型是一个相对简单的，用来在理论上分析exploration策略的模型。在这一模型中，土匪可以采用 $n$ 种不同的action：
 16 | 
 17 | $$
 18 | \mathcal{A}= \{a_1,a_2,\cdots,a_n\}
 19 | $$
 20 | 
 21 | 给定其中的一个action，土匪会立刻获得一个reward（因此这一模型被称为stateless的）。这一reward是随机的，服从某种未知分布
 22 | 
 23 | $$
 24 | r_{i}\sim p(\cdot |a_i)
 25 | $$
 26 | 
 27 | 我们进一步认为，土匪的 $n$ 种操作具有某种共性，因此我们可以找到 $n$ 个参数 $\theta_1,\cdots,\theta_n$ ，使得
 28 | 
 29 | $$
 30 | p(r|a_i)=p_{\theta_i}(r)
 31 | $$
 32 | 
 33 | 其中 $p_{\theta_i}$ 是一个probability density model。
 34 | 
 35 | > 一个简单的例子： $p(r|a_i)=\text{Bernoulli}(\theta_i)$ 。当然，接下来的结果对于更一般的情形也是成立的。
 36 | 
 37 | 这一表述也可以理解为一种**POMDP**(Partially Observable Markov Decision Process)：我们的observation是这里的reward，而其背后的state是 $\theta_i$ 。一个概念叫做**belief state**，它指的是根据我们目前的所有observation情况下推测的概率分布 $\hat{p}(\theta_1,\cdots,\theta_n)$ （注意 $\hat{p}$ 和 $p$ 没有任何关系）。
 38 | 
 39 | 我们的目标是，最大化
 40 | 
 41 | $$
 42 | R=\sum_{t=1}^T r(a_t)
 43 | $$
 44 | 
 45 | 注意这个过程包含了我们试错的过程。等价地，我们也可以把这一目标写为**regret**的形式：
 46 | 
 47 | $$
 48 | \text{Reg}=T\cdot \mathbb{E}_{r\sim p(r|a^\star)}[r]-\sum_{t=1}^T r(a_t)
 49 | $$
 50 | 
 51 | 它代表了我们的模型和最优模型之间的差距。接下来，我们介绍三种方法，来最小化这一regret。
 52 | 
 53 | ## Method 1: Optimistic Exploration/UCB(Upper Confidence Bound)
 54 | 
 55 | 这一方法有点像“model free”，它不对reward产生的分布做任何假设。它的思想是，对于纯粹exploit的策略，选择action的方法是
 56 | 
 57 | $$
 58 | a=\arg\max_{a_i}\hat{\mu}_{a_i}
 59 | $$
 60 | 
 61 | 其中 $\hat{\mu}_{a_i}$ 代表我们目前使用 $a_i$ 得到的所有reward的平均值。为了把explore加入，我们注意到 $\hat{\mu}_{a_i}$ 存在误差。其实际值估计应该是
 62 | 
 63 | $$
 64 | \hat{\mu}_{a_i}\pm C\hat{\sigma}_{a_i}
 65 | $$
 66 | 
 67 | 这种方法的思想是，我们最optimistic，因此取
 68 | 
 69 | $$
 70 | a=\arg\max_{a_i}[\hat{\mu}_{a_i}+B(a_i)]
 71 | $$
 72 | 
 73 | 其中 $B(a_i)$ 代表给action $a_i$ 加的bonus。理论上可以证明，如果取
 74 | 
 75 | $$
 76 | B(a_i)=\sqrt{\frac{2\log T}{N(a_i)}}
 77 | $$
 78 | 
 79 | 其中 $N(a_i)$ 代表我们选择了 $a_i$ 的次数，那么我们的regret会以 $\log T$ 的速度增长，这是理论最优的。
 80 | 
 81 | ## Method 2: Thompson Sampling/Posterior Sampling
 82 | 
 83 | Thompson sampling方法稍微加入了一些model based的成分：我们来根据现在的知识预测可能的 $\theta_i$ 值的分布 $\hat{p}(\theta_1,\cdots,\theta_n)$ （就是之前所说的belief state）。接下来，我们从中随机采样一组 $(\theta_1,\cdots,\theta_n)$ ，然后基于这一组参数作出最优的决策。利用这一决策带来的数据，我们可以重新update我们的belief model，从而不断重复。
 84 | 
 85 | 显然，这一方法在理论上难以分析。但实际上，它跑的很好。
 86 | 
 87 | ## Method 3: Information Gain
 88 | 
 89 | 这一方法的思想是：我们要平衡exploitation和exploration。对于exploitation我们已经可以给出一个定量的刻画——reward；但我们并不好说怎样的方法算是explore。它想到，explore的程度可以用**获得信息的量**来刻画。根据信息理论，定义information gain（也叫做**互信息**）
 90 | 
 91 | $$
 92 | \text{IG}(a)=\mathcal{H}(\hat{p}({\theta}|h))-\mathbb{E}_{r\sim p(\cdot|a)}\left[\mathcal{H}(\hat{p}(\theta|h,a,r))\right]
 93 | $$
 94 | 
 95 | 其中 $\mathcal{H}$ 代表分布的熵，而 $h$ 代表所有的历史， $\hat{p}(\theta|h)$ 和 $\hat{p}(\theta|h,a,r)$ 代表根据历史（后者比前者多一组数据）训练出的belief state分布。这一表达式代表了选择action $a$ 之后增加的信息量（减少的不确定性）。
 96 | 
 97 | 在选择 action 的时候，要做到 exploitation 和 exploration 的平衡：前者体现在和每种 latent $\theta$ 的 optimal policy $a^\star_\theta$ 的差距
 98 | $$\mathbb{E}_{\theta\sim p(\theta|h)} [r(a^\star_\theta)-r(a)],$$
 99 | 而后者体现在互信息 $\text{IG}(a)$。于是我们可以让
100 | 
101 | $$
102 | a=\arg\min_a \frac{(\mathbb{E}_{\theta\sim p(\theta|h)} [r(a^\star_\theta)-r(a)])^2}{\text{IG}(a)}.
103 | $$
104 | 
105 | 接下来，让我们把这些方法从multi-arm bandit的模型推广到更一般的RL问题中。
106 | 
107 | # Exploration in Deep RL
108 | 
109 | ## UCB in Deep RL
110 | 
111 | 一个很直接的方法是，我们直接把bonus加在reward function上面：
112 | 
113 | $$
114 | r^{+}(s,a)=r(s,a)+B(s)=r(s,a)+C\sqrt{\frac{\log T}{N(s)}}
115 | $$
116 | 
117 | 然后，我们把这一reward代入任何之前的算法，然后tune一下这个超参数 $C$ 就可以了。也有其他的bonus的形式（上面这种叫做UCB），比如MBIE-EB：
118 | 
119 | $$
120 | B_{\text{MBIE-EB}}(s)=\sqrt{\frac{1}{N(s)}}
121 | $$
122 | 
123 | 和BEB：
124 | 
125 | $$
126 | B_{\text{BEB}}(s)=\frac{1}{N(s)}
127 | $$
128 | 
129 | 但是这里有一个小细节： $N(s)$ 怎么算？我们不能直接数，否则（比如说）对于连续的state，count一定是1。不仅如此，即使对于离散的state，直接数数意义也不大（比如说，假设游戏的图形界面上显示了游戏时间，那么同样的情况可能对应的state就大不相同了）。因此，可以想到，我们需要从 $s$ 中拿出某种特征，然后根据这些特征来计数。接下来就介绍若干计数的方法。
130 | 
131 | ### Counting Method 1: Fitting Generative Model
132 | 
133 | 这一方法的思想是，我们用一个generative model学习 $p_\phi(s)$ ，代表**按照我们见过的所有数据的数据集训练**，state $s$ 出现的概率。
134 | 
135 | 假设有了这样一个model，那么理想情况下，对于一个特定的state $s$ ，有
136 | 
137 | $$
138 | p_\phi(s)\approx \frac{N(s)}{N}
139 | $$
140 | 
141 | （其中 $N$ 代表我们见过的所有state的数量），而如果在此基础上再见一次 $s$ ，那么模型更新之后就会近似有
142 | 
143 | $$
144 | p_{\phi'}(s)\approx \frac{N(s)+1}{N+1}
145 | $$
146 | 
147 | 这样，我们可以近似地给出
148 | 
149 | $$
150 | N(s)\approx p_{\phi}(s)\cdot \frac{1-p_{\phi'}(s)}{p_{\phi'}(s)-p_\phi(s)}
151 | $$
152 | 
153 | 这一方法还有一个细节：我们应该怎样选取这个"generative model" $p_{\phi}(s)$ 呢？注意这个模型的目的和普通的generative model不一样，它不是为了生成，而是为了计算概率。因此，GAN之类生成效果很好但无法给出概率的模型并不适用。实际上采用的是一种奇怪的模型，类似于pixel CNN，叫做"CTS"。具体细节可以参考[这篇论文](https://arxiv.org/abs/1606.01868)，它也是Count-Based Exploration方法的开山之作。
154 | 
155 | ### Counting Method 2: Counting with Hash
156 | 
157 | 这一方法的思想是，我们创造一个hash，使得semantic相近的state被映射到接近的hash值。这样，我们就可以用hash值的计数来代替state的计数。
158 | 
159 | hash函数应该如何选取？一个自然的选取是一个autoencoder，使用它的latent space作为hash。而autoencoder也要随着数据的积累逐渐训练。这样的方法也有很好的效果。
160 | 
161 | ### Counting Method 3: Counting with Exemplar Models
162 | 
163 | 这一方法另辟蹊径，我们可以不采用generative model，而是discriminative model。具体地，我们训练一个classifier，判断某个state是否在历史的数据集中出现过。它的训练集是：
164 | 
165 | $$
166 | \mathcal{D}_{s}^{(+)}=\{ \text{all historical states} \},\quad \mathcal{D}_{s}^{(-)}=\{s\}
167 | $$
168 | 
169 | 其中 $s$ 代表现在我们要计数的state。然后，我们用这个classifier $D_s$ 来输出 $D_s(s)$ 。
170 | 
171 | 乍一看，这没道理——这应该总是输出label “ $-$ ” 啊！但是仔细想并不是这样：假设模型具有较差的表达能力（可以通过 regularization 控制），那么会有很多和 $s$ 类似的 historical states 被 encode 到几乎相同的 embedding。如果和 $s$ 类似的数据在 $\mathcal{D}_{s}^{(+)}$ 中出现过 $N(s)$ 次，那么大概会有
172 | 
173 | $$
174 | \Pr(D_s(s)=+)\approx \frac{N(s)}{N(s)+1}
175 | $$
176 | 
177 | 因此我们就可以近似地估计
178 | 
179 | $$
180 | N(s)\approx \frac{\Pr(D_s(s)=+)}{1-\Pr(D_s(s)=+)}
181 | $$
182 | 
183 | 直观上， $s$ 这个state越是“稀有”，那么被判断为“ $-$ ”的概率越大。因此，这一方法也可以给出一个比较好的计数。但问题在于，为了计算每一个state的N，我们都需要重新训练一个模型，这是非常耗时的。
184 | 
185 | 解决办法类似于VAE中的Amortize方法。我们不是对于每一个 $s$ 训练一个网络 $D_s$ ，而是训练一个 $D(\cdot,s)$ ，而计算的从 $D_s(s)$ 变成 $D(s,s)$ 。这一方法在实验上也取得了很好的结果。
186 | 
187 | ### Counting Method 4: Heuristic estimation using errors
188 | 
189 | 还记得在DL中，我们说模型的generalization 问题：测试的数据和原始的数据集分布差距越大，模型的误差就越大。而我们现在刚好就是想判断一个state是否和历史上的数据相似。因此，我们可以用模型的误差来估计这一点。
190 | 
191 | 具体地，我们随便找一个比较feature的函数（也就是说它不能太简单） $f$ ，然后在见过的数据上面拟合一个模型 $f_\theta$ 。这样，我们只需要计算 $f_\theta$ 和 $f$ 的误差（比如mse loss），就可以估计这个state是否在历史上出现过。误差越大，说明这个数据与训练数据集分布差距越大，从而奖励越高。这一方法也被称为**RND(Random Network Distillation)**。
192 | 
193 | 如何选取这样的 $f$？有时候，人们就让它是 next state prediction，也就是说 $f(s,a)=s'$。还有一种巧妙的方式：我们根据模型架构**随机**选取一个参数 $\phi$，并让 $f=f_{\phi}$。不过，具体细节肯定还是要在实验上探讨。
194 | 
195 | ## Thompson Sampling in Deep RL
196 | 
197 | 我们回顾，thompson sampling的方法是说，我们根据现在已有的知识通过模型预测出环境的一些隐藏参数满足的概率分布，然后从中随机采样一组参数，作出决策，再用新的数据重新训练模型。
198 | 
199 | 在multi-arm bandit的模型中，我们预测的就是一组参数 $\theta_1,\cdots,\theta_n$ ，因为我们的目标（也就是reward）完全由它们决定；而怎样将它拓展到一般的情况呢？我们想到，我们应该预测一个Q function。
200 | 
201 | 这样，我们就有一个算法：
202 | 
203 | > Thompson Sampling
204 | 
205 | 重复：
206 | 
207 | 1. 从当前的模型 $p_\phi(Q)$ 采样一个 $Q$ ；
208 | 2. 利用 $Q$ 进行最优决策，跑一个rollout；
209 | 3. 用这个rollout的数据更新 $p_\phi(Q)$ 。
210 | 
211 | 我们如何构造一个函数 $Q$ 的分布呢？最自然的方法就是还是采用ensemble（或者叫作**Bootstrapped**）方法，我们训练 $N$ 个网络 $Q_1,\cdots,Q_N$ ，然后随机从中采样一个。更进一步，我们可以保持前面的提取特征的网络不变，只是加上 $N$ 个 projection head。这样，我们就可以在不增加太多参数的情况下实现ensemble。
212 | 
213 | 你可能会感到奇怪，这个方法看起来就是在Q learning的基础上增加了一个ensemble。但这实际上是关键的：还记得原先的 $\epsilon$ -greedy策略，它的exploration通常是乏力的，因为每一步的explore都相当于是随机游走。但现在我们的explore相当于是更加强大的。
214 | 
215 | > 举个例子：比如一个游戏里，需要连击以获得巨大的奖励。如果每一步随机按照某个方式explore，那么很难找到这个连击的方法。但很有可能，我们的ensemble中的某一个网络刚好学会了这个连击的方法，按照它的Q function来决策，我们就有可能发现这种方式。
216 | 
217 | ## Information Gain in Deep RL
218 | 
219 | 和前一种方法一样，我们也需要想清楚，原先的multi-arm bandit中预测的 $\hat{p}(\theta)$ 现在应该变成什么。（也就是说，计算什么的information gain）。我们有几个选择：
220 | 
221 | - 计算reward $r(s,a)$ 的information gain，也就是构造一个模型学习 $r_\theta(s,a)$ ，然后计算对 $\theta$ 的熵；
222 | - 计算state density $p(s)$ 的information gain
223 | - 计算dynamic $p(s'|s,a)$ 的information gain
224 | 
225 | 具体的实现中可以采用不同的目标，因为我们的目标函数并不重要，重要的是它反映获得这个state $s$ 之后我们增加了多少信息（这一点体现在 $\theta$ 上）。
226 | 
227 | 我们接下来不管选取哪种目标，而是考虑如何改写之前multi-arm bandit那里的information gain。我们有
228 | 
229 | $$
230 | \text{IG}(a)=\mathbb{E}_{p((s,a,s')|a)}\left[\mathcal{H}({p}({\theta}|h))-\mathcal{H}({p}(\theta|h,s,a,s'))\right]
231 | $$
232 | 
233 | 可以证明（见下一讲），
234 | 
235 | $$
236 | \text{IG}(a)=\mathbb{E}_{p((s,a,s')|a)}\left[\text{KL}(p(\theta|h,s,a,s')||p(\theta|h))\right]
237 | $$
238 | 
239 | 而为了方便，我们可以使用单采样进行近似：
240 | 
241 | $$
242 | \text{IG}(a)\approx \text{KL}(p(\theta|h,s,a,s')||p(\theta|h))
243 | $$
244 | 
245 | 这里的 $h$ 代表所有的历史数据。但是这里的 $\theta$ 可能很复杂，甚至可能是神经网络的参数。因此，我们需要再次使用Bayesian方法，训练的不是一个确定的网络，而是一个分布。而这个分布就是把每一个参数变成从一个高斯分布取样，均值和方差都是另外的可训练参数，记为 $\phi$ 。换句话说，贝叶斯网络的目标是
246 | 
247 | $$
248 | p(\theta|h)\approx q(\theta|\phi)
249 | $$
250 | 
251 | > 注意品味这个表达式的意思：左边代表，把历史作为训练集，训练出来可能的网络参数 $\theta$ 的分布；而右边代表，我们拿历史作为训练集，用一种特殊的方法训练出一个“贝叶斯神经网络”，这个网络的参数 $\phi$ 用来预测原先的网络 $\theta$ ，因此给出了一个人为的、近似的分布。
252 | 
253 | 一般来说，贝叶斯网络采取独立假设，即认为每个参数的分布是独立高斯分布
254 | 
255 | $$
256 | q(\theta|\phi)=\prod_i \mathcal{N}(\theta_i|\mu_{\phi,i},\sigma_{\phi,i})
257 | $$
258 | 
259 | 因此，只有在左边真实的分布也是这样的形式时，才能做的比较好。不过实际上，贝叶斯网络的效果还是很好的。
260 | 
261 | 为了训练这样的一个贝叶斯网络，我们最小化
262 | 
263 | $$
264 | \text{KL}(q(\theta|\phi)||p(\theta|h))=\text{KL}\left(q(\theta|\phi)\Bigg|\Bigg|p(h|\theta)\frac{p(\theta)}{p(h)}\right)
265 | $$
266 | 
267 | 一般认为 $p(\theta)$ 取自isotropic Gaussian，而 $p(h)$ 作为和 $\theta$ 无关的常数，在forward KL的计算中可以当作全局常数而被去除。
268 | 
269 | 有了这样的工具，我们就可以成功地给出一个exploration bonus的不错的近似：
270 | 
271 | $$
272 | \text{IG}(a)\approx \text{KL}(q(\theta|\phi')||q(\theta|\phi))
273 | $$
274 | 
275 | 其中 $\phi'$ 代表在加入 $(s,a,s')$ 这一组数据之后的新的参数。
276 | 
277 | 我们可以考虑使用前面 Multi-arm bandit 的方法选择 $a$；但是考虑到此时的 latent $\theta$ 十分复杂，所以计算出每种情况的 $a^\star$ 也十分困难。
278 | 
279 | 但我们只需要把 $\text{IG}(a)$ 这个 bonus 加到 reward 里，就能自然地促进模型的 exploration: 
280 | $$r'\leftarrow r+\alpha\cdot \text{IG}(a)\approx r+\alpha\cdot \text{KL}(q(\theta|\phi')||q(\theta|\phi)).$$
281 | 用 $r'$ 学习 policy $\pi$ 可以使用任何 RL method。
282 | 
283 | 顺带一提，训练 Bayesian network 的 loss 是
284 | $$\mathcal{L}_{\text{Bayesian}}=\text{KL}(q(\theta|\phi)||p(\theta))-\mathbb{E}_{\theta\sim q(\cdot|\phi)}(\log p(\mathcal{D}|\theta)),$$
285 | 其中 $\mathcal{D}$ 是 replay buffer 中随机 sample 的数据。第一项希望 $q(\theta|\phi)$ 尽量靠近 prior distribution；第二项则是要求 $\theta$ 正确地描述这个 environment。
286 | 
287 | 上面的算法也被称为VIME（Variational Information Maximization Exploration）。它在数学上十分强大，但需要的算力也很巨大，因为每一步都需要训练一个贝叶斯网络。
288 | 
289 | ## Summary
290 | 
291 | 除了上面的三种方法之外，还有一些其他的方法。出于篇幅的考虑，我们就不一一介绍了。
292 | 
293 | 可以看到，无论是前面的哪一种方法，为了计算用于exploration 的bonus，都需要在**每一步**训练一个新的模型。因此，我们也可以看到，为了解决exploration这个困难的问题，我们必定是要付出很大的代价的。
294 | 
295 | # Reference Papers
296 | 
297 | 1. [Exploration by Random Network Distillation](https://arxiv.org/abs/1810.12894)（RND）
298 | 2. [A Possibility for Implementing Curiosity and Boredom in Model-Building Neural Controllers](https://ieeexplore.ieee.org/document/6294131)
299 | 3. [Incentivizing Exploration in Reinforcement Learning with Deep Predictive Models](https://arxiv.org/abs/1507.00814)
300 | 4. [Deep Exploration via Bootstrapped DQN](https://arxiv.org/abs/1602.04621)（Bootstrapped Method）
301 | 5. [VIME: Variational Information Maximizing Exploration](https://arxiv.org/abs/1605.09674)（VIME）
302 | 6. [Unifying Count-Based Exploration and Intrinsic Motivation](https://arxiv.org/abs/1606.01868)（CTS）
303 | 7. [\#Exploration: A Study of Count-Based Exploration for Deep Reinforcement Learning](https://arxiv.org/abs/1611.04717)
304 | 8. [EX2: Exploration with Exemplar Models for Deep Reinforcement Learning](https://arxiv.org/abs/1703.01260)（Counting with Exemplar Models）


--------------------------------------------------------------------------------
/lecture/notes-zh/14-exploration_2.md:
--------------------------------------------------------------------------------
  1 | # Exploration without Rewards
  2 | 
  3 | 如果总结前面一讲介绍的exploration方法，我们会发现，我们基本上是在原先的方法上面作出一个修正，依然目标是单一的，但只不过增加一些explore的成分。而现在，我们考虑一个新的视角：我们试着通过unsupervised的方法，在**不给任何reward**的情况下，让模型自己进行“探索”。
  4 | 
  5 | 直观上，这是更容易成功的，因为在没有目标的情况下，探索是一个更加自然的行为。（想想刚会爬行的婴儿，在没有任何人向他提出任何要求的时候，也会自己探索周围的环境。）
  6 | 
  7 | 一个具体的例子：假设我们把机器人扔到一个环境里，这一环境可能有很多东西，比如有一个可以拉开的柜门，有一个可以按下的按钮，等等。在训练的过程中，我们不给机器人任何reward，也不告诉它怎么做，只是让它自己去探索。但最后，我们给出一个对目标(goal state)的描述，然后机器人需要完成这一目标。
  8 | 
  9 | 我们这一讲就来研究如何完成这一任务。
 10 | 
 11 | ## Some Information Theory: Empowerment
 12 | 
 13 | Empowerment 被定义为
 14 | $$\mathcal{I}(s_{t+1};a_t)=\mathcal{H}(s_{t+1})-\mathcal{H}(s_{t+1}|a_t).$$
 15 | 这个函数可以用来作为 exploration 的 objective。直观上理解，第一项凸显了 **diversity**: 在现在的策略下，$s_{t+1}$ 有足够多的可能性；而第二项则体现了 **authority**: 我们想要最小化 $\mathcal{H}(s_{t+1}|a_t)$，也就是说在给出操作 $a_t$ 时，$s_{t+1}$ 能被我们预知到。综合两项，我们得到了一个 objective，既能够促进探索的多样性，也能确保我们做出的操作能够更加 deterministic 地决定下一步，而不会做出无法预测结果的行为。
 16 | 
 17 | ## Imagining Goals
 18 | 
 19 | 怎样在没有reward的时候也完成训练呢？Imagining Goals 方法引入一个 VAE：对于观察到的 state $s$ 通过 encoder $q_\phi$ 到达 latent variable $z$ ，而 $z$ 通过 decoder $p_\theta$ 重建 state $s'$。这样，我们可以通过 $p(z)$ 随机取样一个新的任务 $z_g$，重建成为新的目标 $x_g$ 供 policy model $\pi(a|x,x_g)$ 训练。
 20 | 
 21 | 在这样的构造下，大致流程如下：
 22 | 
 23 | > **Imagining Goals**
 24 | 
 25 | 1. 在latent space随机采样作为目标： $z_g\sim p(z)$ ，然后通过decoder生成目标state： $x_g\sim p_\theta(x_g|z_g)$ ；
 26 | 2. 按照现有的policy $\pi(a|x,x_g)$ 来收集rollout，得到最终的state $\bar{x}$ （最终理想上，我们希望 $\bar{x}=x_g$ ）；
 27 | 3. 利用现在的数据训练 $\pi$；
 28 | 4. 把 $\bar{x}$ 加入数据集来训练VAE。
 29 | 
 30 | 但这一方法有一定的问题。具体来说，因为 VAE 的训练数据是历史上出现过的所有 state，所以从 latent space 中任意取出 $z$ ，decode 出来的目标大概率也和见过的 state 差别不大。解决方案很简单：我们把当前已经探索过，但概率较小的 state 的权重放大。具体来说，对 VAE 的 reconstruction loss 中的 MLE loss 做出如下修改：
 31 | 
 32 | $$
 33 | \log p_\theta(\bar{x})\to p_\theta(\bar{x})^\alpha \cdot \log p_\theta(\bar{x})
 34 | $$
 35 | 
 36 | 其中 $\alpha \in (-1,0)$（因为要放大较小 $p_\theta(x)$ 的贡献，所以 $\alpha<0$；但还需要维持概率的序关系，所以 $\alpha>-1$。当 $\alpha\to -1$ 时，数学上近似于任何点的均匀随机取样）。这一方法被称为 **skew-fit**。
 37 | 
 38 | 事实上，可以证明，这样的更新会使得 $\mathcal{H}(p_\theta(x))$ 增加，所以最终会让 $p_\theta$ 趋于均匀分布。
 39 | 
 40 | > 你也可能会奇怪：我们生成这样的一个均匀分布有什么意义呢？这里必须注意到，这个均匀分布是在所有**valid state**上面的均匀分布！换句话说，如果state以一个图像的形式呈现，那么这个分布并不是对于每一个pixel的均匀分布，而是整体一切在该环境内有意义的图像的均匀分布。
 41 | 
 42 | ### Objective?
 43 | 
 44 | 我们能否给出这一流程等效的objective呢？我们刚才知道（虽然并没有在这里证明）训练VAE的过程相当于是最大化分布的熵
 45 | 
 46 | $$
 47 | \mathcal{H}_{g\in G}(p_{\text{VAE}}(g)).
 48 | $$
 49 | 
 50 | 但与此同时我们也在训练一个 policy $\pi$，使得最终的结果 $\bar x$ 和真正目标 $g$ 尽可能接近。如果我们用 $\tilde{p}(g|\bar{x})$ 表示已知在我们的模型最后停在 $\bar{x}$ 的时候，实际目标是 $g$ 的概率，那么我们的模型就是让这一概率分布的熵尽量小（比如，在最理想情况下，看到 $\bar{x}$ 就可以立刻给出 $g$ 是什么）。将两项整合在一起，整体的目标可以形式上写成
 51 | 
 52 | $$
 53 | J=\mathcal{H}(\tilde{p}(g))-\mathcal{H}(\tilde{p}(g|\bar{x}))=\mathcal{I}(g;\bar x).
 54 | $$
 55 | 
 56 | 这一形式实际上正是之前提到的**互信息**。这一思想十分关键，我们会看到之后的一些方法也可以写为对其它东西的互信息的形式。
 57 | 
 58 | > 互信息的形式有何种深意？我们可以发现，前一项的最大化说的恰是我们的模型要尽量多地探索；而后一项的最小化代表我们的模型“说到做到”，选取了准确的操作。这两者的平衡，正是我们在这一问题假设下所追求的。
 59 | 
 60 | ### Mutual Information Property
 61 | 
 62 | 在继续下面的内容之前，我们先来论证一个关于互信息的性质：
 63 | 
 64 | $$
 65 | \mathbb{E}_{y}\left[\mathcal{H}(p(x))-\mathcal{H}(p(x|y))\right]=\mathbb{E}_y[\text{KL}(p(x|y)||p(x))]
 66 | $$
 67 | 
 68 | 这一性质的证明比较容易，关键在于理解它的思想：获得的信息越大对应着两个分布的差距越大。证明可以直接展开来完成
 69 | 
 70 | $$
 71 | \text{LHS}=\sum_{x,y}p(y)\left(-p(x)\log p(x)+p(x|y)\log p(x|y)\right)
 72 | $$
 73 | 
 74 | $$
 75 | =\sum_{x,y}p(x,y)\log \frac{p(x|y)}{p(x)}=\sum_{y}p(y)\cdot \text{KL}(p(x|y)||p(x))
 76 | $$
 77 | 
 78 | $$
 79 | =\mathbb{E}_y[\text{KL}(p(x|y)||p(x))]=\text{RHS}
 80 | $$
 81 | 
 82 | ## State Marginal Matching (SMM)
 83 | 
 84 | 另外一个新的方法是，我们考虑一个更加一般的问题。我们的目标是最后state marginal是某个给定的 $p^\star(s)$ ，也就是
 85 | 
 86 | $$
 87 | p_\pi(s)\approx p^\star(s)
 88 | $$
 89 | 
 90 | 其中 $p_\pi(s)$ 代表按照当前的策略，在任何时候到达 $s$ 的概率。这样，只要把 $p^\star(s)$ 选取为均匀分布，就可以实现exploration；而如果稍微调整这一分布，就可以实现某种“定向”的exploration。
 91 | 
 92 | 那么如何实现这样复杂的操作呢？直观上，我们可以类比上一讲提到的count-based exploration的方法，给每一个state一个bonus：
 93 | 
 94 | $$
 95 | \tilde{r}(s)=\log p^{\star}(s)-\log p_\pi(s),
 96 | $$
 97 | 
 98 | 这样就有
 99 | 
100 | $$
101 | \mathbb{E}_{s\sim p_\pi(s)}[\tilde{r}(s)]=-\text{KL}(p_\pi(s)||p^\star(s)).
102 | $$
103 | 
104 | 因此，一个合适的训练可以使得 $p_\pi(s)$ 逼近 $p^\star(s)$ 。这一方法被称为“State Marginal Matching”。
105 | 
106 | 此外，需要注意现在的假设中，训练时我们得不到reward，因此总的“人造reward” $\tilde{r}(s)$ 中没有原来的reward项。同时，注意 $p_\pi(s)$ 这一分布并非显然，一般需要用一个模型来拟合。考虑了这些后，我们可以得到一个训练流程：不断重复
107 | 
108 | 1. 根据 $\tilde{r}(s)$ 来训练 $\pi$ ；
109 | 2. 根据 $\pi$ 获得的轨迹数据来update $p_\pi(s)$ 。
110 | 
111 | 但这一方法有一个比较隐秘的问题。具体的细节很复杂，可以参考[这里](https://arxiv.org/abs/1906.05274)。但我们可以给出一个比较直观的解释。
112 | 
113 | ![](./assets/14-1.jpeg)
114 | 
115 | 如图，假设我们的 $p^\star$ 是橙色的均匀分布，一开始的时候， $p_{\pi_1}$ 和 $\pi_1$ 在左下角的区域， $p_{\pi_1}$ 近似了 $\pi_1$ 的state marginal（图中的绿色），而蓝色代表某一条具体的轨迹。这样，根据reward的选择，我们就会特别倾向于走向一些没有被绿色覆盖的部分，比如运动到图示 $\pi_2$ 的区域。然后，1,3,4这三个区域就又没有被覆盖，因此我们可能到达它们中的任何一个。这样，我们发现，我们的策略可能一直在乱跳，而不是完全地覆盖。
116 | 
117 | 从另一个角度，第一步中训练 $\pi$ 的 $\tilde r$ 是和 $\pi$ 有关的，但它并没有和 $\pi$ 一起更新。也就是说，第一步的训练中，RL algorithm 并没有“意识到”自己的 objective 和自己有关！所以才会出现上面一段所说的 "tail-chasing" scenario——对着一个固定的 $\tilde r_{\pi'}$ 训练 $\pi$，只会让 $\pi,\pi'$ 离得越来越远，而不会趋于平均。
118 | 
119 | 解决这个问题也很简单，只需要对历史状态取平均即可：
120 | 
121 | > **State Marginal Matching**
122 | 
123 | 1. 重复
124 |     1. 根据 $\tilde{r}(s)$ 来训练 $\pi$ ；
125 |     2. 根据**历史上所有的轨迹数据**来update $p_\pi(s)$ 。
126 | 2. **不返回最后的 $\pi$ ，而是返回历史上所有 $\pi$ 的平均**。
127 | 
128 | 在Game theory上证明了（**Nash 均衡**理论），这样可以保证这一方法收敛，并且到达state marginal matching的目标。实验上，这一方法确实可以在target state distribution均匀的时候做到非常均匀的explore。
129 | 
130 | 最后，我们来讨论一下SMM的数学上的含义。还记得它的objective是KL divergence的最小化。因此，对于我们希望模型均匀explore的情况，我们的objective就变成了熵：
131 | 
132 | $$
133 | \mathcal{H}(p_\pi(g))
134 | $$
135 | 
136 | 其中 $g$ 还是代表任何一个goal state。可以看到，这和之前Imagining Goals的思想还是类似的，我们还是限定一个所有可能的goal的集合，然后让模型的最终行为可以均匀地到达这一集合中的任何一个goal。
137 | 
138 | ## Theoretical Consideration: why maximize entropy?
139 | 
140 | 一个有意思的问题是，为什么我们前面给出的两种方法，都最后落实到了最大化熵上？实际上，我们即将指出，最大化熵是理论上最好的方案。
141 | 
142 | 我们想像这样一种情景：回到开始的例子，机器人被放入一个环境中，里面有很多技能可能需要练习。在训练的过程中，机器人并不知道最后的目标，也得不到任何reward。但最后，测试的时候我们选取**机器人表现最差的任务**进行测试。在这样的情况下，机器人最理想的训练过程中goal的分布 $p(g)$ （ $g\in G$ ）是什么呢？
143 | 
144 | 当然是 uniform distribution！在不知道 test task 的时候，探索所有的可能性是我们能做的最优的事！这就等价于最大化 entropy。
145 | 
146 | # Exploration by Skills
147 | 
148 | 除了让我们的模型探索不同goal之外，我们还可以让模型探索不同的skills。直观上，一个skill给出的是一系列action，完成一个小的步骤，因此，**skill的学习往往可以比goal的学习更具有普适性**。举个例子：假设现在要求机器人到达门口，但不碰到房间里的某个区域（比如那个区域有一个坑）。如果使用goal学习的方式，机器人就只知道goal state是门口，而不知道怎么避开坑。但使用skill学习的方式，我们可以先学习一个skill，让机器人避开坑，然后再学习一个skill，让机器人走到门口。
149 | 
150 | ![](./assets/14-2.png)
151 | 
152 | 具体地，不同的skill一般对应着state space中不同的区域，而它们最好能覆盖整个state space，如上面的图所示。如何实现这一点呢？我们选取我们的policy为 $\pi(a|s,z)$ ，其中 $z$ 代表某个skill。我们的目标则是最大化
153 | 
154 | $$
155 | J=\sum_z \mathbb{E}_{s\sim \pi(s|z)}[\log p_D(z|s)]
156 | $$
157 | 
158 | 其中 $p_D$ 代表某种discriminative model，它也在训练，理想状况下它应该可以通过state就确定下来在哪一个skill上，因为我们提到不同的skill对应的是state space的不同区域。换句话说，上面的目标就是让我们的policy对于每一个skill都作出不同的action，到达不同的state。
159 | 
160 | 训练过程中，discriminative model和policy都在训练；但和GAN不同，这里的两个模型并非对抗，而是相互辅助。就像下面的图那样，开始D可能只是随机画出一条分界线，但policy就会随着学会，最后自然分开不同skill对应的action。
161 | 
162 | ![](./assets/14-3.jpeg)
163 | 
164 | 实验上，这一方法得到的效果非常好玩：比如，在Cheetah环境中，有的skill使得机器人向前跑，有的使得机器人向后跑，有的使得机器人跳跃，等等。
165 | 
166 | 可以[证明](https://arxiv.org/abs/1802.06070)，上面的目标实际上在最大化互信息
167 | 
168 | $$
169 | I=\mathcal{H}(p(z))-\mathcal{H}(p(z|s))=\mathcal{I}(z;s).
170 | $$
171 | 
172 | ### Takeaway
173 | 
174 | 总结下来，通过最大化 **outcome**（$s$）和 **task**（可以是目标状态 $g$，或者是 skill $z$）的**互信息**，我们能够让模型做到很好的 exploration。
175 | 
176 | 在不知道会给出怎样测试任务的时候，尽可能到达更多的 state 是个好选择；并且，如果假设在 test 的时候会测试表现最差的 task，最大化 outcome 的熵是最优的选择。
177 | 
178 | # Reference Papers
179 | 
180 | 1. [Visual Reinforcement Learning with Imagined Goals](https://arxiv.org/abs/1807.04742)（介绍Imagining Goals）
181 | 2. [Skew-Fit: State-Covering Self-Supervised Reinforcement Learning](https://arxiv.org/abs/1903.03698)（介绍Skew-Fit）
182 | 3. [Efficient Exploration via State Marginal Matching](https://arxiv.org/abs/1906.05274)（介绍SMM）
183 | 4. [Provably Efficient Maximum Entropy Exploration](https://arxiv.org/abs/1812.02690)（variant of SMM）
184 | 5. [Diversity is All You Need](https://arxiv.org/abs/1802.06070)（介绍Skills-conditioned）


--------------------------------------------------------------------------------
/lecture/notes-zh/15-offline-RL_1.md:
--------------------------------------------------------------------------------
  1 | # Offline Reinforcement Learning 
  2 | 
  3 | 实验上，人们会注意到一个问题：RL的方法相比于DL而言，generalize的能力还是较差。比如，DL的图片模型在训练后可以在测试集的完全未知的图片上也有很好的效果，但RL的模型基本上集中在一个具体的环境，也就是说只能在这一环境上有效。那么，假设我们希望某个模型（比如说）可以做多种家务，应该怎么办呢？
  4 | 
  5 | 一个自然的想法就是，我们增加数据集的大小。但问题在于，前面给出的RL算法中数据集是由agent和环境交互产生的，如果这个数量过大，和环境交互的代价就会很大。回想在DL的时候，比如有人收集好了一个数据集，所有人都可以用；但RL中，每个人都要重新和环境交互。
  6 | 
  7 | 正是为此，**Offline RL**（也叫做batch RL或者fully off-policy RL）被引入：它的目标是，只要一个人用某种sample policy $\pi_\beta$ 收集好了一个数据集，之后所有人都可以用这个数据集来训练自己的模型。如图所示。
  8 | 
  9 | ![](./assets/15-1.png)
 10 | 
 11 | 同时，必须注意，这里的 $\pi_\beta$ **并非**前面我们在imitation learning中说的expert policy，而可以是任意的policy。
 12 | 
 13 | > 举一个例子，对于开车的问题，数据集可能是若干数据的混合：比如让一个高手开车采集出几条轨迹，再让普通人采集一些轨迹，甚至还可能让一些不太会开车的人采集一些轨迹。但是Offline RL需要做的是，不管数据整体多么拉，我们都可以学到这个数据集中最好的policy。
 14 | 
 15 | Offline RL有什么用？除了前面说到的，可以共用数据之外，在某些特殊领域其也有重要作用。比如，让agent为病人推荐药物来治疗，这样的问题显然不能让agent在真实环境中随便尝试。此时，我们就可以希望Offline RL能利用历史上的数据，做出一个不错的决策。此外，Offline RL还可以用作initialization：实验上发现在Offline RL的训练基础上稍微用online RL进行一下finetune，效果就会特别好。
 16 | 
 17 | ## Common Mistakes
 18 | 
 19 | 一些常见的错误：
 20 | 
 21 | 1. Q: Offline RL和Imitation Learning是一样的？
 22 |     - A：不是。比如上面的例子，Imitation Learning在理想的情况下学会的是人们平均的开车能力，而我们应该期待好的Offline RL学到更好的开车能力。
 23 | 
 24 | 2. Q: Offline RL是不是相当于提取出数据集中价值大的部分，然后做Imitation Learning？
 25 |     - A：不完全正确。我们确实希望，Offline RL可以“取其精华”，但是这个过程并不是简单的提取。
 26 | 
 27 |         一个著名的用Offline RL完成的实验是：在一个环境里有一个抽屉，里面有一个物体，agent需要打开抽屉，拿出物体。训练数据里面虽然有打开抽屉的轨迹和拿出物体的轨迹，但不存在任何一个完整的，既包含打开抽屉又包含拿出物体的轨迹。但是，Offline RL可以学会把这两部分的数据结合起来，完成这个完整的动作。
 28 | 
 29 |         这一现象也叫做“**Stitching**”，是好的Offline RL算法的一个重要特点。
 30 |     
 31 | 3. Q: 听你说的这么动人，那Offline Learning是不是在给任意的数据集的情况下都可以学到很好的policy？
 32 |     - A：不是。Offline RL的实际目的并不可能是学习**optimal policy**，而只能是**在给定数据集的基础上尽可能学习达到最好的效果**。很容易想像，如果数据集特别烂，那神仙来了也没有办法，更别提optimal policy了。但实际上对于正常的数据集，我们都希望Offline RL可以学到很好的policy。
 33 | 
 34 | ## Important Issue: Distribution Shift
 35 | 
 36 | 在了解了Offline RL的概念后，你可能立刻想到，我们为什么不能简单地直接用一个off policy的算法（比如Q learning）完成这一任务呢？原则上，完全没有任何问题。但实践上，这完全失败。其原因是不易察觉的。
 37 | 
 38 | 实验上发现一个现象：在失败的Offline RL中有着惊人的over-confidence：比如在Q-learning中，算法可能计算出来的Q value是 $10^{20}$ 量级，但实际上的return只有-100。这并不是巧合，而是有深意的。
 39 | 
 40 | 为了阐述这一问题，还是考虑驾车的例子：我们的数据集里面全部是人开车的数据。即使有水平比较差的人，他们至少也不会作出疯狂的事情。比如，在笔直的公路上，没有人会突然90度转弯。
 41 | 
 42 | 但是在Q-learning中，我们的模型并不知道这一点。它在公路上时，有可能sample到一个90度转弯的动作。但是，这个动作在数据集中是不存在的，甚至和数据集任何数据都没有相似之处，所以模型对其的估计可以任意离谱。这很有可能导致模型相信，在直路上转弯具有很大的Q value。
 43 | 
 44 | 那为什么这一问题在online的方法中没问题？因为，我们在online方法中（即便是off policy），总是有和环境的交互。这样，我们可以在模型对某个动作的估计过于离谱时，及时纠正。而在Offline RL中，我们没有这个机会。
 45 | 
 46 | 像前面说的90度转弯，就叫做 **out-of-distribution(OOD)** 的action。许多这样的action在一起就很可能导致模型的失败。什么时候会有很多这样的action？就是当我们测试的分布和训练的分布不一样（或者差距很大）的时候。这就叫做**distribution shift**。
 47 | 
 48 | 此外，注意必须区分out-of-distribution和out-of-sample的区别。out-of-distribution是离谱得离谱的action，而out-of-sample的action不一定很离谱，只是没有被采样到。比如，在路口，Q-learning的采样可能只采样了右转10度和20度的两组数据，此时右转30度就是out-of-sample的action；而在此突然熄火，就是out-of-distribution的action。
 49 | 
 50 | ### Reason of Over-Confidence
 51 | 
 52 | 前面，我们只是指出了这一问题，但为什么模型会过度自信？我们来研究一下。
 53 | 
 54 | 在经典的DL中，有一种方法叫做**adversarial attack**：比如，对于图片分类的模型，我们可以对图片梯度下降，只增加一个小小的扰动，在人看不出来图片任何变化的情况下改变模型的分类结果。这实际上并不是模型的锅，因为我们的训练目标本身就是
 55 | 
 56 | $$
 57 | L=\mathbb{E}_{x\sim D}\left[(f(x)-y)^2\right]
 58 | $$
 59 | 
 60 | 因此，所谓模型“好”，指的是平均意义上好，**而非每个都很好，更不是对任意 $x\in \mathbb{R}^n$ 都好**。因此，只要有意地搞一个 $x$ ，很容易“攻破”这一模型。更进一步地，我们甚至不需要使用梯度的方法，只要我们选取一个极端的数值，比如
 61 | 
 62 | $$
 63 | x_0=\arg\max f(x)
 64 | $$
 65 | 
 66 | 那么 $(f(x_0)-y)^2$ 就很可能比较大，因为神经网络的各种峰值很可能是自己fit过去的，很难准确。
 67 | 
 68 | 但你会问，现在没有人来攻击我们的模型啊？实际上，是Q-learning的算法在攻击自己。具体地，我们的目标是
 69 | 
 70 | $$
 71 | L=\mathbb{E}_{s,a\sim \pi_\beta(s,a)}\left[\left(Q(s,a)-\left(r(s,a)+\gamma \mathbb{E}_{a'\sim \pi(a'|s')}[Q(s',a')]\right)\right)^2\right]
 72 | $$
 73 | 
 74 | 可以料想，如果 $\pi_\beta\approx \pi$ ，那么就相当于之前说的，在一个分布上训练，并在同样的分布上选取测试数据（在这里，“测试数据”实际上指的是下一轮Q backup的target，因此 $\pi_\beta$ 相当于“训练集”， $\pi$ 相当于“测试集”）。也就是说，对于 $\pi_\beta\approx \pi$ 的情况，或者之前虽然off-policy但仍然online的情况，问题都不大。
 75 | 
 76 | 但是现在 $\pi_\beta$ 固定不动，而 $\pi$ 的选取恰恰是
 77 | 
 78 | $$
 79 | \pi = \arg\max_{\pi}\mathbb{E}_{a'\sim \pi(a'|s')}[Q(s',a')]
 80 | $$
 81 | 
 82 | 这就好比是前面说的argmax攻击，会导致我们的模型不准确，或者几乎肯定比真值大很多！因此，几轮迭代下，Q value就会越来越大。
 83 | 
 84 | > Q：等等，你别蒙我，这不就是之前Q learning中也遇到的问题吗？我们不是可以用double Q learning解决吗？
 85 | >
 86 | > A: 不。这和前面online Q learning遇到的问题虽然相似，但原理上并不相同。原来的那个问题只是针对argmax本身；而现在的问题相当于在原来argmax问题的基础上进一步加剧。
 87 | >
 88 | > 原来的偏大对应着是正常训练一个神经网络（在同样的分布上面训练和测试），神经网络不可避免存在误差，而argmax定向地放大了这个误差；而现在的偏大对应着是在一个分布上训练，在另一个分布上测试，神经网络的误差就会比之前大得多，argmax之后的误差也就会比之前大得多。
 89 | >
 90 | > 因此，现在虽然使用double Q learning可以缓解这一问题，但并不能完全解决根本性的，由于两个分布不一致所造成的误差。为了解决这两个分布不同的distribution shift问题，我们必须使用一些新的方法。
 91 | 
 92 | # Importance Sampling in Offline RL
 93 | 
 94 | 一个试图解决distribution shift的方法是使用importance sampling。我们简要介绍一下这一方法，它解决了一些问题，但也有不足。比如，我们就采用policy gradients。
 95 | 
 96 | 回忆在[off-policy policy gradients](./5-policy_grad.md#off-policy-policy-gradients)中，我们给出的importance sampling的公式：
 97 | 
 98 | $$
 99 | \nabla_\theta J(\theta)=\mathbb{E}_{\tau\sim p_{\pi_{\beta}}(\tau)}\left[\prod_{t=1}^T\frac{{\pi_\theta}(a_t|s_t)}{{\pi_{\beta}}(a_t|s_t)}
100 | \left(\sum_{t=1}^T \nabla_{\theta}\log \pi_\theta(a_t|s_t)\cdot \hat{Q}_t(\tau)\right)\right]
101 | $$
102 | 
103 | 和之前一样，我们也希望使用first-order approximation，以避免连乘积带来的梯度爆炸或消失的问题。在数学上可以做严格的恒等变形，给出
104 | 
105 | $$
106 | \nabla_\theta J(\theta)=\sum_{t=1}^T\mathbb{E}_{\tau \sim p_{\beta}(\tau)}\left[\nabla_{\theta}\log \pi_\theta(a_t|s_t)\cdot \prod_{t''=1}^{t-1}\frac{{\pi_\theta}(a_{t''}|s_{t''})}{{\pi_{\beta}}(a_{t''}|s_{t''})}\sum_{t'=t}^T\left(\gamma^{t'-t} r(s_{t'},a_{t'})\prod_{t''=t}^{t'}\frac{{\pi_\theta}(a_{t''}|s_{t''})}{{\pi_{\beta}}(a_{t''}|s_{t''})}\right)\right]
107 | $$
108 | 
109 | 注意两个importance sampling的因子：第一个
110 | 
111 | $$
112 | \prod_{t''=1}^{t-1}\frac{{\pi_\theta}(a_{t''}|s_{t''})}{{\pi_{\beta}}(a_{t''}|s_{t''})}
113 | $$
114 | 
115 | 代表按照 $\pi_\theta$ ，在 $t-1$ 步走到 $s_t$ 的概率 $p_{\pi_\theta}(s_t)$ 和按照 $\pi_\beta$ ，在 $t-1$ 步走到 $s_t$ 的概率 $p_{\pi_\beta}(s_t)$ 的不同造成的修正。而第二个
116 | 
117 | $$
118 | \prod_{t''=t}^{t'}\frac{{\pi_\theta}(a_{t''}|s_{t''})}{{\pi_{\beta}}(a_{t''}|s_{t''})}
119 | $$
120 | 
121 | 代表后面的 $\hat{Q}$ 表达式中，因为 $\pi_\theta$ 和 $\pi_\beta$ 不同而造成的修正。
122 | 
123 | 相比于第一个因子，第二个因子对梯度爆炸的问题贡献还算小，因为只要 $\gamma$ 略小，就可以避免梯度的爆炸。但是第一个因子就没办法了。回想一下，原来的时候，我们宣称第一个因子可以扔掉：但这是因为当时对于off-policy方法， $p_{\pi_\theta}$ 和 $p_{\pi_\beta}$ 虽然不同但也比较接近。而现在，没有任何道理二者是接近的，因为 $\pi_\beta$ 是采集数据的那个人随便选的。
124 | 
125 | 因此，**在offline的场景中，我们不再能去除连乘积的影响**。实际上，理论上证明，要避免这个连乘积必须使用value-based method。
126 | 
127 | 但即使有连乘积，我们也可以通过某些办法来避免之前所说的梯度爆炸问题。下面就介绍两种这样的方法。
128 | 
129 | 为了简单起见，我们只考虑 **OPE(Offline Policy Evaluation)** 问题，也就是给定一个policy $\pi$ ，Offline地（即只利用用 $\pi_\beta$ 采样出的数据）给出
130 | 
131 | $$
132 | V^{\pi}(s_0)=\mathbb{E}_{\tau\sim p_{\pi}(\tau|s_0)}\left[\sum_{t=0}^{T}\gamma^t r(s_t,a_t)\right]
133 | $$
134 | 
135 | ## Doubly Robust Estimator
136 | 
137 | Doubly Robust Estimator 是一个统计学中的概念，大概是指给出一个估计，其中包含两个拟合的函数（比如神经网络）。一旦其中一个拟合的函数不准确，另一个拟合的函数可以弥补这一错误。
138 | 
139 | 在进入统计的概念之前，我们首先推导一个递推的关系。对于一条轨迹，从单采样的角度来看，
140 | 
141 | $$
142 | V^{\pi}(s_0)=\sum_{t=0}^{T}\gamma^t  r(s_t,a_t)\cdot \prod_{t'=0}^t\frac{\pi(a_{t'}|s_{t'})}{\pi_\beta(a_{t'}|s_{t'})}
143 | $$
144 | 
145 | 然后，如果令数列
146 | 
147 | $$
148 | \bar{V}^{T-t}=\frac{\pi(a_t|s_t)}{\pi_\beta(a_t|s_t)}(r(s_t,a_t)+\gamma \bar{V}^{T-t-1})
149 | $$
150 | 
151 | 且 $\bar{V}^{-1}=0$ ，那么就有 $V^{\pi}(s_0)=\bar{V}^{T}$ 。
152 | 
153 | Doubly Robust Estimator则是从递推出发，给出一个近似的方式：
154 | 
155 | $$
156 | \bar{V}^{T-t}=\hat{V}(s_t)+\frac{\pi(a_t|s_t)}{\pi_\beta(a_t|s_t)}(r(s_t,a_t)+\gamma \bar{V}^{T-t-1}-\hat{Q}(s_t,a_t))
157 | $$
158 | 
159 | 其中 $\hat{V}(s_t)$ 和 $\hat{Q}(s_t,a_t)$ 就是用来近似的函数。这个公式的意义在于，如果 $r(s_t,a_t)+\gamma \bar{V}^{T-t-1}-\hat{Q}(s_t,a_t)$ 就是0，那么后面的连乘积贡献会非常小；而即使后面不是0,也部分地缓解了连乘积的问题。Doubly Robust的帮助是使得这一递推表达尽量接近于原来严格的公式。
160 | 
161 | 当然，具体的细节也会比较复杂，可以参考[这里](https://arxiv.org/abs/1511.03722)。
162 | 
163 | ## Marginalized Importance Sampling
164 | 
165 | 这一思想更加直接：我们考虑一个全部（加权）marginalize掉的分布 $p_{\pi}(s,a)$ 和 $p_{\pi_\beta}(s,a)$ ，也就是
166 | 
167 | $$
168 | p_{\pi}(s,a)=(1-\gamma)\sum_{t=0}^T\gamma^t p_{\pi}(s_t=s,a_t=a)
169 | $$
170 | 
171 | 这样，我们就可以近似地有
172 | 
173 | $$
174 | V^{\pi}(s_0)=\frac{1}{1-\gamma}\mathbb{E}_{s,a\sim p_{\pi_\beta}(s,a)}\left[\frac{p_{\pi}(s,a)}{p_{\pi_\beta}(s,a)}r(s,a)\right]
175 | $$
176 | 
177 | 接下来，我们可以给出一个关于 $w(s,a):=\frac{p_{\pi}(s,a)}{p_{\pi_\beta}(s,a)}$ 的递推式：首先，考虑最后一步，得到
178 | 
179 | $$
180 | p_{\pi}(s,a)=(1-\gamma)p_0(s)\pi(a|s)+\gamma\sum_{s',a'}p(s|s',a')\pi(a|s)p_{\pi}(s',a')
181 | $$
182 | 
183 | 这样，我们就自然有
184 | 
185 | $$
186 | w(s,a)p_{\pi_\beta}(s,a)=(1-\gamma)p_0(s)\pi(a|s)+\gamma\sum_{s',a'}p(s|s',a')\pi(a|s)w(s',a')p_{\pi_\beta}(s',a')
187 | $$
188 | 
189 | 实验上，我们可以把左边和右边放入某个divergence，然后采样估计（参见[原论文](https://arxiv.org/abs/2002.09072)），并梯度下降给出最好的 $w$ 。有了 $w$ ，我们就可以立刻估计出 $V^{\pi}(s_0)$ ，从而在Offline的情况下完成policy evaluation。
190 | 
191 | # Offline RL: Old Methods
192 | 
193 | 我们在这一节介绍一些早期的Offline RL方法，它们虽然在实际中不太常用，但是对于理解Offline RL的概念还是很有帮助的。在这些早期方法中，没有神经网络，一切都是通过**线性回归**的方式进行近似。
194 | 
195 | 值得一提地，在以线性回归为基础的Offline RL研究中，distribution shift问题并不是很突出。但这种方法也注定无法应用到如今的复杂环境中，而且更是没有解决distribution shift的问题。
196 | 
197 | ## Linear Model
198 | 
199 | 我们把总共的 $|S|\times |A|$ 个(state,action) pair通过某种专家知识encode到一个 $K$ 维向量中。这个固定的 $(|S|\cdot|A|)\times K$ 矩阵称为feature matrix，记为 $\Phi$ 。然后，我们以后不再研究每个state或action本身，只研究这个向量。我们需要：
200 | 
201 | - **reward model**：给定一个 $K$ 维向量，用一个线性函数(对应的矩阵为 $w_r$ )输出reward。这一过程希望 $\text{OneHot}(s,a)\Phi w_r\approx r(s,a)$ 。
202 | - **transition model**：给定一个 $K$ 维向量，用一个线性函数(对应的矩阵为 $P_\phi$ )输出下一个state的向量。假设在(state,action)对上面的transition是 $P^\pi$ （注意，这和policy有关），那么我们希望 $\text{OneHot}(s,a)\Phi P_\phi\approx  \text{OneHot}(s,a) P^\pi\Phi$ ，也就是 $\Phi P_{\phi}=P^{\pi}\Phi$ 。
203 | - **(Q) value model**：给定一个 $K$ 维向量，用一个线性函数(对应的矩阵为 $w_q$ )输出value function。注意因为我们一开始不知道value function，所以这一过程并不是简单的线性回归，而是需要求解递推。这一过程希望 $\text{OneHot}(s,a)\Phi w_q\approx Q^\pi(s,a)$ 。
204 | 
205 | 有了它们，我们就可以做OPE(Offline Policy Evaluation)了。具体地，我们先给出矩阵形式的递推：
206 | 
207 | $$
208 | Q^{\pi}=r+\gamma P^{\pi}Q^{\pi}
209 | $$
210 | 
211 | 其中， $r$ 代表把每一个 $(s,a)$ 的reward拼起来的列向量。虽然我们可以解出
212 | 
213 | $$
214 | Q^{\pi}=(1-\gamma P^{\pi})^{-1}r
215 | $$
216 | 
217 | 但这样的形式对应的计算量比较大，因此我们希望使用feature space将其压缩到比较小的维度。也就是
218 | 
219 | $$
220 | \Phi w_q\approx \Phi w_r+\gamma \Phi P_{\phi} w_q
221 | $$
222 | 
223 | 此时，我们就近似地有
224 | 
225 | $$
226 | w_q\approx (1-\gamma P_{\phi})^{-1}w_r
227 | $$
228 | 
229 | 再利用线性回归的最优解
230 | 
231 | $$
232 | P_{\phi}=(\Phi^T\Phi)^{-1}\Phi^T P^{\pi}\Phi,\quad w_r=(\Phi^T\Phi)^{-1}\Phi^T r
233 | $$
234 | 
235 | 代入，消去引入的中间变量 $P_{\phi}$ 和 $w_r$ ，就有
236 | 
237 | $$
238 | w_q\approx (1-\gamma (\Phi^T\Phi)^{-1}\Phi^T P^{\pi}\Phi)^{-1}(\Phi^T\Phi)^{-1}\Phi^T r=(\Phi^T\Phi-\gamma \Phi^T P^{\pi}\Phi)^{-1}\Phi^T r
239 | $$
240 | 
241 | 这一表达式称为Q function的**least-square temporal difference (LSTD)**。Temporal Difference (TD)指的就是Bellman equation的误差项，而least-square就是指我们使用了线性回归的方法。利用这一表达式，我们就可以根据transition和reward的数据（这由环境提供）来估计q value function，从而实现policy的offline evaluation。
242 | 
243 | 当然，如果我们不把action embed进入feature matrix，那么也可以得到正统的LSTD，也就是估计value function。这一方法相比于上面我们介绍的基于Q function的方法有许多缺陷，我们就不详细展开了。之后我们只考虑这种基于Q function的方法。
244 | 
245 | ## Case of Infinite State-Action Space
246 | 
247 | 我们上面的方法基于把一切的(state,action) pair通过一个feature matrix映射到一个有限维的向量空间中。如果我们的state和action是连续的，那么这一方法就不再适用。但稍微做一修改即可：我们的 $\Phi$ 不再是对于每一个可能的(state,action)给出一个向量，而是对**数据集中的**(state,action)给出向量。
248 | 
249 | 接下来，对于别的方面，我们只需要再做一些很小的修改即可。不妨假设我们只需要对数据集内部的 $s$ （或者 $(s,a)$ ）计算value（对于数据集外部的 $s$ ，我们确实无法保证任何东西），那么我们就还可以使用前面得到的
250 | 
251 | $$
252 | w_q=(\Phi^T\Phi-\gamma \Phi^T P^{\pi}\Phi)^{-1}\Phi^T r
253 | $$
254 | 
255 | 进行计算。但略需要注意的是，我们并不能得到真正的transition matrix $P^{\pi}$ ，而只能得到在数据集中的transition matrix。不过这也容易解决，注意到
256 | 
257 | $$
258 | (P^{\pi}\Phi)(s,a)=\sum_{s',a'}P^{\pi}(s',a'|s,a)\cdot \Phi(s',a')\approx \mathbb{E}_{s',a'\sim P^{\pi}(\cdot|s,a)}[ \Phi(s',a')]
259 | $$
260 | 
261 | 因此，我们可以通过单采样来估计这一表达式，也就是说，对于数据集的一组 $(s,a,s')$ ，可以近似地有
262 | 
263 | $$
264 | (P^{\pi}\Phi)(s,a)\approx \Phi(s',a')
265 | $$
266 | 
267 | 其中， $a'$ 应该从policy $\pi$ 中采样。这样我们就可以完全按照 $w_q$ 的表达式进行计算，唯一的误差来自于采样。
268 | 
269 | > 这里可能的一个误解： $\Phi$ 不是一个“字典”，记录大量(state,action)对应的feature；而是说， $\Phi(s,a)$ 对于任意的 $(s,a)$ 都是可以直接计算的，因为feature是专家知识或者一个pretrain好的神经网络给出的。只不过，在推导中，为了避免 $\Phi$ 是无穷维这一困难，我们只取数据集中的feature作为 $\Phi$ 。
270 | >
271 | > 因此，虽然从 $\pi$ 中采样得到的 $a'$ 不一定使得 $(s',a')$ 落入offline training的数据集中，但我们依然可以计算 $\Phi(s',a')$ 。
272 | 
273 | 更进一步地，我们既然可以完成policy的评估，我们也就很容易进行offline policy improvement。这样的方法称为LSPI(Least Square Policy Iteration)。其方法为：
274 | 
275 | > **LSPI Algorithm**
276 | 
277 | 重复：
278 | 
279 | 1. 利用当前的policy和固定不动的数据集计算 $w_q=(\Phi^T\Phi-\gamma \Phi^T \Phi')^{-1}\Phi^T r$ ，其中 $\Phi'$ 的构造方式是：如果 $\Phi$ 的某一行对应着 $(s,a)$ 的feature，那么 $\Phi'$ 的那一行就对应着 $(s',a')$ 的feature。（每次第二步更换 $\pi$ 之后，都要重新计算一次 $\Phi'$ 。）
280 | 2. 利用 $w_q$ ，更新： $\pi(s)\leftarrow \arg\max_a [\Phi(s,a)w_q]$
281 | 
282 | 这就介绍完了早期的Offline RL方法。当然，还是如前面所说，它们无法从根本上解决distribution shift的问题。如何解决这一问题，我们将在下一节讨论。
283 | 
284 | # Reference Papers
285 | 
286 | 1. [Offline Reinforcement Learning: Tutorial, Review, and Perspectives on Open Problems](https://arxiv.org/abs/2005.01643)（对Offline RL的一个全面的介绍）
287 | 2. [Stabilizing Off-Policy Q-Learning via Bootstrapping Error Reduction](https://arxiv.org/abs/1906.00949)（解决distribution shift问题）
288 | 3. [Doubly robust off-policy value evaluation for reinforcement learning](https://arxiv.org/abs/1511.03722)（Doubly Robust Estimator）
289 | 4. [Consistent on-line off-policy evaluation](https://arxiv.org/abs/1702.07121)（Marginalized Importance Sampling）


--------------------------------------------------------------------------------
/lecture/notes-zh/16-offline-RL_2.md:
--------------------------------------------------------------------------------
  1 | # Offline Reinforcement Learning 2: Addressing Distribution Shift
  2 | 
  3 | 就如我们之前所说，distribution shift的问题是offline RL中最大的挑战之一。之前给出的方法都不足以很好地解决这一问题，而本讲介绍的方法都是比较前沿的，在这方面做得很好的一批算法。
  4 | 
  5 | ## Introduction: Starting from a Naive Method
  6 | 
  7 | 我们先从最简单的思路出发。还记得，我们的问题是在update过程
  8 | 
  9 | $$
 10 | L_Q=\mathbb{E}_{s,a,s'\sim \pi_{\beta}}\left[(Q(s,a)-(r(s,a)+\gamma \mathbb{E}_{a'\sim \pi(\cdot|s')}[Q(s',a')]))^2\right]
 11 | $$
 12 | 
 13 | $$
 14 | \pi\leftarrow \arg\max_{\pi}\mathbb{E}_{s\sim \pi_\beta}\left[\mathbb{E}_{a\sim \pi(\cdot|s)}[Q(s,a)]\right]
 15 | $$
 16 | 
 17 | 中，因为 $\pi$ 和 $\pi_\beta$ 相差过大，所以很难保证 $Q$ 值的准确性。一个直接的想法便是，我们不如规定 $\pi$ 和 $\pi_\beta$ 不能差太远？比如，令
 18 | 
 19 | $$
 20 | \text{KL}(\pi||\pi_\beta)\le \epsilon
 21 | $$
 22 | 
 23 | 然后在这一约束下做前面的优化。
 24 | 
 25 | 抛开实现方法不谈，人们也要埋怨这一方法不够好：
 26 | - 一方面，这一方法太保守，以至于很难学到比较好的 $\pi$ 。比如说， $\pi_\beta$ 本身比较差，那么 $\pi$ 就算再好也被约束所限制了上界。
 27 | - 另一方面，这一方法的保守程度还不够。扪心自问，KL divergence小，一定代表着估计准确吗？就如同再精确的模型在训练集上也有错误，即使我们把两个分布做到KL divergence为0，也仍然会有错误的Q value的存在。而这样的错误就像滚雪球，使得Q value越来越大。
 28 | 
 29 | 因此，我们发现，KL divergence并不是我们想要的刻画这两个分布距离的方式。相反，我们希望的实际上是所谓**support constraint**：
 30 | 
 31 | $$
 32 | \pi(a|s)> 0 \text{ only when } \pi_\beta(a|s)\ge \epsilon
 33 | $$
 34 | 
 35 | 这一约束明显地更加合理，因为它反映了我们的真实需要——我们不希望policy通过做那些数据里面没有的action来exploit $Q$ 网络的漏洞。
 36 | 
 37 | 总而言之，对于约束而言，KL divergence不够合理，而support constraint虽然合理但更像一种理论上的结构，很难被实现。对于真实的方法，我们需要在这两者之间取得平衡。
 38 | 
 39 | ## First Trial: Using KL Divergence
 40 | 
 41 | 我们先介绍使用KL divergence的方法。我们只需要更改一下policy的update rule：
 42 | 
 43 | $$
 44 | \pi\leftarrow \arg\max_{\pi}\left(\mathbb{E}_{s\sim \pi_\beta}\left[\mathbb{E}_{a\sim \pi(\cdot|s)}[Q(s,a)]\right]-\lambda \mathbb{E}_{s\sim \pi_\beta}\left[\sum_a \pi(a|s)\log \frac{\pi(a|s)}{\pi_\beta(a|s)}\right]\right)
 45 | $$
 46 | 
 47 | 也就是
 48 | 
 49 | $$
 50 | \pi\leftarrow \arg\max_{\pi}\left(\mathbb{E}_{s\sim \pi_\beta}\left[\mathbb{E}_{a\sim \pi(\cdot|s)}[Q(s,a)+\lambda \log \pi_\beta(a|s)]+\lambda \mathcal{H}(\pi(\cdot|s))\right]\right)
 51 | $$
 52 | 
 53 | 一种方法是直接对这一目标进行优化。但我们实际上还可以再进一步，求导给出理论最优的 $\pi^\star$ （虽然由于神经网络表现力问题，这一 $\pi^\star$ 不一定可以被实现）：
 54 | 
 55 | $$
 56 | Q(s,a)-\lambda (\log \pi^\star(a|s)+1-\log \pi_\beta(a|s))-\mu=0
 57 | $$
 58 | 
 59 | （ $\mu$ 为一常数）也就是说
 60 | 
 61 | $$
 62 | \pi^\star(a|s)=\frac{1}{Z(s)}\pi_\beta(a|s)\exp\left(\frac{A(s,a)}{\lambda}\right)
 63 | $$
 64 | 
 65 | 其中， $A(s,a)$ 可以是 $Q(s,a)$ 减去任何一个不依赖action的值。因此，我们可以修改训练目标，这一新的训练目标的最优值仍为 $\pi^{\star}$ ：
 66 | 
 67 | $$
 68 | L_{\text{AWAC}}(\theta)=-\mathbb{E}_{s,a\sim \pi_\beta}\left[(\log \pi_\theta(a|s))\cdot\dfrac{1}{Z(s)}\exp\left(\frac{A(s,a)}{\lambda}\right)\right]
 69 | $$
 70 | 
 71 | 这一方法长得很像 behavior cloning 的 objective
 72 | $$
 73 | L_{BC}=\mathbb{E}_{s,a\sim \pi_\beta}\left[-\log \pi_\theta(a|s)\right],
 74 | $$
 75 | 只不过对每个数据 $(s,a)$ 增加了一个权重。直观上说，这个方法实际上是一种 behavior cloning，但更注重学习那些 $A(s,a)$ 较大的 behavior——也就是更优秀的 behavior。因此，它被称为 **AWAC (Advantage-Weighted Actor-Critic)**。
 76 | 
 77 | 其中的 $\lambda$ 被称为 **temperature**。 $\lambda$ 作为拉格朗日乘子，可以控制 $\pi$ 和 $\pi_\beta$ 的距离：数学上，当 $\lambda$ 趋于无穷，就变成了 MLE training， $\pi$ 会趋向于 $\pi_\beta$ ；当 $\lambda$ 趋于 0，$\pi$ 会趋向于argmax policy。
 78 | 
 79 | 实验上，可以证明，这样的objective相比于原来的objective表现更好。（它们在理论上是等价的，但实际上因为神经网络表现力有限，就有了优劣之分。）
 80 | 
 81 | 我们来总结一下AWAC算法的过程：
 82 | 
 83 | > **AWAC Algorithm**
 84 | 
 85 | 重复：
 86 | 
 87 | 1. 使用 $L_{\text{AWAC}}(\theta)=-\mathbb{E}_{s,a\sim \pi_\beta}\left[(\log \pi_\theta(a|s))\cdot\dfrac{1}{Z(s)}\exp\left(\dfrac{A_{\phi}(s,a)}{\lambda}\right)\right]$ 来训练 $\theta$ ；
 88 | 2. 使用 $L_Q(\phi)=\mathbb{E}_{s,a,s'\sim \pi_{\beta}}\left[(Q_\phi(s,a)-(r(s,a)+\gamma \mathbb{E}_{a'\sim \pi_\theta(\cdot|s')}[Q_\phi(s',a')]))^2\right]$ 来训练 $\phi$。（$L_Q$ 就是常规的 actor-critic loss）
 89 | 
 90 | ## Improvement
 91 | 
 92 | 我们还可以改进AWAC算法。第一步是核心步骤，比较好地限制住了 $\pi$ 和 $\pi_\beta$ 的距离，因此没什么可以改变的；但第二步略有缺陷：我们训练 $Q$ 的目的是提升我们的policy，但训练数据集一直在 $\pi_\beta$ 上面，这可能导致 $Q$ “抓不住重点”。
 93 | 
 94 | 比如，设想我们在训练开车，有一个很难转的弯，大部分人都转两次才能转过去；但数据集里也有几个专家，他们一次就转过去了。假设转两次的reward较低，而转一次的reward较高。那么，想一想：我们的policy是否能学会转一次？
 95 | 
 96 | > 实际上是很困难的，因为我们的数据集里有很多转两次的例子，所以 $Q$ 网络最赚的方式是拟合这些数据拟合的比较好，因此其不见得能意识到转一次的reward高很多。这样，我们的policy从Q网络中学习时，也不见得能学到转一次的action。
 97 | 
 98 | 因此，我们需要改进 $Q$ 网络的训练方式。我们需要想办法**对于每一个state，找到专家的action**。为此，与其在update中取期待值（对应着某种平均），我们不如取一个统计学中的**expectile**：
 99 | 
100 | $$
101 | \text{Expectile}_{\tau}[p(x)]=\arg\min_{y}[\text{ExpectileLoss}_{\tau,p(x)}(y)]
102 | $$
103 | 
104 | $$
105 | \text{ExpectileLoss}_{\tau,p(x)}(y)=\mathbb{E}_{x\sim p(x)}[(x-y)^2\cdot l^{\tau}(x-y)]
106 | $$
107 | 
108 | 其中函数 $l^{\tau}$ 定义为：
109 | 
110 | $$
111 | l^{\tau}(z)=\begin{cases}\tau,&z> 0\\ 1-\tau,&z\le 0\end{cases}
112 | $$
113 | 
114 | 这样，当 $\tau\to 1$ 的时候，我们就相当于取了一个最大值；而通过调整 $\tau$ ，我们可以保证 $y$ 是分布 $p(x)$ 中的较大值，但又不out of distribution。
115 | 
116 | 为了把expectile加入我们的训练过程，我们可以引入value function。这样，对 $V$ 的训练是：
117 | 
118 | $$
119 | L_V(\psi)=\mathbb{E}_{s\sim \pi_\beta}\left[\text{ExpectileLoss}_{\tau,Q_\phi(s,a)(a\sim \pi_\beta)}(V_\psi(s))\right]
120 | $$
121 | 
122 | 这里 $Q_\phi(s,a)(a\sim \pi_\beta)$ 指的是， $a\sim \pi_\beta(a|s)$ 的时候， $Q_\phi(s,a)$ 的分布。注意这里expectile loss没有用 $\pi$ ，因为我们希望从数据集中找到专家的action。对 $Q$ 的训练则是常规的：
123 | 
124 | $$
125 | L_Q(\phi)=\mathbb{E}_{s,a,s'\sim \pi_\beta}\left[(Q_\phi(s,a)-(r(s,a)+\gamma V_\psi(s')))^2\right]
126 | $$
127 | 
128 | 可以看到，这一过程中，没有明确地出现policy——这是因为这一方法通过expectile来改进策略，而非通过之前基于policy的方法。正是因为如此，这一方法被称为**IQL(Implicit Q-Learning)**。
129 | 
130 | > 当然，如果硬是要说，IQL“等效地”也有一个policy，使得 $\mathbb{E}_{a'\sim \pi(s')}[Q_{\phi}(s',a')]=V_{\psi}(s')$ ，也就是这个policy作出的效果和expectile一致。
131 | >
132 | > 这也说明了为何IQL会解决之前distribution shift的问题：在DQN中，我们相当于直接取argmax作为policy，这样如果Q 存在一些错误，那么policy就可能利用这些错误。而在IQL中，我们通过 $\tau$ 很大但不是1,实现了“在support中取最大”，从而保证policy不会被Q的少数错误所影响。
133 | 
134 | 当然，最后我们在eval的时候也需要学习一个policy。注意同样此时不能简单地取argmax，因此我们还是可以用AWAC的方法：
135 | 
136 | $$
137 | L_{\text{AWAC}}(\theta)=-\mathbb{E}_{s,a\sim \pi_\beta}\left[(\log \pi_\theta(a|s))\cdot\exp\left(\frac{A_{\phi}(s,a)}{\lambda}\right)\right]
138 | $$
139 | 
140 | 并取 $A_{\phi}$ 为value和Q的差。当然，这里的value function相当于改变一个常数（baseline），因此不是关键；关键在于 $\lambda$ 还是保证了policy不要太远离数据集 $\pi_\beta$ 。总结一下，IQL的算法过程如下：
141 | 
142 | > **IQL Algorithm**
143 | 
144 | 1. 重复：
145 |     1. 使用 $L_V(\psi)=\mathbb{E}_{s\sim \pi_\beta}\left[\text{ExpectileLoss}_{\tau,Q_\phi(s,a)(a\sim \pi_\beta)}(V_\psi(s))\right]$ 来训练 $\psi$ ；
146 |     2. 使用 $L_Q(\phi)=\mathbb{E}_{s,a,s'\sim \pi_\beta}\left[(Q_\phi(s,a)-(r(s,a)+\gamma V_\psi(s')))^2\right]$ 来训练 $\phi$ ；
147 | 
148 | 2. 最后（在eval时）：
149 |     1. 使用 $L_{\text{AWAC}}(\theta)=-\mathbb{E}_{s,a\sim \pi_\beta}\left[(\log \pi_\theta(a|s))\cdot\exp\left(\frac{A_{\phi}(s,a)}{\lambda}\right)\right]$ 来训练 $\theta$ 。
150 | 
151 | 可以看到，IQL通过使用expectile，相当于采用了support constraint的思想。可以料想到其表现会比AWAC更好。
152 | 
153 | ## Another Method: Controlling Q Values
154 | 
155 | 最后的一个方法另辟蹊径，我们来想：既然问题在于Q值可能会出现一些错误，而且偏大，我们能不能刻画这一点并把它纳入我们的训练目标呢？
156 | 
157 | 这一方法的关键思想是，我们首先搞一个虚假的policy $\mu$ ，专门用来攻击 $Q$ ，取出其最大值。然后，我们训练 $Q$ ，使得这个虚假的policy作出的action对应的Q值尽可能小。也就是
158 | 
159 | $$
160 | \arg\min_Q \max_{\mu}\mathbb{E}_{s\sim D,a\sim \mu(a|s)}[Q(s,a)]
161 | $$
162 | 
163 | 但稍微思考一下，会发现这并不行。我们制定目标的时候，总是要考虑网络是否会想到投机取巧的方法。比如这里， $Q$ 网络可能会发现，只要把所有的Q都下降就可以了。因此，一个更好的方式是，取
164 | 
165 | $$
166 | L_{Q,\text{additional}}=\alpha\cdot\left[\max_{\mu}\mathbb{E}_{s\sim D,a\sim \mu(a|s)}[Q(s,a)]-\mathbb{E}_{s\sim D,a\sim \pi_\beta(a|s)}[Q(s,a)]\right]
167 | $$
168 | 
169 | 其中后面的期望直接在数据集上面计算。理论上可以证明，这样的训练目标可以保证 $Q$ 的值比不加入这一项（ $\alpha=0$ ）的时候，期望下更加的小。这样，可以保证 $Q$ 值比较保守——不会被policy错误地利用。这一方法就叫做**CQL(Conservative Q-Learning)**。此时，在训练policy的时候，就可以直接采用最普通的DQN方法，比如直接取argmax。CQL的算法过程如下：
170 | 
171 | > **CQL Algorithm**
172 | 
173 | 重复：
174 | 
175 | 1. 使用 $L_Q(\phi)=\mathbb{E}_{s,a,s'\sim \pi_{\beta}}\left[(Q_\phi(s,a)-(r(s,a)+\gamma \mathbb{E}_{a'\sim \pi_\theta(\cdot|s')}[Q_\phi(s',a')]))^2\right]+L_{Q,\text{additional}}$ 来训练 $\phi$ ；
176 | 2. 训练policy： $L_{\pi}(\theta)=-\mathbb{E}_{s\sim \pi_\beta}\left[\mathbb{E}_{a\sim \pi_\theta(\cdot|s)}[Q(s,a)]\right]$
177 | 
178 | 不过，我们还可以再次改进这一方法。现在的这个 $\mu$ 会变成argmax策略，因此有点像“打地鼠”：有一个地方 $Q$ 值冒起头来，就把它打下去。这样虽然理论上没问题，但给人的感觉不够好，在训练的时候也不够稳定。因此，有人提出，我们加入一个对 $\mu$ 的限制，避免它是这样的one-hot policy：
179 | 
180 | $$
181 | L_{Q,\text{additional}}=\alpha\cdot\left[\max_{\mu}\left(\mathbb{E}_{s\sim D,a\sim \mu(a|s)}[Q(s,a)]+\lambda \mathbb{E}_{s\sim D}[\mathcal{H}(\mu)]\right)-\mathbb{E}_{s\sim D,a\sim \pi_\beta(a|s)}[Q(s,a)]\right]
182 | $$
183 | 
184 | 其中 $\mathcal{H}$ 的引入保证了 $\mu$ 这一分布不能太窄。此时，我们可以给出一个明确的 $\mu$ 的表达式：
185 | 
186 | $$
187 | \mu(a|s)=\frac{\exp(Q(s,a)/\lambda)}{\sum_{a'}\exp(Q(s,a')/\lambda)}
188 | $$
189 | 
190 | 其中 $\lambda$ 称为CQL temperature。代入，可以发现，
191 | 
192 | $$
193 | L_{Q,\text{additional}}=\alpha\cdot\left[\lambda \mathbb{E}_{s\sim D}\left[\log\left(\sum_{a}\exp\left(\frac{Q(s,a)}{\lambda}\right)\right)\right]-\mathbb{E}_{s\sim D,a\sim \pi_\beta(a|s)}[Q(s,a)]\right]
194 | $$
195 | 
196 | 这样的CQL方法的好处在于无需计算 $\mu$ ，速度更快。实际上也一般采取这种方法。
197 | 
198 | > **小贴士**
199 | >
200 | > 你知道`torch.logsumexp`吗？
201 | 
202 | # Model Based Offline RL
203 | 
204 | 可以发现，前面的方法的问题都来自于，我们只有一个固定的dataset，而且是由一个不一定好的policy $\pi_\beta$ 生成的。因此我们自然地想，能否把model-based方法引入，这样我们就可以获得更多的数据了！
205 | 
206 | 我们先回忆一下model-based方法是怎样工作的：我们收集一些数据，在上面训练一个model，然后利用环境的数据和model生成的新数据，进行一些off-policy的方法的训练。直观地想，这一方法完全可以用到现在offline的场景中：我们根据事先收集好的、固定的数据来训练环境model，然后利用这个model生成新的数据，进行policy的训练。
207 | 
208 | 但这有着很大的问题。在之前讨论model-based方法时候，我们都没有太关注下面图中所示的“闭环”，因为那是理所当然的。但现在，从 $\pi$ 到环境的交互这一条线断开了！设想我们的model犯了一个错误，认为在直路上向左转，得到的state才是向前的。那么，没有人来纠正它。
209 | 
210 | > 一个关键的对model-based方法的看法是，**model-based 方法中的model解答的是“what if”的问题**。 换句话说，model的用处在于，假设我的policy想在某个点上作出决策，数据不够的情况下model能做一些预测。这固然是好的，但是如果预测是错误的，那就还不如没有了。
211 | 
212 | ![](./assets/16-1.png)
213 | 
214 | 如何解决这一问题？简单来说，我们还是估计model的uncertainty。之前在普通的model-based方法中我们也提到过[估计uncertainty的必要性](./11-model-based.md#uncertainty)。但和之前不同，原来我们只是希望消除这个不确定性（比如，通过ensemble等方式）；但现在，我们连这也不敢了，因为任何微小的错误都是致命的。因此，我们应该**避免去往model不确定的地方**。
215 | 
216 | 带着这一思想，假设我们可以估计模型的uncertainty $u(s,a)$ ，那么我们就定义一个新的reward:
217 | 
218 | $$
219 | \tilde{r}(s,a)=r(s,a)-\lambda u(s,a)
220 | $$
221 | 
222 | 理论上可以证明，用这样的reward，有
223 | 
224 | $$
225 | \eta (\pi)\ge \eta(\pi')-2\lambda \epsilon (\pi')
226 | $$
227 | 
228 | 其中 $\pi$ 代表我们这样训练出来的policy，而 $\pi'$ 是任意一个policy。 $\epsilon$ 代表着这一policy走过的 $(s,a)$ 中，model的uncertainty的期望。这样，比如，把 $\pi'$ 取为最好的policy，那么就是说：**只要最好的policy走过的地方我们的model比较确定，那么我们的policy就可以接近最好的policy**。
229 | 
230 | 除此之外，我们还有一些其他的方法。
231 | 
232 | ## COMBO: Conservative Model-Based Offline RL
233 | 
234 | 和CQL一样，以C开头的算法强调保守：我们还是去打击一些东西。这一方法惩罚model生成的数据上面的 $Q$ value：
235 | 
236 | $$
237 | L_{Q,\text{additional}}=\beta\cdot(\mathbb{E}_{s,a\sim \hat{p}}[Q(s,a)]-\mathbb{E}_{s,a\sim \pi_\beta}[Q(s,a)])
238 | $$
239 | 
240 | 其中， $\hat{p}$ 代表我们的环境模型。
241 | 
242 | > Q: 你胡说八道。按照你的这个loss来，那就导致model生成的 $(s,a)$ 上面Q值都很小。这样，我们还要model干什么？
243 | >
244 | > A: 别急，并非如此。别忘了我们除了这个additional的loss之外还有一个正常的loss。设想现在model在某一个 $(s,a)$ 上面表现的很好（也就是和真实的环境数据相似），那么因为 $\beta$ 只是很小的数，正常的loss会主导，这样 $Q(s,a)$ 还是正常的数值。而只有当model预测了一个比较不合理的 $s'$ 的时候， $Q$ function之前的update完全没有作用，这个additional的loss才会变得主要，使得这一处的 $Q$ 变小。
245 | 
246 | 其实，这一方法背后的逻辑有点像之前exploration中提到的[RND](./13-exploration_1.md#counting-method-4-heuristic-estimation-using-errors)：通过某个网络（这里是 $Q$ 网络）找到模型不擅长的地方。然后，这一方法就可以减少这里的Q 值。
247 | 
248 | ## Trajectory Transformer: Model-based Method without Policy
249 | 
250 | 我们最后介绍一个比较新奇的方法：用transformer来当作model。
251 | 
252 | 如果仔细想一想最开始我们提出的问题：
253 | 
254 | > Offline RL的问题陈述是：我们有一组各种地方收集来的数据，可能没有一个是完整能完成任务的；但我们希望我们的model能做"Stitching"，集众家之长，来完成目标。
255 | 
256 | 我们会发现，sequence model实际上就可以完成这一件事——在autoregressive model的生成过程中，我们一般取probability argmax进行next token prediction，或者进行beam search。但现在，假设对于sequence
257 | 
258 | $$
259 | s_0 \to a_0 \to s_1 \to a_1 \to \cdots
260 | $$
261 | 
262 | 我们在每一次决定 $a$ 的时候，都采用reward而非probability来作为计算的标准；而每一次决定 $s$ 的时候，采用probability，那么这样的autoregressive生成就可以很好地刻画model-based policy optimization的过程。当然，贪心方法大概不够优，所以可能需要beam search。
263 | 
264 | 不仅如此，这一方法不再具有distribution shift的问题，因为language model本质就是取一个最大概率的sequence，所以这样的序列一定是在原来分布中概率比较高的。
265 | 
266 | 这就是trajectory transformer的基本思想。当然，实际应用中，还需要考虑很多细节，比如如何把连续的state和action离散化（不能简单地全部离散化，否则出来的个数是随着维度指数增大的；需要把每一个state拆为 $d_s$ 个token才可以。），等等。同时，它的代价也很明显，我们需要训练一个很大很大的模型，才能完成之前比较小的任务。
267 | 
268 | # Summary and Practical Tips
269 | 
270 | Offline RL迄今为止还是一个十分前沿的领域。因此，不同的问题的解决方法也没有固定的说法。这门课的老师[Sergey Levine](https://scholar.google.com/citations?user=8R35rCwAAAAJ)作为这一领域的专家，给出了如下的建议：
271 | 
272 | - 对于纯粹offline训练：
273 |     - 使用CQL或IQL
274 | - 对于offline pretrain和online finetune的这样的过程：
275 |     - 使用AWAC或IQL
276 |     - CQL不好，因为即使对于online，其policy也会被限制，不能达到最好的结果。
277 | - COMBO和Trajectory Transformer的使用一般在model比较简单的时候（比如，humanoid这样的简单的物理环境，trajectory transformer可以十分精确地预测100轮和环境的交互）。
278 | 
279 | 最后再回到之前的问题，我们为什么要研究offline RL？其实，在实际研究中，人们最重视的往往是数据集的重用性。比如，人们研究一种新提出的算法，在某个环境下失败了。随后，人们给出了这一算法的改进，那么原先失败的数据还可以拿过来重用，作为一个不错的initialization；而新的数据又可以被加入数据集。这样，每一点努力都没有被浪费。
280 | 
281 | # Reference Papers
282 | 
283 | 1. [AWAC: Accelerating Online Reinforcement Learning with Offline Datasets](https://arxiv.org/abs/2006.09359)
284 | 2. [Offline Reinforcement Learning with Implicit Q-Learning](https://arxiv.org/abs/2110.06169)
285 | 3. [Conservative Q-Learning for Offline Reinforcement Learning](https://arxiv.org/abs/2006.04779)
286 | 4. [MOPO: Model-Based Offline Policy Optimization](https://arxiv.org/abs/2005.13239)（惩罚模型不确定度的方法）
287 | 5. [COMBO: Conservative Offline Model-Based Policy Optimization](https://arxiv.org/abs/2106.11407)
288 | 6. [Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2104.13694)（Trajectory Transformer）
289 | 7. [Actionable Models: Unsupervised Offline Reinforcement Learning of Robotic Skills](https://arxiv.org/abs/2104.07749)（Unsupervised learning，基于goal-conditioned）


--------------------------------------------------------------------------------
/lecture/notes-zh/17-RL-theory.md:
--------------------------------------------------------------------------------
  1 | # Reinforcement Learning Theory Basics
  2 | 
  3 | ![](./assets/17-1.jpeg)
  4 | 
  5 | 在这一讲中，我们介绍基础的一些RL理论知识。首先，仅针对本讲，做一声明：
  6 | - 忽略global constant。比如，一个方法成功的概率是 $1-\delta$ ，和是 $1-2\delta$ ，没有区别。
  7 | - 不加说明的范数都是 $\infty$ 范数，也就是分量的最大值。
  8 | 
  9 | ## What we expect
 10 | 
 11 | 首先提问：我们希望从RL理论中获得什么？
 12 | 
 13 | > Student: 我们希望给出一个定量的刻画！比如说，我们设法计算出，discount是0.98的时候最终的policy获得的Q值的期望值是discount是0.99的时候的0.871倍。
 14 | >
 15 | > Teacher: 这基本不可能。你这好比用普通物理的知识分析一个实际汽车中的汽油发动机的输出功率：需要考虑的因素太多了，你很难确保你的结果是精确的。
 16 | >
 17 | > Student: 那……我们希望论证算法可以达到一个界？比如，给定一个算法，我们可以论证在任意的MDP问题下，以超过0.99的概率，它最终到达的Q值不差于最优策略的0.98倍？
 18 | >
 19 | > Teacher: 还是不可能。现代的RL方法，连收敛性都无法保证，更别提这种强的界了。
 20 | >
 21 | > Student: 唉，那我们到底能获得什么呢？
 22 | >
 23 | > Teacher: 我们可以获得一些**定性**的结论。比如说，某个算法跑出的策略和最优策略的差距，随着state space的增大是变大还是变小？随着horizon（定义为 $\frac{1}{1-\gamma}$ ）又是怎么变的？此外，还必须强调，即使是这样，我们也无法保证像你所说的，“对任意的MDP问题都给出保证”。实际上，我们往往需要**做很强的假设**。
 24 | 
 25 | 也就是说，RL理论的关键问题在于在很强的假设下分析极度简化的问题，并给出一种渐近成立的界。这也与普通的ML theory类似。
 26 | 
 27 | 更加specific地，我们一般研究两种问题：
 28 | 
 29 | - exploration：给定某个exploration方法，我们能否保证在 $|S|,|A|,T=\frac{1}{1-\gamma}$ 的**多项式**次和环境的交互后，到达某种几乎访问了所有有意义的(state,action)对的状态？
 30 | - policy learning：假设我们可以完全explore，那么学到的policy在真实的环境中表现如何？    
 31 |     - 这里，“完全explore”就是一种strong assumption，有几种阐述：
 32 |         - 有一个oracle，可以告诉我们任何一个(state,action)对的下一个state的概率分布；
 33 |         - 或者略弱一些，对于每一个(state,action)，我们都至少访问了 $N$ 次，得到了 $N$ 个next state的样本。
 34 | 
 35 | 注意通过这样，我们把policy learning和exploration解耦合，进而可以分别讨论。
 36 | 
 37 | # Problem 1: Policy Evaluation with Model
 38 | 
 39 | 我们先来研究这第一个问题。这一问题的假设是，我们对于每一个 $(s,a)$ ，从环境的真实分布 $P(s'|s,a)$ 中采样了 $N$ 个 $s'$ ，并使用统计的方法给出一个离散的categorical分布 $\hat{P}(s'|s,a)$ 。这一个 $\hat{P}$ 就相当于我们的"model"。
 40 | 
 41 | 然后，我们使用这一model进行 $Q$ value的训练。这一部分的算法不是我们关心的内容，因此假设可以学到准确的Q value，也就是bellman方程严格被满足。同时，利用这一Q value（记作 $\hat{Q}$ ），我们还可以得到一个最优的策略 $\hat{\pi}^{\star}$ ；而对于原先的环境，也有一个最优的策略 $\pi^{\star}$ 。我们从而定义下面的六个量：
 42 | 
 43 | | Environment | $P$ (real)| $\hat{P}$ (model) |
 44 | | --- | --- | --- |
 45 | | Policy= $\pi^{\star}$ | $Q^{\star}$ | $\hat{Q}^{\pi^\star}$ |
 46 | | Policy= $\hat{\pi}^{\star}$ | $Q^{\hat{\pi}^{\star}}$ | $\hat{Q}^{\star}$ |
 47 | | Arbitrary Policy $\pi$ | $Q^{\pi}$ | $\hat{Q}^{\pi}$ |
 48 | 
 49 | 比如说，第一行第二列的意思是，使用 $\hat{P}$ 训练出来一个Q function，然后把 $\pi^{\star}$ 在这一Q function下的Q值记作 $\hat{Q}^{\pi^\star}$ 。
 50 | 
 51 | 在这一情况下，我们关心几个问题：
 52 | 
 53 | - 对于某个任意的policy $\pi$ ， ${Q}^{\pi}$ 和 $\hat{Q}^{\pi}$ 差多远？
 54 | - ${Q}^{\star}$ 和 $\hat{Q}^{\star}$ 相差多远？
 55 | - （终极问题，也是最有用的问题） $Q^{\star}$ 和 $Q^{\hat{\pi}^\star}$ 差多远？注意这一问题就是说我们的model最终能表现的多好。
 56 | 
 57 | 可以看到，前面两个问题相当于为最后一个问题做了铺垫。我们先研究第一个问题。
 58 | 
 59 | ## Distance of $Q^{\pi}$ and $\hat{Q}^{\pi}$
 60 | 
 61 | 首先，我们看一看我们有哪些已知的工具。根据假设，有
 62 | 
 63 | $$
 64 | Q^{\pi}(s_t,a_t)=r(s_t,a_t)+\gamma \mathbb{E}_{s_{t+1}\sim P(s_{t+1}|s_t,a_t)}[\mathbb{E}_{a_{t+1}}[Q^{\pi}(s_{t+1},a_{t+1})]]
 65 | $$
 66 | 
 67 | 我们可以试着把它写为矩阵的形式。为此，引入两个矩阵： $P\in \mathbb{R}^{(|S|\cdot |A|)\times |S|}$ 对应着环境转移的概率分布；而 $\Pi\in \mathbb{R}^{|S|\times(|S|\cdot |A|)}$ 对应着policy $\pi$ 的概率分布（即对action取期望）。这样，我们可以写出
 68 | 
 69 | $$
 70 | Q=r+\gamma P\Pi Q
 71 | $$
 72 | 
 73 | 这样，我们就有 $Q^{\pi}=r+\gamma P\Pi Q^{\pi}$ 和 $\hat{Q}^{\pi}=r+\gamma \hat{P}\Pi \hat{Q}^{\pi}$ 。由此可以得到
 74 | 
 75 | $$
 76 | Q^{\pi}-\hat{Q}^{\pi}=\gamma \left({P}\Pi {Q}^{\pi}-\hat{P}\Pi {Q}^{\pi}\right)+\gamma \left(\hat{P}\Pi Q^{\pi}-\hat{P}\Pi \hat{Q}^{\pi}\right)
 77 | $$
 78 | 
 79 | 这给出
 80 | 
 81 | $$
 82 | Q^{\pi}-\hat{Q}^{\pi}=(1-\gamma \hat{P}\Pi)^{-1}\cdot \gamma (P-\hat{P})\Pi Q^{\pi}
 83 | $$
 84 | 
 85 | 此时，如果我们估计 $Q^{\pi}$ 和 $\hat{Q}^{\pi}$ 的无穷范数（即最大绝对值），我们就有
 86 | 
 87 | $$
 88 | ||Q^{\pi}-\hat{Q}^{\pi}||_{\infty}\le \gamma \cdot \frac{1}{1-\gamma} ||\Pi Q^{\pi}||_{\infty}\cdot ||P-\hat{P}||_{\infty}
 89 | $$
 90 | 
 91 | 其中，我们使用了以下比较直观的结论：如果矩阵 $A$ 满足 $||A||_{\infty}\le 1$ （把矩阵的无穷范数定义为每一行的绝对值和的最大值；例如这里的 $\hat{P}\Pi$ 每一行的和都是1），那么
 92 | 
 93 | $$
 94 | ||(1-\gamma A)^{-1}v||_{\infty}\le \frac{||v||_{\infty}}{1-\gamma},\forall v
 95 | $$
 96 | 
 97 | （这里略去证明，比如一个可能的方法是把左边展开为级数）。接下来，我们只需要考察如何估计前面不等式右边剩下的两个无穷范数。首先，对于 $\Pi Q^{\pi}$ ，注意到它具有reward的量纲，因此我们必须有一个同样量纲的东西才能bound住它。不妨设 $r(s,a)$ 具有上界 $R_m$ ，那么可以给出
 98 | 
 99 | $$
100 | (\Pi Q^{\pi})(s)=\sum_{a}\pi(a|s)Q^{\pi}(s,a)\le \max_{a}[Q^{\pi}(s,a)]\le \frac{R_m}{1-\gamma}
101 | $$
102 | 
103 | 而对于 $P-\hat{P}$ ，这一项实际上是在说，我们采样估计出来的model和真实情况比较相似。因此，我们需要使用一些估计采样的bound。某个比较高级的concentration inequality给出，对于 $(s,a)$ ，采样得到的分布与真实分布之间的total variation distance满足
104 | 
105 | $$
106 | \Pr\left\{\sum_{s'}\left|\frac{N(s',s,a)}{N(s,a)}-P(s'|s,a)\right|\ge \sqrt{|S|}\epsilon \right \}\le 2e^{-2N(s,a)\epsilon^2}
107 | $$
108 | 
109 | 这也就是说，
110 | 
111 | $$
112 | \left|\left|\hat{P}-P\right|\right|_{\infty}\le  \sqrt{|S|}\cdot \sqrt{\frac{\log \frac{1}{\delta}}{N}}\quad \text{w.p. } 1-\delta 
113 | $$
114 | 
115 | （其中 $N$ 是所有 $N(s,a)$ 的最小值）。这样，我们把所有这些结果总结起来，就可以得到：
116 | 
117 | $$
118 | ||Q^{\pi}-\hat{Q}^{\pi}||_{\infty}\le \gamma \cdot \frac{1}{1-\gamma}  \frac{R_m}{1-\gamma}\cdot \sqrt{\frac{|S|\cdot\log \frac{1}{\delta}}{N}} \quad \text{w.p. } 1-\delta
119 | $$
120 | 
121 | 这就给出我们第一个问题的界。我们稍微停下来一下来分析这一问题，可以发现， $||Q^{\pi}-\hat{Q}^{\pi}||_{\infty}$ 代表我们的model给出的policy $\pi$ 的评估和环境真正评估的差距。可以首先看到，它对于horizon $\frac{1}{1-\gamma}$ 是二次的，这很符合我们的直觉（还记得早在第二讲，我们就发现到一步一步这样的走，必定是 $O(T^2)$ 的误差，因此model-based方法采集出来的人造轨迹就必定是 $O(T^2)$ ）。其次，可以发现，这个误差和 $N$ 是根号反比的，因此增加model的训练的样本数，可以显著减小这个误差。
122 | 
123 | ## Solving the Full Problem
124 | 
125 | 令人惊讶的是，现在其实我们就可以完整解决之前提出的问题了！看起来，之前提出的三个问题我们只分析了其中一个，但第二个其实立刻可以估计出来。回顾一下，这一问题是说
126 | 
127 | > ${Q}^{\star}$ 和 $\hat{Q}^{\star}$ 相差多远？这里，前者 $Q^{\star}$ 是在真实环境下训练出来的最优policy的Q值，而后者 $\hat{Q}^{\star}$ 是在model下训练出来的最优policy的Q值。
128 | 
129 | 这相当于比较 $\sup f(x)$ 和 $\sup g(x)$ 之间的差距，而我们考虑的第一个问题相当于给出了对于同一个 $x$ ， $f(x)$ 和 $g(x)$ 的差距有多大。因此，可以给出
130 | 
131 | $$
132 | ||Q^\star-\hat{Q}^{\star}||_{\infty}=||\sup_{\pi}Q^{\pi}-\sup_{\pi}\hat{Q}^{\pi}||_\infty
133 | $$
134 | 
135 | $$
136 | \le \sup_{\pi}||Q^{\pi}-\hat{Q}^{\pi}||_{\infty} \le \gamma \cdot \frac{1}{1-\gamma}  \frac{R_m}{1-\gamma}\cdot \sqrt{\frac{|S|\cdot\log \frac{1}{\delta}}{N}} \quad \text{w.p. } 1-\delta
137 | $$
138 | 
139 | 也就是说，第二个问题对应的距离和第一个是差不多大的（或者不超过第一个的大小）。这样，我们就解决了第二个问题。最后，第三个问题就是前两个的简单组合：
140 | 
141 | $$
142 | ||Q^{\star}-Q^{\hat{\pi}^\star}||_{\infty}\le ||Q^{\star}-\hat{Q}^{\star}||_{\infty}+||\hat{Q}^{\star}-Q^{\hat{\pi}^\star}||_{\infty}
143 | $$
144 | 
145 | 这里第一项是第二个问题给出的距离，第二项则是第一个问题给出的距离，其中取 $\pi=\hat{\pi}^{\star}$ 。这样，我们就最终得出：
146 | 
147 | **误差**
148 | 
149 | $$
150 | ||Q^{\star}-Q^{\hat{\pi}^\star}||_{\infty}=\mathcal{O}\left(\frac{R_m}{(1-\gamma)^2}  \sqrt{\frac{|S|\cdot\log \frac{1}{\delta}}{N}} \right) \quad \text{w.p. } 1-\delta
151 | $$
152 | 
153 | # Problem 2: Policy Learning
154 | 
155 | 前面，我们已经分析了exploration，也就是建立model的准确性（在前面的分析中，我们考虑在model下面的optimal policy，也就是相当于假设有一个很好的算法可以在任意的环境下面学会optimal policy）。而现在，在这一问题中，两个地位调转了——假设我们已经完全了解了环境，也就是可以从transition中无限自由地采样，那么我们究竟能把policy学得多好？
156 | 
157 | 我们来分析fitted Q-iteration 算法。其实，早在第七讲最开始介绍Q iteration方法的时候，我们就已经进行了一定的分析。这里的其实也大同小异。
158 | 
159 | 对于理想中完全准确的 $Q$ iteration，我们有
160 | 
161 | $$
162 | Q_{k+1}\leftarrow TQ_{k}:= r+\gamma P \max_{a} Q_k
163 | $$
164 | 
165 | 这里 $T$ 就是Bellman operator。但是，现在有两个可能的误差来源：
166 | 
167 | - 采样误差：我们总归不能从环境中无限地采样，因此 $P,r$ 只能通过有限的采样来估计；
168 | - 近似误差：我们的 $Q$ 网络表现能力有限，因此拟合的时候可能存在误差。这里为了简单，我们假设Q 网络的学习目标是 **$\infty$ -范数**，而非2-范数。（对应地，我们假设误差很小是假设 $\infty$ -范数很小，这实际上是比较强的——如果假设2-范数很小，事实上并不能顺利地完成论证。）
169 | 
170 | 据此，我们记 $\hat{T}$ 为approximate Bellman Operator：
171 | 
172 | $$
173 | \hat{T}Q=\hat{r}+\gamma \hat{P} \max_{a} Q,\quad \hat{P}(s'|s,a)=\frac{N(s',s,a)}{N(s,a)},\quad \hat{r}(s,a)=\begin{cases}0&,N(s,a)=0\\r(s,a)&,N(s,a)>0\end{cases}
174 | $$
175 | 
176 | （注意对于没有经历过的 $(s,a)$ 对，我们只能把reward估计为0）而对于 $Q$ 网络的误差，我们假设
177 | 
178 | $$
179 | \hat{Q}_{k+1}=\arg\min_{\hat{Q}} ||\hat{Q}-\hat{T}\hat{Q}_{k}||_{\infty}
180 | $$
181 | 
182 | 并且
183 | 
184 | $$
185 | ||\hat{Q}_{k+1}-\hat{T}\hat{Q}_{k}||_{\infty}\le \epsilon
186 | $$
187 | 
188 | 我们的目标就是分析 $\hat{Q}_{k}$ 和 $Q^{\star}$ 的差距在 $k\to \infty$ 的行为。为此，我们分别处理两个误差来源，再设法把他们合并起来。
189 | 
190 | ## Analysis of Sampling Error
191 | 
192 | 首先，我们考虑采样误差。这一误差对应着 $||\hat{T}Q-TQ||_\infty$ 的估计，其中 $Q$ 可以是任何的 $Q$ function。通过直接的代入：
193 | 
194 | $$
195 | ||\hat{T}Q-TQ||_\infty\le ||\hat{r}-r||_{\infty}+\gamma ||\hat{P}-P||_{\infty}||\max_{a}Q||_{\infty}
196 | $$
197 | 
198 | 然后，我们观察到 $||\hat{r}-r||_{\infty}$ 至多是 $\mathcal{O}(R_m)$ 的，并且只要每一个 $(s,a)$ 都至少被采样过一次（即 $N\ge 1$ ），根据定义就有 $\hat{r}=r$ ，这一项为0；而 $||\max_{a}Q||_{\infty}$ 是 $\mathcal{O}\left(\frac{R_m}{1-\gamma}\right)$ 的。再结合Problem 1中我们分析的 $\hat{P}$ 的误差，我们可以得到：
199 | 
200 | $$
201 | ||\hat{T}Q-TQ||_\infty= \mathcal{O}\left(\frac{R_m}{1-\gamma}\sqrt{\frac{|S|\log \frac{1}{\delta}}{N}}\right)\quad \text{w.p. } 1-\delta
202 | $$
203 | 
204 | （这里我们和PPT里面的略有不同，因为PPT里面实际上认为reward $r(s,a)$ 也是随机的。这虽然更普适，但鉴于我们之前从没有考虑过这样的情况，引入比较突兀。）
205 | 
206 | 这样，我们就完成了对采样误差的分析。
207 | 
208 | ## Analysis of Approximation Error
209 | 
210 | 对于Approximation Error，我们类似于ML里分析梯度下降那样，来考察 $\hat{Q}_k$ 和目标 $Q^{\star}$ 的差距，并试图建立一个递推关系。有：
211 | 
212 | $$
213 | ||\hat{Q}_{k+1}-Q^{\star}||_{\infty}\le ||\hat{Q}_{k+1}-T\hat{Q}_{k}||_{\infty}+||T\hat{Q}_k-Q^{\star}||_{\infty}
214 | $$
215 | 
216 | （注意为了把两个影响分开，我们这里先考虑 $T$ ，而不是 $\hat{T}$ 。）注意到， $T{Q}^{\star}=Q^{\star}$ ，并且 $T$ 是一个contractive operator，因此有：
217 | 
218 | $$
219 | ||\hat{Q}_{k+1}-Q^{\star}||_{\infty}\le ||\hat{Q}_{k+1}-T\hat{Q}_{k}||_{\infty}+\gamma ||\hat{Q}_k-Q^{\star}||_{\infty}
220 | $$
221 | 
222 | 这样，我们就可以递推给出：
223 | 
224 | $$
225 | \lim_{k\to \infty}||\hat{Q}_k-Q^\star||_{\infty} \le \frac{1}{1-\gamma}\max_{k}||\hat{Q}_{k+1}-T\hat{Q}_{k}||_{\infty}
226 | $$
227 | 
228 | 这就是分析Approximation Error得到的结果。
229 | 
230 | ## Combining the Two Errors
231 | 
232 | 最后，我们把他们结合起来。这一步已经很明确了：
233 | 
234 | $$
235 | \lim_{k\to \infty}||\hat{Q}_k-Q^\star||_{\infty} \le \frac{1}{1-\gamma}\max_{k}||\hat{Q}_{k+1}-T\hat{Q}_{k}||_{\infty}
236 | $$
237 | 
238 | $$
239 |  \le \frac{1}{1-\gamma}\max_{k}\left[||\hat{Q}_{k+1}-\hat{T}\hat{Q}_{k}||_{\infty}+||\hat{T}\hat{Q}_{k}-T\hat{Q}_{k}||_{\infty}\right]
240 | $$
241 | 
242 | $$
243 | =\mathcal{O}\left(\frac{R_m}{(1-\gamma)^2}\sqrt{\frac{|S|\log \frac{1}{\delta}}{N}}+\frac{\epsilon}{1-\gamma}\right)\quad \text{w.p. } 1-\delta
244 | $$
245 | 
246 | 其中最后一步，前一项使用我们对approximation error的假设，而后面一项是sampling error分析得到的结论。
247 | 
248 | 再来观看这一结果，可以发现，我们还是获得了一个对horizon $\frac{1}{1-\gamma}$ 成平方关系的误差。这有些令人惊讶，说明即使是自由采样的情况下，由于Fitted Q iteration算法本身的原因和采样的有限，我们仍然会面临这个问题。当然，给定更强的假设，可以获得更强的结论，但这里就不介绍。
249 | 
250 | # Reference Papers
251 | 
252 | 1. [RL Theory Textbook](https://rltheorybook.github.io)


--------------------------------------------------------------------------------
/lecture/notes-zh/18-vae.md:
--------------------------------------------------------------------------------
  1 | # Variational Autoencoder
  2 | 
  3 | VAE (Variational Autoencoder)是一种很吊打的生成式模型。这本应属于DL的内容，也可以在[我一开始推荐的笔记](https://github.com/szjzc2018/dl/blob/master/note2.md#213-variational-autoencoder)里找到一个非常优秀、系统的讲解。因此，这里我们只是给这一模型做一简介。
  4 | 
  5 | ## Latent Variable Model
  6 | 
  7 | 我们先从latent variable model的概念开始：latent variable model就是指，为了输出 $p(x)$ ，我们给出
  8 | 
  9 | $$
 10 | p(x)=\int p_{}(x|z)p(z)dz=\mathbb{E}_{z\sim p(z)}[p_{}(x|z)]
 11 | $$
 12 | 
 13 | 这里 $z$ 的维度通常比较小，称为latent variable。 $p_{}(x|z)$ 是模型主要学习的东西，即通过latent variable $z$ 生成 $x$ ；而 $p(z)$ 是某种prior，有两种选择。
 14 | 
 15 | - 一种是直接给定一个先验分布，比如高斯分布；
 16 | - 另外一种则是让模型也可以改变 $p(z)$ ，但 $z$ 的维度比较小，因此可以再用另外一个生成模型来单独学习 $p(z)$ 。
 17 | 
 18 | 两种选择都有实际中的应用。在这里，VAE选择了前者。
 19 | 
 20 | ## MLE training and ELBO
 21 | 
 22 | 我们下一步需要考虑的就是，如何进行MLE training，也就是计算
 23 | 
 24 | $$
 25 | \log p(x)=\log\mathbb{E}_{z\sim p(z)}[p_{}(x|z)]
 26 | $$
 27 | 
 28 | 如果你已经~~被RL洗脑~~熟悉了RL的思想的话，你可能会想到直接从高斯分布采样出~~一个~~若干个 $z$ ，然后估计 $\log p(x)$ 。这样理论上并不是不行，但是可能需要很多很多的样本才能使得训练有效（后面我们会进一步阐述这一点），因此在实际上很少用这样的方法。
 29 | 
 30 | 这里，VAE的思想就十分巧妙：我们**采用一种类似importance sampling的方法**。对于每一个 $x$ ，我们设计一个分布 $q_{x}(z)$ （比如，就是高斯分布）。这样，就有了
 31 | 
 32 | $$
 33 | \log p(x)=\log\mathbb{E}_{z\sim q_{x}(z)}\left[\frac{p_{}(x|z)p(z)}{q_{x}(z)}\right]\ge \mathbb{E}_{z\sim q_{x}(z)}\left[\log\frac{p_{}(x|z)p(z)}{q_{x}(z)}\right]
 34 | $$
 35 | 
 36 | 这种通过放缩来估计 $\log p(x)$ 的方法，被称为**Variational Inference**。注意到，右边是 $\log p(x)$ 的一个lower bound，并且 $q_x(z)$ 和后验分布 $p(z|x)$ 越接近，这一lower bound越接近我们原来的目标（也就是 $\log p(x)$ ）。因此，我们也把它叫做**ELBO**（Evidence Lower Bound Objective）。
 37 | 
 38 | 我们转而来优化ELBO！首先注意到它和 $\log p(x)$ 原来的表达式不同，可以直接计算。这是因为
 39 | 
 40 | $$
 41 | \text{ELBO}= \mathbb{E}_{z\sim q_{x}(z)}\left[\log p_{}(x|z)\right]-\text{KL}(q_{x}(z)||p(z))
 42 | $$
 43 | 
 44 | 而右边的第一项可以通过采样估计，第二项则是高斯分布的KL divergence，具有闭形式。因此，我们可以通过梯度下降来优化ELBO。
 45 | 
 46 | 其次，ELBO还有**另外一种**变形方式，这一种方式告诉我们ELBO和真正的log prob的距离：
 47 | 
 48 | $$
 49 | \log p(x)-\text{ELBO}= \mathbb{E}_{z\sim q_x(z)}\left[\log \frac{p(x)q_x(z)}{p_{}(x|z)p(z)}\right]=\text{KL}(q_x(z)||p(z|x))
 50 | $$
 51 | 
 52 | 因此，我们知道了：假设我们能够让 $p(z|x)$ 和 $q_x(z)$ 完全接近，那么 $\log p(x)$ 和ELBO的差距就是0，也就是说，我们的ELBO就是真正的log prob。
 53 | 
 54 | ## Reframing VAE as a Stochastic AutoEncoder
 55 | 
 56 | 上面的一系列推导过于突然，可能会产生许多问题。因此，我们需要先给上面的数学公式一些解释。
 57 | 
 58 | > Q: 等等，我觉得你在骗人啊！你刚才说，ELBO的第一种表达式里面的第一项，也就是 $\mathbb{E}_{z\sim q_{x}(z)}\left[\log p_{}(x|z)\right]$ ，可以通过采样计算。但如果这样的话，为什么我们不直接采样计算 $\mathbb{E}_{z\sim p(z)}\left[\log p_{}(x|z)\right]$ 呢？这两者有任何差别吗？？
 59 | >
 60 | > A: 你先别急。还记得我们一开始说的吗，理论上采样计算 $\mathbb{E}_{z\sim p(z)}\left[\log p_{}(x|z)\right]$ 也是可以的，只不过可能需要很多很多的样本。至于为什么采样计算 $\mathbb{E}_{z\sim q_{x}(z)}\left[\log p_{}(x|z)\right]$ 需要的样本数远少于前者的样本数，就需要我们来从这些公式的意义上阐述了。所以，先听我说——
 61 | 
 62 | ### What is a VAE, Anyway?
 63 | 
 64 | 我们先看看，什么是 $x$ 和 $z$ ？ $x$ 很简单，就是输入的图片，其分布是我们的模型想要学习的；而 $z$ 则略抽象，我们说它是latent variable；一般来说，实践上， $z$ 的维度会比 $x$ 小很多。因此，**可以把 $z$ 看作 $x$ 的一种"压缩"**。
 65 | 
 66 | 那 $p(x|z)$ 和 $q_x(z)$ 这两个主要的分布又是干什么的呢？我们好像突然发现一些玄机：如果把 $x$ 和 $z$ 的对应关系比作文件和压缩文件的关系，那么 $p(x|z)$ 完成的工作就是**解压缩**，只不过因为 $z$ 毕竟损失了信息，所以解压缩出来的是一个**分布**。我们再来看 $q_x(z)$ ，它本来没有任何意义，只不过是我们选来做importance sampling的一个分布；但我们前面看到，作为最后的目标，我们会希望 $p(z|x)$ 和 $q_x(z)$ 完全接近。因此，理论上，**我们会希望 $q_x(z)$ 类似一个从 $x$ 到 $z$ 的“压缩”过程**。同样，这一压缩也给出一个**分布**（[之后](#elbo-revisited)我们会讨论这一点）。
 67 | 
 68 | 其实，这种思想在ML中很早就有，被称作**AutoEncoder**（AE）。AutoEncoder这一名字虽然听起来很高级（“自编码器”），但实际上就是我们前面所说的压缩-解压缩系统。完成压缩过程的网络叫做**Encoder**，而完成解压过程的网络叫做**Decoder**。这样，对号入座，我们就可以引入两个网络：**Encoder** $q_{\phi}(z|x)$ ，和**Decoder** $p_{\theta}(x|z)$ 。
 69 | 
 70 | > Q: 等等，为什么 $q_{x}(z)$ 变成了 $q_{\phi}(z|x)$ 这个条件分布？
 71 | >
 72 | > A: 对的——你是不是觉得，应该对每一个 $x$ 训练一个分布 $q_{x}(z)$ ？理论上也不是不可以，但这样的开销太大了，因此不如把 $x$ 直接当作神经网络的输入。这种方法很常见，叫做Amortized Variational Inference。
 73 | 
 74 | 此时，我们就可以大致地想像VAE的全局：**对于每一个图片 $x$ ， $q_{\phi}(z|x)$ 把它压缩到 $z$ 所处的空间（叫做latent space）里的一片区域。虽然这片区域对于人类来看是一堆乱码，但对于decoder $p_{\theta}(x|z)$ 来说，它十分有意义，因此decoder可以完成解压，把 $z$ 再还原回 $x$ 。**
 75 | 
 76 | ![](./assets/18-1.png)
 77 | 
 78 | 至此，我们也终于可以解释一开始挖的坑了：为什么采样计算 $\mathbb{E}_{z\sim q_{x}(z)}\left[\log p_{}(x|z)\right]$ 来训练，这样需要的样本数远少于用 $\mathbb{E}_{z\sim p(z)}\left[\log p_{}(x|z)\right]$ 训练需要的样本数？道理实际上很简单——如果 $z$ 从 $p(z)$ 取样，那就是东一榔头西一棒子。而我们的目标是希望**对于所有这样的高度随机的 $z$ 给出 $p(x|z)$ 都要比较大**，虽然不是不行，但对模型的要求就很高；不仅如此，训练的时候梯度也是东踹一脚西踹一脚，很难稳定。
 79 | 
 80 | 但用 $q_{x}(z)$ 的话，就完全不同了。因为我们的模型（参数 $\phi$ ）也在学习 $q_x(z)$ ，因此它完全可以给出一种structure：比如，对于MNIST的数字生成，模型可能学会把latent space劈10份。如果我看出来你这个数字是8，那就把你扔到第8个区域里。它还可以学会再细分，比如我看出来你这个数字风格潇洒飘逸，就分到这个区域上面；如果稳重隽秀，那就分到下面。而decoder经过训练，也就知道了这些“暗号”，比如它拿到latent space里面第8个区域中上面部分的 $z$ ，就知道这是一个潇洒飘逸的8，这样就有了信息，生成起来也就更加容易了。
 81 | 
 82 | （这段说的实际上不够准确，实际上模型做的肯定会比这个更复杂，因为 $q_{x}(z)$ 在训练过程中被限制的必须和高斯分布接近。但模型还是能建立起来一些结构的，只不过肯定不是这样human-readable的了。）
 83 | 
 84 | ### ELBO Revisited
 85 | 
 86 | 我们最后再来看我们前面数学上得到的ELBO。它现在已经变得十分有意义了。
 87 | 
 88 | 前面我们给出了ELBO的两种表达式。它们分别是：
 89 | 
 90 | $$
 91 | \text{ELBO}= \mathbb{E}_{z\sim q_{\phi}(z|x)}\left[\log p_{\theta}(x|z)\right]-\text{KL}(q_{\phi}(z|x)||p(z))
 92 | $$
 93 | 
 94 | $$
 95 | \text{ELBO}=\log p(x)-\text{KL}(q_\phi(z|x)||p(z|x))
 96 | $$
 97 | 
 98 | 第二个表达式还是之前那个意思：为了优化算不出来的 $\log p(x)$ ，我们先优化（最大化）ELBO。如果最后 $q_{\phi}(z|x)$ 所幸能和 $p(z|x)$ 很接近，那么我们就完成了目标。
 99 | 
100 | 而第一个表达式更加有趣了（注意训练的时候我们使用的是这个第一个表达式）。对于第一项，我们发现它的意思是：对于一个 $x$ ，我们压缩到latent space成为一个分布，再解压缩回去得到 $x$ 的分布，此时原始图片的概率比较大。也就是说，**经过压缩-解压缩过程，很大概率我们保证图片不变**。因此，这也叫做**Reconstruction Loss**。
101 | 
102 | 第二项是什么意思呢？这就是VAE和AE的不同之处了。如果我们的目的只是做一个AE，那么有第一项就足够了。这是因为，AE的目标只是learn a representation，因此数据集里面对应的 $z$ 可以任意复杂。但是，VAE需要我们生成图片，而生成的时候我们没有 $x$ ！（我们不能再从 $q_{\phi}(z|x)$ 中采样了！）
103 | 
104 | 因此，对于VAE，生成模型的本性使得它必须控制 $z$ 的分布，使得我们最后可以从这一分布中采样。因此，第二项的意思实际上是，**我们控制 $q_{\phi}(z|x)$ 必须是标准高斯分布，这样我们训练好的模型才能真正“投入生产”，生成图片**。（设想我们不把第二项加入训练目标，那么loss依然可以降的很小，但是生成出来的图片就会是一坨答辩。）这一项也因此称为**KL Penalty**。
105 | 
106 | 同时，这也解释了为什么我们之前说 $q_{\phi}(z|x)$ 和 $p_{\theta}(x|z)$ 必须是分布，而不能是一个确定的值。后者当然是因为我们要做MLE training，不得不给一个概率分布；而前者恰恰就是因为我们希望 $z$ 的分布是一个标准高斯分布，而把 $q(z|x)$ 设置成分布有利于对每一个 $x$ 分别进行训练。
107 | 
108 | 因此，ELBO这一数学上的整体实际上是一个优秀的设计：它把AE的基本思想和生成模型的必然需要结合起来，最终创造出VAE这一强大的模型。
109 | 
110 | > Q: 等等，你刚才说，“如果最后 $q_{\phi}(z|x)$ 所幸能和 $p(z|x)$ 很接近，那么我们就完成了目标”，这真的能实现吗？
111 | >
112 | > A: 当然。如果理想情况下 $q_{\phi}(z|x)=p(z)$ ，和 $x$ 都没关系了，那肯定满足 $q_{\phi}(z|x)=p(z|x)$ 。虽然实际上这样的境界不能被达到，但是至少你可以想像，ELBO的两个表达式中的两个KL divergence的训练方向是一致的。
113 | 
114 | ## Conclusion
115 | 
116 | VAE的基本思想就介绍完了。如果这是你第一次接触，那么确实可能还有很多地方没有想清楚。但当你逐渐习惯了它的思想之后，你会发现这一切都十分自然。愿你能充分体会到它的美。
117 | 
118 | 接下来，我们略提一些实现VAE的技巧，因为它们的思想往往能应用到很多其他地方。
119 | 
120 | ## Tricks
121 | 
122 | ### Design models for VAE
123 | 
124 | 一个conventional的设计是
125 | 
126 | $$
127 | q_{\phi}(z|x)=\mathcal{N}\left(\mu_\phi(x),\exp(\text{diag} (\sigma_\phi(x)))\right)
128 | $$
129 | 
130 | $$
131 | p_{\theta}(x|z)=\mathcal{N}\left(\mu_\theta(z),1\right)
132 | $$
133 | 
134 | 这里为什么第二个分布的方差选为1,还是有点说法：就如我们之前所说， $p_{\theta}(x|z)$ 本来没有必要是随机的，只是因为我们优化的目标是 $\log p(x)$ ，因此确定的分布会使得loss成为inf。因此，这里的方差没有实际的意义。
135 | 
136 | 也正是因为如此，写VAE一个很容易犯的**错误**是， **在生成的时候仍然从 $p_{\theta}(x|z)$ 采样** 。如果你使用的是MNIST的数据，那么你相当于为每个分量不超过1的图片加了一个方差为1的噪声！正确的方法是，虽然我们训练的时候搞这一个“概率分布”，实际上我们清楚这个概率分布只是一个工具，因此我们**应该直接用 $\mu_\theta(z)$ 作为生成的图片**。
137 | 
138 | 也有人因此指出，这个1没什么特殊之处，可以换成其他的值。这就是著名的 **$\beta$ -VAE**。当然，第一次尝试的时候，还是建议使用1.
139 | 
140 | ### Reparameterization Trick
141 | 
142 | 有一个不易注意到的细节，就是我们ELBO中的第一项
143 | 
144 | $$
145 | \mathbb{E}_{z\sim q_{\phi}(z|x)}\left[\log p_{\theta}(x|z)\right]
146 | $$
147 | 
148 | 同时包含了两个模型 $\phi$ 和 $\theta$ ！虽然计算可以直接通过采样，但这样就丢失了对 $\phi$ 的梯度。（这好比训练autoencoder的时候只训练decoder！）因此，我们希望想办法计算这个期望的梯度。
149 | 
150 | 当然，你也许早就知道，`torch.distributions` 的成员都可以`rsample`，并且得知这样得到的sample是可以反向传播的。而它的原理就是我们所说的reparameterization trick。对于高斯分布，这一方法实际上就是：
151 | 
152 | $$
153 | \mathbb{E}_{z\sim q_{\phi}(z|x)}\left[\log p_{\theta}(x|z)\right]=\mathbb{E}_{\epsilon\sim\mathcal{N}(0,1)}\left[\log p_{\theta}(x|z=\mu_\phi(x)+\Sigma_\phi(x)*\epsilon)\right]
154 | $$
155 | 
156 | 其中 $\Sigma_\phi(x)=\exp(\text{diag}(\sigma_\phi(x)))$ ，而`*`代表分量积。这样，我们发现 $\phi$ 就从下面转移到了上面，因此我们可以直接计算这个期望的梯度。
157 | 
158 | 值得一提的是，我们在第五讲一开始介绍policy gradient的时候，也遇到了完全一样的问题：需要优化的东西在期望符号底下。但当时我们通过数学上的一通变换解决了这一问题，当时的那一方法被称为**REINFORCE**。实际上，面对这种问题，REINFORCE和REPARAMETERIZE实际上是最常见的两类方法。如果非要分出高低，那么REPARAMETERIZE还是更加常用，因为经验上它具有更小的variance。
159 | 
160 | # VAE in RL
161 | 
162 | 最后，简单提一些VAE在RL中的应用。
163 | 
164 | - representation learning：我们在训练模型打游戏的时候，如果直接把pixel作为state，就会有很多问题（维度大，很多不同的图片实际上对应着类似的信息，等等）。因此，我们可以**用VAE先把图片压缩到 $z$ ，再在 $z$ 上训练RL agent**。
165 | - imitation learning：我们之前在imitation learning的过程中，提到不同的expert在面对state的时候可能作出不同的action（比如，前面是一棵树，滑雪运动员可能向左或者向右绕过）。这可能会给普通的概率模型 $\pi_{\theta}(a|s)$ 一些挑战，但VAE不会被多峰的分布难倒。因此，使用VAE做imitation learning是一个不错的选择。
166 | 
167 | # Reference Papers
168 | 
169 | 1. [Embed to Control: A Locally Linear Latent Dynamics Model for Control from Raw Images](https://arxiv.org/abs/1506.07365)（state-space model）
170 | 2. [SOLAR: Deep Structured Representations for Model-Based Reinforcement Learning](https://arxiv.org/abs/1808.09105)（state-space model）
171 | 3. [Learning Latent Dynamics for Planning from Pixels](https://arxiv.org/abs/1811.04551)（state-space model）
172 | 4. [Stochastic Latent Actor-Critic: Deep Reinforcement Learning with a Latent Variable Model](https://arxiv.org/abs/1907.00953)（state-space model）
173 | 5. [Dream to Control: Learning Behaviors by Latent Imagination](https://arxiv.org/abs/1912.01603)（state-space model）


--------------------------------------------------------------------------------
/lecture/notes-zh/2-imitation_learning.md:
--------------------------------------------------------------------------------
  1 | # Terminology
  2 | 
  3 | 先来回顾一下[Preliminaries](./0-preliminaries.md#preliminaries)中的内容，介绍一些符号——
  4 | 
  5 | - $o_t$ : observation at time $t$
  6 | - $a_t$ : action at time $t$
  7 | - $s_t$ : state at time $t$
  8 | - $\pi_\theta(a_t|o_t)$ ：policy
  9 | 
 10 | 什么是observation呢？这就引入了一个话题——我们有时候并不能观察到全部的state。
 11 | 
 12 | 比如说，在开车的时候，state应该包含其他所有车辆的位置、速度等等。但从车窗上的摄像头拍一张照片，我们显然不能获得全部的信息；相反，给定state，我们可以完全给出这张照片。这样的照片就是**observation**。
 13 | 
 14 | 在本笔记的绝大部分时候，我们总是可以忽略state和observation之间的差别；但有少数的情况，我们必须要区分它们，到时候会明确地说明。
 15 | 
 16 | 我们认为state是**完备的**。什么叫做完备？其实就是说，在已知 $s_t$ 的时候， $s_{t-1}$ 不能提供关于 $s_{t+1}$ 的更多信息。也就是
 17 | 
 18 | $$
 19 | s_{t-1}\perp s_{t+1}|s_t
 20 | $$
 21 | 
 22 | 满足这样的条件后，我们就可以发现， $s_{t+1}$ 只与 $s_t,a_t$ 有关，而这又只和环境有关（注意和policy无关，因为 $a_t$ 已经给出了）。这可以叫做dynamics，也可以叫做**transition probability**：
 23 | 
 24 | $$
 25 | s_{t+1}\sim p(s_{t+1}|s_t,a_t) \qquad (\text{determined by env})
 26 | $$
 27 | 
 28 | 我们可以发现，在这样的一系列假设后，我们可以构造一个Markov Chain，满足Markov Property。如图所示。
 29 | 
 30 | ![](./assets/2-1.png)
 31 | 
 32 | # Imitation Learning
 33 | 
 34 | 我们来介绍我们的第一个RL“算法”——Imitation Learning。
 35 | 
 36 | Imitation Learning 的思路很简单：我们找一个专家来label data，构建一个数据集
 37 | $$
 38 | \mathcal{D}=\{s,a\}
 39 | $$
 40 | 然后，我们训练一个模型，使得
 41 | $$
 42 | \theta^\star = \arg\max_{\theta}\log \pi_\theta(a=\pi^\star(s)|s)
 43 | $$
 44 | 其中， $\pi^\star$ 是专家的策略。需要注意，这里完全没有RL的知识，只是普通的DL问题。这也有时候被叫做behavior cloning。
 45 | 
 46 | 模型的实现可以是任何网络, 比如 CNN, ViT。更进一步，有些研究考虑专家的Non-Markov性质，因此使用 RNN 来建模。
 47 | 
 48 | 
 49 | # Behavior Cloning Analysis
 50 | 
 51 | 直观上，behavior cloning应该**失败**。
 52 | 
 53 | 比如说，一个模型在“走钢丝”，它每一步有98%的概率走在正确的道路上。这样的模型如果从我们训练数据的角度来看，已经是一个很不错的DL模型了。但是假设模型决策100步，那么只有13%左右的概率它依然保持在专家的道路上！
 54 | 
 55 | 不同于传统的 supervised learning, 因为每一步的状态是根据之前的 action 决定的, 所以并不满足数据的 i.i.d. assumption。所以, 在模型偏离训练数据的时候，我们并不能保证模型能做出正确的选择。
 56 | 
 57 | 但在某些场景下，Behavior Cloning 确实能 work：假如能通过训练数据学习到真正的 pattern。
 58 | 
 59 | 下面给出了一些数学上的分析。它们的主要目的都是为了bound住behavior cloning的模型和专家之间的差距。
 60 | 
 61 | ## Notations
 62 | - $a=\pi^{\star}(s)$ : the expert policy gives $a$ when the state is $s$
 63 | - $\pi_\theta$ : the policy we are trying to learn
 64 | - $p_{\pi_\theta}(s_t)$ : the probability of being at state $s_t$ at time $t$ if we follow $\pi_\theta$ . 
 65 |     - **重要提示**: $p_{\pi_\theta}(s_t)$ 的这个 $p_{\pi_\theta}$ 分布和 $p_{\pi_\theta}(s_{t+1})$ 的这个 $p_{\pi_\theta}$ 分布可不是一个分布！一个是在 $t$ 时的分布，一个是在 $t+1$ 时的分布。
 66 | - Use $|p_1-p_2|$ to denote the total variation distance between $p_1$ and $p_2$ : $|p_1-p_2|=\sum_{x}|p_1(x)-p_2(x)|$
 67 | 
 68 | ## Distribution Distance
 69 | **Assumptions.**
 70 | 
 71 | - $\forall (a,s), \pi_\theta(a\ne \pi^{\star}(s)|s)\le \epsilon$
 72 | 
 73 | **Conclusion**: 对任意的 $t$ ,
 74 | 
 75 | $$
 76 | \sum_{s_t}|p_{\pi_\theta}(s_t)-p_{\pi^{\star}}(s_t)|\le 2\epsilon t.
 77 | $$
 78 | 
 79 | 
 80 | **Proof**. 对 $t$ 归纳。
 81 | 
 82 | $$
 83 | \left|p_{\pi_\theta}(s_{t+1})-p_{\pi^\star}(s_{t+1})\right|=\left|\sum_{s_t,a_t}p(s_{t+1}|s_t,a_t)\pi_\theta(a_t|s_t)p_{\pi_\theta}(s_t)-\sum_{s_t}p(s_{t+1}|s_t,\pi^\star(s_t))p_{\pi^\star}(s_t)\right|.
 84 | $$
 85 | 
 86 | $$
 87 | =\left|\sum_{s_t}\left(\sum_{a_t\ne \pi^\star(s_t)}p(s_{t+1}|s_t,a_t)\pi_\theta(a_t|s_t)+p(s_{t+1}|s_t,\pi^\star(s_t))\pi_\theta(\pi^\star(s_t)|s_t)\right)p_{\pi_\theta}(s_t)-\sum_{s_t}p(s_{t+1}|s_t,\pi^\star(s_t))p_{\pi^\star}(s_t)\right|.
 88 | $$
 89 | 
 90 | $$
 91 | \le \epsilon \sum_{s_t}p_{\pi_\theta}(s_t)+\sum_{s_t}p(s_{t+1}|s_t,\pi^\star(s_t))p_{\pi_\theta}(s_t)\cdot \epsilon+\sum_{s_t}p(s_{t+1}|s_t,\pi^\star(s_t))\left|p_{\pi_\theta}(s_t)-p_{\pi^\star}(s_t)\right|
 92 | $$
 93 | 
 94 | $$
 95 | =\epsilon+\epsilon \sum_{s_t}p_{\pi_\theta}(s_t)p(s_{t+1}|s_t,\pi^\star(s_t))+\sum_{s_t}p(s_{t+1}|s_t,\pi^\star(s_t))\left|p_{\pi_\theta}(s_t)-p_{\pi^\star}(s_t)\right|.
 96 | $$
 97 | 
 98 | 对 $s_{t+1}$ 求和即证。
 99 | 
100 | *Side Note.* Homework 1 的 Problem 1 实际上给出了一个弱化的条件，依然可以给出同样的结论：
101 | 
102 | $$
103 | \mathbb{E}_{s_t\sim p_{\pi^\star}}[\pi_\theta(a_t\ne \pi^{\star}(s_t)|s_t)]\le \epsilon.
104 | $$
105 | 
106 | ## The total cost
107 | 
108 | **Assumptions.**
109 | - $\mathbb{E}_{s_t\sim p_{\pi^\star}}[\pi_\theta(a_t\ne \pi^{\star}(s_t)|s_t)]\le \epsilon.$
110 | 
111 | 如果我们定义 $c_t$ 是 **cost function**：
112 | $$
113 | c_t(s_t,a_t)=\begin{cases}0&,a_t=\pi^{\star}(s_t)\\1&,\text{otherwise}\end{cases}.
114 | $$
115 | 
116 | **Conclusion**: 
117 | 
118 | $$
119 | S=\sum_{t\le T} E_{s_t\sim p_{\pi_\theta}}[c_t(s_t,a_t)]=\mathcal{O}(\epsilon T^2).
120 | $$
121 | 
122 | 直观理解证明：如果模型错了一次，那么若要估计“错误步数”的上界，我们就必须假设从此往后模型的所有预测都是错误的。而通过上面的不等式，在第 $t$ 步的概率分布偏差不超过 $2\epsilon t$, 所以这一步对 $S$ 的贡献也不超过 $2\epsilon t$.
123 | 
124 | **Proof**. 
125 | 
126 | $$
127 | S=\sum_{t\le T} E_{s_t\sim p_{\pi_\theta}}[c_t(s_t,a_t)]=\sum_{t\le T} \sum_{s_t}p_{\pi_\theta}(s_t)c_t(s_t,a_t)
128 | $$
129 | 
130 | $$
131 | \le \sum_{t\le T} \sum_{s_t}p_{\pi^\star}(s_t)\pi_\theta(a_t\ne \pi^\star(s_t)|s_t)+\sum_{t\le T} \sum_{s_t}|p_{\pi_\theta}(s_t)-p_{\pi^{\star}}(s_t)|c_t(s_t,a_t)
132 | $$
133 | 
134 | 使用上一个结果，我们就可以得到
135 | 
136 | $$
137 | S\le \sum_{t\le T} \epsilon+\sum_{t\le T} 2\epsilon t = \mathcal{O}(\epsilon T^2).
138 | $$
139 | 
140 | # Make Behavior Cloning Work
141 | 
142 | 介绍几个常见的方法, 解决behavior cloning的这个问题。
143 | 
144 | ## Adding Mistakes
145 |  
146 | 假设我们的模型学会改正自己的错误（比如，在走钢丝的时候，身体向左倾倒的时刻，我们的模型能够自动调整身体向右倾倒）。这样的话，成功的概率会大很多。
147 | 
148 | 一个典型的实验是，我们做一个驾驶的模型，然后做三个摄像头：一个正常的摄像头，一个向左偏移的摄像头，一个向右偏移的摄像头。在训练的时候，左边摄像头的图片被标记为“右转”，右边摄像头的图片被标记为“左转”。这样的话，我们的模型就能够学会自动调整。
149 | 
150 | > 所以我们在训练数据中刻意**加入错误并最终改正**，能够训练模型纠正误差的能力。同时可以像上一个驾驶的例子一样，做 **Data Augmentation**。
151 | 
152 | ## Multi-task Learning
153 | 
154 | 还记得我们之前的问题：只要模型一步误入歧途，接下来就再也没有挽回的余地。回顾一下，之前的模型失误的时候会走向一条全新的道路，是完全没有训练过的；但 multi-task learning 可以解决这个问题——它通过巧妙的设计收集大量的 trajectory 信息，使得模型在哪里都不至于完全不知所措。
155 | 
156 | 具体地，我们在训练的时候让专家并非向往一个目标 $s_T$ 前进；相反，让它对很多个 $s_T$ 走多条这样的路径：
157 | 
158 | $$
159 | s_1,a_1,\cdots,s_{T-1},a_{T-1},s_T
160 | $$
161 | 
162 | 然后，我们的数据集收集
163 | 
164 | $$
165 | \{(a_t|s_t,g_t=s_T)\}\in \mathcal{D}
166 | $$
167 | 
168 | 也就是说，我们模型知道了对于每一个 **目标 $s_T$** 应该每一步怎样走。这样的操作也叫做 Goal-conditioned behavior cloning。
169 | 
170 | 论文 [Learning to Reach Goals via Iterated Supervised Learning](https://arxiv.org/abs/1912.06088) 指出，Goal-conditioned data 可以通过如下方式获得：根据当前的策略 $\pi$, 随机指定目标 $g$, 并根据 $\pi(a|s,g)$ 生成一条 trajectory。如果这条路径到达了另一个目标 $g'$, 就将这条路径重新标注为**目标为 $g'$ 的路径**加入数据中。
171 | 
172 | ## DAgger
173 | 
174 | DAgger也试着解决原来的问题。它的思路是，为了防止模型走错之后不知道该怎么走，我们就在每一个训练 iteration 完成之后让模型自己跑一次，并让专家来标记正确答案。具体地，我们从 $\pi_\theta$ 中采样
175 | 
176 | $$
177 | s_1,a_1\sim \pi_\theta(\cdot|s_1),\cdots,s_T,a_T\sim \pi_\theta(\cdot|s_T)
178 | $$
179 | 
180 | 然后把这些新的数据加入数据集中：
181 | 
182 | $$
183 | \mathcal{D}= \mathcal{D}\cup \left\{(s_t,a_t^\star=\pi^{\star}(s_t))|t=1,2,\cdots,T\right\}
184 | $$
185 | 
186 | 当然，实际上可能采用一些其他策略，比如每一次不是在越来越大的数据集上完整地训练一轮，而是把所有训练的数据存到一个 buffer 内部，然后从中随机地采样。
187 | 
188 | 当然， DAgger 也有很显著的问题：需要很多次专家进行数据的标注，因此这部分的代价可能会很昂贵。
189 | 
190 | # Reference Papers
191 | 
192 | 1. [A Reduction of Imitation Learning and Structured Prediction to No-Regret Online Learning](https://arxiv.org/abs/1011.0686)
193 | 
194 | 2. [Learning to Reach Goals via Iterated Supervised Learning](https://arxiv.org/abs/1912.06088)


--------------------------------------------------------------------------------
/lecture/notes-zh/21-RL-LM.md:
--------------------------------------------------------------------------------
  1 | # RL and Language Models
  2 | 
  3 | ## Partially Observed Scenarios
  4 | 
  5 | 之前我们介绍的大部分问题都是fully observed的，因此也不用区分state和observation；但在很多问题中，我们观察到的observation $o_t$ 和state $s_t$ 并非一致。比如游戏图像并不包含全部的状态，或者对话的过程中历史也产生影响，等等。
  6 | 
  7 | 把这类问题formalize一下，就是所谓**POMDP**(Partially Observed MDP)，其结构如图所示：state $s_t$ 是和之前一样的，具有Markov property的；但我们的policy只能condition on $o_t$ ，但 $o_t$ 并不包含 $s_t$ 的全部信息。
  8 | 
  9 | ![](./assets/21-1.png)
 10 | 
 11 | 对于这样的问题，也有两种policy；第一种称为“memoryless”，也就是policy就是 $\pi(a_t|o_t)$ ，不依赖于 $o_{1..t-1}$ ；第二种则可以利用全部的 $o_{1..t}$ ，这更加普遍。
 12 | 
 13 | ### Basics of POMDP
 14 | 
 15 | POMDP可能比较奇怪。首先，可能有一些奇怪的action出现，其“牺牲小我成就大我”，也就是虽然这个action对于当前的reward是负面的，但收集到了信息，所以有助于未来的reward。比如说，在走迷宫的时候，最优的策略很可能是先随便试一试，再有策略地走。
 16 | 
 17 | 此外，对于普通的MDP，一定存在一个决定性的策略，不差于任何其他的策略（即为最优策略）；但对于POMDP并且memoryless的情况下，并非如此。
 18 | 
 19 | > 比如，考虑一个例子：state 有A,B,C三个；到达B获得reward并立刻结束，否则不获得reward。从A出发，一半概率留在A，一半到达B；从B出发到达A,C各一半；从C出发一半到达B，一半留在C。
 20 | >
 21 | > 我们再定义observation，使得它无法提供任何信息： $o_A=o_B=o_C$ 。这样，如果memoryless并且决定性，那只能不断向左或者向右，成功的概率只有一半；但一个memoryless并且非决定性的策略是一半概率向左，一半概率向右，这样一定能成功。
 22 | 
 23 | ### Method 1: Treating POMDP as MDP
 24 | 
 25 | 接下来，我们来研究如何解决POMDP的问题。一个很自然的想法就是，我们在之前的方法里面，直接把所有state替换为observation。这样行不行呢？这就取决于原来的方法是否依赖于 state 的 Markov property ($s_t$ 只和 $s_{t-1}$ 和 $a_{t-1}$ 有关，但在 observation 的情况下并非如此)。
 26 | 
 27 | 需要注意，这样给出的policy只能是 memoryless 的，因为原来的policy只依赖于当前的 $s_t$。
 28 | 
 29 | #### Policy Gradients
 30 | 
 31 | 替换后的第一种形式是
 32 | 
 33 | $$
 34 | \nabla_\theta J(\theta) = \mathbb{E}_{\tau\sim \pi_\theta}\left[\sum_t\nabla_\theta \log \pi_\theta(a_t|o_t)\hat{A}(o_t,a_t)\right],\quad (1)
 35 | $$
 36 | 
 37 | 其中
 38 | 
 39 | $$
 40 | \hat{A}(o_t,a_t)=\sum_{t'=t}^T \gamma^{t'-t}r(o_{t'},a_{t'}) - b(o_t).
 41 | $$
 42 | 
 43 | 是advantage；这里 $b(o_t)$ 是baseline。这一形式的推导是否依赖于Markov property呢？答案是**不依赖**。回顾 Policy Gradient 的推导过程，假设 $a_t$ 只依赖于 $o_t$，但是 $o_t$ 可能依赖 $o_{1:t-1}$，我们仍然可以写
 44 | $$\nabla_\theta J(\theta)=\mathbb{E}_{\tau\sim \pi_{\theta}}\left[\left(\sum_t \nabla_\theta \log \pi_\theta(a_t|o_t)\right)\left(\sum_t \gamma^t r(o_t,a_t)\right)\right].$$
 45 | 仔细观察可以发现：我们仍然可以假设当 $t_1>t_2$ 时， $\nabla_\theta \log \pi_\theta(a_{t_1}|o_{t_1})\cdot r(o_{t_2},a_{t_2})$ 项的贡献是 $0$（对 $a_{t_2+1:T}$ 积分）。所以 $(1)$ 式仍然成立。
 46 | 
 47 | 但是如果再进一步到actor-critic algorithm，就不行了：一个actor-critic的尝试是把advantage改为模型的近似：
 48 | 
 49 | $$
 50 | \hat{A}(o_t,a_t) \approx r(o_t,a_t) + \gamma V^{\pi_\theta}(o_{t+1}) - V^{\pi_\theta}(o_t)
 51 | $$
 52 | 
 53 | 这里其实就已经用到了markov property。原因是， $V^{\pi_\theta}(o_{t+1})$ 大概用来估计 $o_{t+1}$ 之后的所有reward的加权和，但这个和可能依赖于 $o_1,\cdots,o_{t+1}$ ，而不是只依赖 $o_{t+1}$ 。因此，actor-critic based methods并不能简单地做上面的替换。当然，就如上面所说，baseline的引入是可以的，因为它只是起到减小方差的作用。
 54 | 
 55 | #### Q-Learning and Value-based Methods
 56 | 
 57 | 根据前面的讨论，可以料想到，Q function和value function一般地都需要依赖于全部的history，而不仅仅是当前的observation。因此，简单的替换是不行的。
 58 | 
 59 | #### Model-based Methods
 60 | 
 61 | model-based method行不行？直观上，肯定不行，因为model学习的是 $p(o_{t+1}|o_t,a_t)$ ，对于 $o_{t+1}$ 而言只有 $o_t,a_t$ 是不够的。这里，也可以举出一些例子，但为了简单起见，这里不再详细讨论。
 62 | 
 63 | 接下来介绍的两种方法，State space Models & Recurrent Models，都没有忽略 $o_t$ 的历史依赖性，因此实践上更加有效。
 64 | 
 65 | ### Method 2: State space Models
 66 | 
 67 | 虽然 observations 并不满足 Markov property，我们能不能学习一个 hidden state $z$，使其满足 Markov property 呢？假如我们真的学到了这样的 $z$，那么 POMDP 就和普通的 MDP 没有任何区别了！
 68 | 
 69 | 具体来说，我们学习一个 **autoencoder**：latent variable $z$ 是 state；而 input $x$ 是 observation。在 decode 的时候，我们希望 $x_t$ 可以通过 $z_t$ 还原出来，所以我们写
 70 | $$
 71 | p_\theta(x|z)=\prod_t p_\theta(x_t|z_t);
 72 | $$
 73 | 而在 encode 的时候，$z_t$ 依赖于 $x_1,\cdots,x_t$，也即
 74 | $$
 75 | q_\phi(z|x)=\prod_t q_\phi(z_t|x_{1:t}).
 76 | $$
 77 | 我们希望 $z$ 的转移满足 Markov property，也即
 78 | $$
 79 | p(z)=p(z_1)\prod_t p_\psi(z_{t+1}|z_t,a_t).
 80 | $$
 81 | 
 82 | 我们依然可以使用ELBO作为目标来训练。回顾一下，它写为：
 83 | 
 84 | $$
 85 | \text{ELBO}= \mathbb{E}_{z\sim q_{\phi}(z|x)}\left[\log p_{\theta}(x|z)\right]-\text{KL}(q_{\phi}(z|x)||p(z))
 86 | $$
 87 | 
 88 | 注意到给定了 $x$ 之后， $z$ 的分布是独立的高斯乘积，因此很容易`rsample`；唯一的问题是 $p(z)$ 不是高斯分布，所以我们不能像 vanilla VAE 那样写出显式表达。我们有
 89 | 
 90 | $$
 91 | \text{KL}(q_{\phi}(z|x)||p(z))=\mathbb{E}_{z\sim q_\phi(z|x)}\sum_t \text{KL}(q_{\phi}(z_t|x_{1:t})||p(z_t|z_{t-1},a_{t-1})).
 92 | $$
 93 | 
 94 | 假设训练完成之后具有很小的Loss，那么我们就可以用 $z_t$ 当作state做之后的任务。这样，我们就可以用普通的RL方法来解决POMDP问题了。
 95 | 
 96 | 但这一方法有些过火——为了给出最好的model，我们学出一个很复杂的VAE。实际上，有可能不需要state也可以作出接近memory-ful的policy。因此，接下来探讨一些折中的方案。
 97 | 
 98 | ### Method 3: Recurrent Models
 99 | 
100 | 一个方法是，我们注意到，前面的方法里也是假设了通过 $o_{1..t}$ ，我们可以用encoder获得state $z_t$ 的全部信息。那么，我们不如把这一任务交给Q function自己来做。也就是说，我们就选取
101 | 
102 | $$
103 | s_t:= (o_1,\cdots,o_t)
104 | $$
105 | 
106 | 这一方法也因此称为**history states**。
107 | 
108 | > Random Thought: 如果假设使用 history states 的话，一切都是 markov model——因为这是我们能获得的最多的信息！
109 | 
110 | 这样，自然我们的Q网络就需要处理sequence的输入，因此我们需要采用sequence modeling。这就是 seq2seq model 在 RL 中的一处重要用途。
111 | 
112 | 当然，有一个比较好的计算技巧：在sample trajectory的时候，我们采集到一系列的 $o_1,\cdots,o_t$ ；每一次获得observation之后，我们都要用Q网络给出一个action。这就可能导致如果每一次都重新跑一遍模型，工作量会比较大（是 $t^2$ 量级）。但一个很好的方法是，对于RNN-based的模型，我们可以存下hidden state，这样的话输入一个新的 $o_t$ ，只需要进行一次计算即可。在实现的时候，我们可以把RNN hidden state存入replay buffer。对于transformer，需要采取别的优化方式。
113 | 
114 | ## RL for Language Models
115 | 
116 | （注：这部分结合了一部分guest lecture的内容来介绍）
117 | 
118 | 我们已经知道language model，比如GPT，的基本工作原理；但GPT如它的名字那样，只是一个transformer decoder，只能做autoregressive的生成。从GPT到ChatGPT，实际上有很长的一段路要走。比如，如何保证模型生成的内容没有恶毒信息？以及如何把单纯completion的language model变成善于做对话格式的应用模型？这些内容和RL就很有关系（当然，还有一些普通的DL方法，比如supervised fine-tuning(SFT)，等等，这里就不再介绍）。
119 | 
120 | 一个基本的思想是，我们把language model阐述为一个RL问题：state $s$ 是它获得的问题，而action是它给出的completion。reward如何获得呢？有一些机器的尝试，但人类还是必须要介入。一个重要的方法是**RLHF**(RL from Human Feedback)，也就是对于 $(s,a)$ 对，希望通过人类的某些反馈给出 $r(s,a)$ 。
121 | 
122 | 如何给出 $r(s,a)$ 呢？让人类输出浮点数，显然不现实；一个很好的方法是让人给出同一个state下面对两个action的**preference**。有了preference的统计，那么就可以建立一个模型来估计：
123 | 
124 | $$
125 | \text{Pr}(a_1\succ a_2|s) = \sigma(r_\phi(s,a_1)-r_\phi(s,a_2))
126 | $$
127 | 
128 | 其中， $r_\phi$ 是我们未知的reward模型；而 $\text{Pr}(a_1\succ a_2|s)$ 代表给定state $s$ 下，人类认为回答 $a_1$ 比回答 $a_2$ 好的概率（当然，这需要很多人来评价，而不是让一个人给出这个概率！）。 $\sigma$ 可以就取为sigmoid函数。我们可以把它转化为一个objective：
129 | 
130 | $$
131 | \phi = \arg\max_\phi \mathbb{E}_{(s,a_1\succ a_2)}[\log \sigma(r_\phi(s,a_1)-r_\phi(s,a_2))]
132 | $$
133 | 
134 | （其中，数据集包含了大量人类评分。比如，对于某个state $x$ 下面有两个回答 $y_1,y_2$ ，数据集可能有 $(x,y_1\succ y_2),(x,y_1\succ y_2),(x,y_2\succ y_1),(x,y_1\succ y_2)$ 这四组数据。）可以论证，这个objective下最优的 $\phi$ 就是满足前面的关系。
135 | 
136 | 那么，采集到很多数据对，我们就可以训练出 $r_\phi$ 这个模型；这样，我们就能够把问题转化为RL问题了！我们可以使用policy gradient等方法进行估计。注意到这个过程只有一步，所以实际上就是
137 | 
138 | $$
139 | \nabla_\theta J(\theta)=\mathbb{E}_{(s,a)\sim \pi_\theta}[\nabla \log \pi_\theta(a|s)\cdot r_\phi(s,a)]
140 | $$
141 | 
142 | 但是，请注意！这并不行，为什么？因为这有点像offline RL，注意到reward从开始训练 $\theta$ 之后就再也不变，因此我们的model很容易就可以疯狂地exploit $r_\phi$ 的漏洞（人类没有label的数据上面 $r_\phi$ 可以任意地错）。因此，我们需要类似offline RL里面的AWAC方法，加入一个restriction：
143 | 
144 | $$
145 | J(\theta)=\mathbb{E}_{s,a\sim \pi_\theta}[ r_\phi(s,a)] - \beta\mathbb{E}_s[\text{KL}(\pi_\theta(a|s)||\pi_0 (a|s))]
146 | $$
147 | 
148 | 其中， $\pi_0$ 是没有开始RL之前的policy。在这一目标基础上，一个叫做PPO的算法进行了一些小的修改，并使用它来训练。实验上，在人类的preference上，使用RL的方法fine-tune出来1.3B的模型可以吊打原始的175B的大模型。
149 | 
150 | ### Can We Do Better?
151 | 
152 | 但前面的方法有一些问题。首先，我们先来训练一个reward模型，再用它来训练。注意到reward模型也必须是很大的transformer（否则完全无法抵挡RL中 $\pi_\theta$ 的猛烈“攻击”！）。因此，这样的工作量实际上很大。
153 | 
154 | 不仅如此，我们可以看到前面的objective中， $\sigma(r_\phi(s,a_1)-r_\phi(s,a_2))$ 这一项有很大的漏洞： $r_\phi$ 可以加上一个任意的复杂网络 $b(s)$ ，loss都不变！一种观点是，模型多余的自由度往往意味着训练更加困难。这可能也会造成一系列问题。
155 | 
156 | 为了应对他们，一个重要的思想诞生了——**DPO(Direct Preference Optimization)**。这一方法要说难其实也不难想到，但是就是一个巨大的成就。它主要的工作是做了一步代数变形。
157 | 
158 | 我们知道，像AWAC一样，前面加入KL divergence之后的objective具有closed form的最优策略：
159 | 
160 | $$
161 | \pi_\theta(a|s)\propto\pi_0(a|s)\exp \left( \frac{1}{\beta}r_\phi(s,a)\right)
162 | $$
163 | 
164 | 人们想到，我们既然知道最优解，那这一步优化不如换成等价替换。这样，我们直接把policy代入原来的objective：
165 | 
166 | $$
167 | J_{\text{DPO}}(\theta)=\mathbb{E}_{(s,a_1\succ a_2)}\left[\log \sigma\left(\beta\cdot\left(\log \frac{\pi_\theta(a_1|s)}{\pi_0(a_1|s)} - \log \frac{\pi_\theta(a_2|s)}{\pi_0(a_2|s)}\right)\right)\right]
168 | $$
169 | 
170 | 这就是DPO的目标。可以看到，DPO基本一定优于前面的方法，因为它不仅只需要训练一个policy网络，而且还避免了 $r_\phi$ 存在漏洞的问题。
171 | 
172 | 一个更好玩的是，我们对DPO的objective计算梯度，也可以发现很有趣的意义。它是：
173 | 
174 | $$
175 | \nabla J_{\text{DPO}}(\theta)=\beta \mathbb{E}_{(s,a_1\succ a_2)}\left[p_{\theta}(a_2\succ a_1)\cdot [\nabla \log \pi_\theta(a_1|s)-\nabla \log \pi_\theta(a_2|s)]\right]
176 | $$
177 | 
178 | 其中
179 | 
180 | $$
181 | p_\theta(a_2\succ a_1)=\sigma\left(\beta\cdot\left(-\log \frac{\pi_\theta(a_1|s)}{\pi_0(a_1|s)} + \log \frac{\pi_\theta(a_2|s)}{\pi_0(a_2|s)}\right)\right)
182 | $$
183 | 
184 | 是model误判 $a_2$ 比 $a_1$ 好的概率。这很直观——我们总是让model尽量放大 $a_1$ 的概率，减少 $a_2$ 的概率；但我们让它最关注犯错的地方，而做对的地方可以少训练一些。
185 | 
186 | > 你也许会回忆起，你的高中（或者初中）老师也这样让你练习错题。你现在就会发现，他/她做的一切都是有深意的！
187 | 
188 | ## Multi-step Language Models
189 | 
190 | 我们最后来讨论一下RL和language model的进一步结合——chatbot 模型，也就是和人类能进行多轮对话的模型。如果机器每说一句话，人都给出一个评分，那这个问题就可以是一个RL的问题。
191 | 
192 | 我们应该采用何种RL方法呢？如果用policy gradients，因为它是on-policy的，因此我们可能需要大量地和人交互，这不一定可取。比较常见的是value-based method。此外，一定要注意，对话属于partially observed的问题，而一般采用的方法都是Method 3，即history states，把历史的对话cat起来作为state。
193 | 
194 | 一个有意思的问题是，我们如何定义一个timestamp？
195 | - 你可能认为这是显然的——一轮对话是一个timestamp。人类给出的初始prompt是 $o_1$ ，然后机器给出 $a_1$ ；人类返回的回答是 $o_2$ ，等等。这确实是常见的一种方法，称为**per-utterance** timestamp。但是，这意味着action space巨大！我们从未处理过如此大且discrete的action space的问题，可能遇到巨大挑战；
196 | - 另外一个选择是，我们把一个词当作一个timestamp，也叫做**per-token** timestamp。比如说，开始的system prompt是很多timestamp，但这中间机器只能作出“听”这个特殊的action；而接下来轮到机器说话的时候，它可以作出任意的词作为action，而受到的observation是特殊的“none” observation。以此类推。这样，action和observation的维度都不大；但对于比较长的对话，horizon也会特别长。
197 | 
198 | 我们分别介绍value-based方法在这两种情况下的应用。
199 | 
200 | ### Per-Utterance Timestamp
201 | 
202 | 对于per-utterance的问题，相对简单： $s$ 和 $a$ 都是话，所以我们可以使用两个（可共用参数）transformer，一个把 $s$ 来encode到latent，一个把 $a$ 来encode到latent，再在latent上面训练出Q 网络。在决策的时候，稍微采用一点技巧：我们再拿一个transformer过来，做autoregressive的生成并作beam search；但是，beam search的打分标准并非模型输出的概率：候选仍然从模型的输出概率中采样，但用Q function来对候选打分、筛选。这样，就可以近似地取到Q function的argmax。
203 | 
204 | 当然，我们也可以使用actor-critic的方法，额外训练一个actor来给出 $\pi(a|s)$ 。这个actor可能是一个transformer。当然，各种细节肯定也很复杂。
205 | 
206 | ### Per-Token Timestamp
207 | 
208 | 对于per-token的问题，稍微有些复杂。如图，我们用一个transformer来记录历史上的全部observation中的信息；对于给定的state，我们transformer模型（图中的横长方形）输出一个对于next token的预测；只不过，这一预测不是probability而是Q值。这样，我们可以像普通的LM生成文字一样，选取argmax给出一个policy。
209 | 
210 | ![](./assets/21-2.jpeg)
211 | 
212 | 需要注意的一个细节是，在训练中，我们有
213 | 
214 | $$
215 | Q(s,a)\leftarrow r(s,a)+\gamma \max_{a'} Q(s',a')
216 | $$
217 | 
218 | 但是根据问题设定，只有在机器说完一句完整的话之后才获得reward。因此， $r(s,a)$ 实际上对于大部分(state,action)对而言都是0。这可能导致训练更为困难。
219 | 
220 | 当然，无论使用哪一种方法，我们都还有一个重要的事情需要注意：不论怎么说，训练过程是一个offline的过程（因为我们训练的trajectory都是从最原始的普通language model这一policy采样得到的）。因此，第十六讲中的很多**Offline RL方法**需要被使用，如CQL，IQL等等。顺便一提，如果把Q当作这个“新的”语言模型的logits，那么原先最后形式的[CQL penalty](./16-offline-RL_2.md#another-method-controlling-q-values)就是**cross-entropy loss**，也就是限制模型输出的Q值在原先的next-token上面不能太低。直观看，这也是合理的。
221 | 
222 | 最后，作为总结，multi-step的对话模型还是很open的问题，当下也没有一个明确的针对它的结论。这一领域往往是我们介绍的众多RL方法和DL共同塑成的产物。
223 | 
224 | # Reference Papers
225 | 
226 | 1. [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)（最早的RLHF）
227 | 2. [Direct Preference Optimization: Your Language Model is Secretly a Reward Model](https://arxiv.org/abs/2305.18290)
228 | 3. [Recurrent Experience Replay in Distributed Reinforcement Learning](https://openreview.net/forum?id=r1lyTjAqYX)（Historical state）
229 | 4. [Learning Cooperative Visual Dialog Agents with Deep Reinforcement Learning](https://arxiv.org/pdf/1703.06585)（Multi-step chatbot）


--------------------------------------------------------------------------------
/lecture/notes-zh/23-challenge.md:
--------------------------------------------------------------------------------
  1 | # Challenges, Open Problems, and Philosophy
  2 | 
  3 | 本讲没有什么特别多的知识性内容，主要是一些回顾、讨论和介绍，大家看个乐呵。如果不感兴趣，我们就可以在此结束CS285的旅程了！🎉🎉🎉
  4 | 
  5 | ## RoadMap
  6 | 
  7 | 二十二讲，我们见了很多——是时候把前面的知识总结起来了。在PPT上面，有一张整体的roadmap；而我也来画一张自己的（差不多的）roadmap。
  8 | 
  9 | ![](./assets/23-1.jpeg)
 10 | 
 11 | 大体上来说，知识分为三个部分：“传统”的RL方法，包括Behavior Cloning，Policy Gradients, Value-based methods, Model-based methods，以及它们衍生出的众多方法，还有exploration的基本方法。然后是一些新的问题陈述，比如不允许和环境交互的offline RL，不要求最优而是要求模拟人类的soft optimality RL，反过来从demo学会reward的IRL，以及transfer learning和meta learning。最后，是应用，这里举了一个代表性的例子——RL应用于LLM。当然，细节都没有在这张图上出现。
 12 | 
 13 | 学习了如此之多，也许我们会觉得我们可以解决很多RL问题了。但实际上，还有很多挑战我们会面对。它们一般分为两类：
 14 | 
 15 | - algorithm challenge:你的算法收敛吗？多长时间收敛？需要费力调超参吗？和环境进行多少次交互？又能不能很快generalize到新的MDP？……
 16 | - assumption challenge:你的算法符合实际吗？
 17 |     - 任何一个任务必须有人的介入，否则RL agent就不知道该做什么。但人的介入有多种方式：
 18 |         - 人设计一个reward function（比如`gym`的那些环境，都是人为设计的reward function）；
 19 |         - 人提供一些demo（比如behavior cloning）；
 20 |         - 还有我们从来没有讨论过的。比如，可不可以，人对于两段agent的demo给出preference？人类和动物都可以这样学习啊！
 21 |         - 再比如，给agent一些描述目标的语言和图像的信息？
 22 | 
 23 | 我们接下来对这些挑战一一来讨论。
 24 | 
 25 | ## Stability
 26 | 
 27 | 这不必多说——我们都知道，RL的算法是以不稳定著称的。比如，Q learning注定不能保证收敛；而policy gradient注定具有很大的方差。这不值得遗憾或惋惜——人生难道不是也是如此吗？
 28 | 
 29 | 但是，了解它们不稳定的根源是重要的。我们来回顾一下：
 30 | 
 31 | - Q learning：不稳定的来源是，fit Q值不一定能保证完全准确，而是做一个投影。这看似很小的bias导致模型无法肯定地收敛。
 32 | - Policy Gradient：和Q learning不同，policy gradient基于严格的数学推导给出的objective，因此理论上可以收敛，也没有bias；但是，它的方差很大，因为它是基于sample的。虽然可以增大batch size，使用baseline，但是理论上为了稳定还是需要exponentially large的sample数目（这个论证我们没有cover）。
 33 | - model-based：你可能会奇怪，为何model-based也不收敛？按照道理，建立model的过程是一个普通的DL中学习分布的问题，而建立model之后我们可以无限采样，因此也能保证收敛。但是，实际上的问题有些尴尬——没有policy就没办法采样，也没办法训练model。而且，我们希望model训练的transition和我们的policy是相关的。这就导致，我们在最后的算法中**交替训练model和policy**，从而导致了不稳定。
 34 |     - 更严重的一个问题我们之前也提到过，policy很强，因此policy喜欢“利用”model中不该有的极大值（"erroneous maxima"），然后把训练过程搞得一团糟。这就是“上梁不正下梁歪”，也是model-based带有一丝adversarial attack的成分造成的。
 35 | 
 36 | 那么，你可能会问，调节超参数有帮助吗？你会发现，似乎在作业里，我们基本没有调节超参数的过程，和DL形成很鲜明的对比。这是因为RL的超参数调节是一个非常困难的问题，超参数对算法的影响也不像DL里那样的明确。其实，归根结底还是因为，RL的算法大多数本身不收敛，因此超参数的调节实际上作为一种“修补”，试着把它“修”得收敛，这当然很困难。
 37 | 
 38 | > 这里也有一个side question：人们其实一直不知道，为什么DL的模型不会严重地overfit？因为DL的模型参数远多于数据量，按道理应该具有在训练数据集上很快overfit的能力。
 39 | >
 40 | > 有人说是因为模型的 inductive bias；但也没有人能在这上面明确的理解为什么。但可以想到，这等效的代表着，巨大的神经网络结合上SGD，可以起到某种魔法般的“约束”的作用。
 41 | >
 42 | > 那么，我们能否理解这样的“魔法”，并把它应用到RL里面，使得训练更加稳定？比如，如果我们把神经网络变得特别大，是否会有帮助？在自然语言处理领域，人们发现 Transformer 模型的参数量越大，能力就越强，完全看不到边界——这就是所谓的 **scaling law**。那么，在 RL 领域，是否存在这样的 scaling law 呢？什么样的模型能够做到呢？这些都是 open question，等待着人们的解答。
 43 | 
 44 | ## Sample Efficiency
 45 | 
 46 | 我们来比较一下RL不同方法中的sample efficiency，从小到大：
 47 | - **gradient free methods**：其实我们甚至没有介绍这一类方法，它们是RL最原始的尝试，不需要梯度下降。它们的思想是借鉴“自然选择”的理念，让多个agent优胜劣汰，不断“进化”。可以想象到，这种方法的sample efficiency是最低的。一个典型的量级（比如Cheetah环境）是 **$10^8$ 次交互**，也就是 $10^6$ 个episode。对应到真实时间，这大概是15天。
 48 | - **fully on-policy policy gradients**：最原始的policy gradient方法，比如A3C（这个我们也没有仔细介绍，它是将Actor Critic方法进行并行的方法，感兴趣可以参考[这篇论文](https://arxiv.org/abs/1602.01783)）。这个方法的量级大概是 **$10^7$ 次交互**，也就是 $10^5$ 个episode。但是，值得一提的是，它可以高效地并行，因此不一定需要花特别长的时间，只要你有充足的算力。
 49 | - **advanced policy gradients**：基于TRPO和PPO的方法可以减少算法的“on policy程度”，进而提高sample efficiency。这个方法的量级大概是 **$10^6$ 次交互**，也就是 $10^4$ 个episode。
 50 | - **replay buffer-based method**：比如说Q-learning或者SAC，这些off-policy的方法可以利用replay buffer，进而提高sample efficiency。这个方法的量级大概是 **$10^5$ 次交互**，也就是 $10^3$ 个episode。对于SAC等方法，一个典型的概念是，解决real world的任务（不是通过模拟器），只需要**几个小时**就够了。
 51 | - **model-based method**：这些方法的量级大概是 $10^4$ 次交互，也就是 $10^2$ 个episode。
 52 | - 还有一些我们没有介绍的、更专注于sample efficiency的model-based方法。最强的方法可以在20分钟内完成一个真实世界的任务，比如开门。
 53 | 
 54 | 一个神奇的事情是，似乎每向下走一层，sample efficiency都提高了一个量级。这很利于我们对常见的算法需要的用时有一基础概念。
 55 | 
 56 | 当然，你也许会问，为什么我们还会prefer那些没有那么sample-efficient的方法呢？原因有很多，比如，那些方法也许可以高度并行；或者，它们需要更少的算力。比如，gradient-free method的计算量远远小于model-based方法，后者要存储两个神经网络在显存里并对它们做多轮梯度下降。
 57 | 
 58 | 不过不管如何，在和实际环境进行交互的问题里，我们还是更喜欢sample efficient的方法。
 59 | 
 60 | ## Generalization
 61 | 
 62 | 早在一开始的[Preliminaries](./0-preliminaries.md#from-the-results)，我们就从结果上对比了RL和DL的一个重要区别。但是，当时我们还不太清楚RL究竟在干什么。现在，我们回想一下我们处理RL问题的经历，会发现，DL和RL从结果上来看的巨大差异还包括另外一点：
 63 | - DL的模型虽然只是为了拟合数据，但它们的评估方法**强调generalize**。没有人关注你的模型在训练集的loss是0.01还是0.0001，人们只看你的模型在测试集上的表现。
 64 | - 而另一方面，虽然我们说RL的问题处理的任务更加困难，没有标准答案，但是RL的评估标准强调**mastery**，也就是把一件事情做的多好。从经历上我们也可以发现，我们只关心agent能在环境里能刷到多少cumulative reward。
 65 | 
 66 | 有人会问，RL的generalization呢？我们不得不承认，我们很难把一个MDP上面的模型应用到另外一个MDP上。为什么会这样？
 67 | 
 68 | 有人说，我们加大数据的量！但是在RL里这只不过是一个空的口号，因为数据是和环境交互产生的，是我们自己收集的。也就是RL的这一问题陈述的根基决定了我们很难像DL那样，通过增大数据量来提高generalization，如图所示。
 69 | 
 70 | ![](./assets/23-2.png)
 71 | 
 72 | 从另一个角度看，我们也并不是什么办法都没有。比如，对于**offline RL**的研究使得我们可以重复利用旧的数据。没准在不久的将来，人们就可以将各个MDP的数据放在一起也形成一个像ImageNet或者Common Crawl这样的数据集，并从上面用offline RL训练出来一个通用的agent。当然，在现在，这一问题还远超我们的预期。
 73 | 
 74 | 再比如，就如我们在第22讲介绍的那样，transfer learning和meta learning也许可以帮助我们解决这一问题。它们作为新兴的领域，也正在高速地发展并取得成就。
 75 | 
 76 | 但有人也许会说，我们应该知足了——一个MDP在RL里就是一个任务，你要求在MDP上还能generalize，就好像要求NLP的模型generalize到CV上一样。这也许有些道理，但作为有着信仰的研究者，我们应该意识到RL背负着比DL更深刻的重任：它的目标最终是通向一个像人类一样的智能体。事实证明，我们还有很长的路要走：现在，我们可以用6小时的训练让一个机器人学会奔跑，但只能在一个**无限大的平面上**。真实世界是如此复杂，脚下有各式各样的地貌，上空有风吹雨淋，这一切都似乎不影响一个几个月的婴儿学会走路。什么时候，我们的agent才能到达这里呢？
 77 | 
 78 | ## About Assumptions: How Humans Get Involved?
 79 | 
 80 | 我们再来讨论如何人类介入RL的问题。对于**reward**这一介入方式，我们已经有了很多的研究。但是我们也清楚，很多 human preference 无法用单一的 reward 来刻画。
 81 | 
 82 | > **Condorcet paradox**: 假设三个人的 preference 分别是 $A<B<C$, $B<C<A$, $C<A<B$，那么就会出现：
 83 | >
 84 | > 在 $A,B$ 面前，更多人选择 $B$;     
 85 | > 在 $B,C$ 面前，更多人选择 $C$;     
 86 | > 在 $C,A$ 面前，更多人选择 $A$。
 87 | 
 88 | 除了reward之外，demo是一个很好的方法。不仅是behavior cloning，我们甚至还可以通过某些方式像“老师教学生”一样，让人类教agent一些关键技巧，再令agent进行应用。比如，[这篇论文](http://www.jenskober.de/publications/Muelling2012AAAI.pdf)就用此方法让机器人学会打乒乓球。
 89 | 
 90 | 另外可能的方向是用语言做prompt。比如，[这里](https://arxiv.org/abs/1711.00482)就有一个例子。这可以让agent和人类的交互更贴近“合作者”的关系，更加自然。
 91 | 
 92 | 此外，preference也是很好的思路。[这篇论文](https://arxiv.org/abs/1706.03741)展示了一个例子，人类通过不断评价左右两段轨迹哪个更好，最终使得agent能够完成后空翻（想一想你能否设计出来一个后空翻的reward函数！）。可以观看这一[视频](https://youtu.be/WT0WtoYz2jE?t=68)，看agent是如何学会后空翻的。
 93 | 
 94 | 这也启示我们：RL的研究并不只是局限在最原始的问题的解决算法上；一个很重要的方向就是，如何提出一个新的问题陈述，使得它和实际更加贴近，更加有意义。
 95 | 
 96 | ## Philosophy: What is Reinforcement Learning, anyway?
 97 | 
 98 | 我们当然知道什么**是** RL。但是，RL**对于我们**是什么东西？
 99 | - 它仅仅是一种我们用来解决问题的**工具**，就像计算机、数学和物理一样？
100 | - 还是，它是一个模型，**描述**了人类、动物的学习方式？
101 | - 还是说，它其实就是“学习”这一现象作为一门“学科”的**基本规律**，就像物理定律描述着宇宙的基本动力学规律一样？（我们知道，物理学是不描述人类的“意识”的；也许描述“意识”，或者“学习”这一个“意识”的子集的，就是RL？）
102 | 
103 | 可以说，每个人会有不同的看法。CS285的instructor Sergey Levine曾说过，他自己就坚持第三种观点。当然，我在这里也并不是要推销任何的观点，所以让我们看一看坚持三种观点的人都会说些什么。
104 | 
105 | （以下纯属虚构！）
106 | 
107 | ### RL as a Engineering Tool
108 | 
109 | 小A就是持这一观点的人。
110 | 
111 | > “所谓RL，不过是干了那些人类懒得干，或者不会干的事情。比如，我们可以用几千个方程或者建模描述火箭，描述飞机和汽车。我们也可以通过物理学的分析来控制它们到达目的地。但是，对于一个走在路上的机器人，这个物理对于人类还是太复杂了，或者我们懒得做这个建模。RL很好啊——它减轻了我们的工作，人们付出智慧就可以了，计算的体力活可以交给GPU来做。”
112 | 
113 | > “换句话说，RL是一种技巧，把原先在纸上推导的复杂公式变成在电脑的内存里的gradient descent。它也没有那么高大上。你看，原来我们做的是：
114 | > 
115 | > - 建模；
116 | > - 推导公式，求解方程，进而根据环境进行控制；
117 | > 
118 | > 而现在，我们做的是：
119 | > 
120 | > - 建模；
121 | > - 构造一个simulator，契合RL的要求；
122 | > - 跑一个RL算法，让它进行控制。
123 | > 
124 | > 它们确实没有什么特别根本的差别。”
125 | 
126 | ### RL as a Model of Learning
127 | 
128 | > “当前的RL的发展还不太够。当我们的算法足够强大的时候，RL就应该能描述人类的学习方式”，小B说。
129 | 
130 | > “根据RL的问题陈述和定义，RL是描述人类学习原理的*唯一候选人*。我们也已经做了很多——RL算法可以在围棋和DOTA上击败人类，可以像人一样驾驶车辆。”
131 | 
132 | > “你也许会说，不对啊，人类生下来几个月就会爬行，但这对于RL可能很困难；相反，对于人类复杂抽象的国际象棋问题，RL却最早地解决了。你会因此怀疑，RL并非真正描述了人类的学习？因为人类最自然最擅长的那些——识别人脸，拿起一个物体，在混乱的人群中移动自己的身体来躲避碰撞——对于RL都是困难的。”
133 | 
134 | > “但是，这实际上是著名的**Moravec's Paradox**。你说的这些例子，不是说"RL连最简单的事都做不到"，而是你搞错了——这些实际上都是很难的问题。**只不过是因为，自然选择带来的幸存者偏差，我们人类甚至地球上的绝大部分生物都擅长这些事情**。人类可以不会开车，不会下围棋和国际象棋；但如果我们不善于移动自己的身体，不善于辨别环境的危险，我们就会被其他的，更擅长这些的生物所取代。”
135 | 
136 | > “如果总结一下，就是说：
137 | > - 人类极其善于处理corner case，或者未经历过的事情，比如走路的时候即使被绊也很容易恢复平衡；
138 | > - 但是RL对于未见的环境表现不一定好，因此现在我们还未达到人类的水平；
139 | > - 但从另外一个方面来讲，从RL的定义来看，解决这一问题，只有RL才有可能做到。
140 | > - 而我们在实验中很少这么做！
141 | > 
142 | > 所以，别继续在Cheetah和Humanoid上面刷分了！想一想，我们如何才能**generalize**，才能让agent迅速**适应**新的环境？”
143 | 
144 | 此外，一些客观的事实：带着这一思想，很多人进行了让RL在**真实的**环境中的尝试。什么叫做“真实的”环境？不是指不用simulator，而是我们试图刻画**复杂环境（即人类生活的环境）或者人类学习中的特征**。比如：
145 | - 在真实环境中，我们不能reset。
146 |     - 本来要接咖啡，结果咖啡泼到了桌子上，不能reset，而是要agent自己想办法。怎么办？[研究者](https://arxiv.org/abs/2104.11203)（注：下面的例子不完全是论文里的例子）想到了，我们（比如说）训练三个任务：
147 |         - 接咖啡；
148 |         - 把地上的咖啡杯捡起来；
149 |         - 把洒出去的咖啡擦干净。
150 |     - 它们的特点是，任何一个任务失败了，都可以练习其他的任务！这就很合理——比如，在你做化学实验之前，你的老师一定会告诉你如何处理中毒，腐蚀等问题，并确保你会处理这些情况才让你开始实验。这里，机器人很有可能也先学会捡起咖啡杯，最后才学会接咖啡。
151 | - 人类在学习一个任务的时候，并非从random exploration开始。
152 |     - 比如，当你第一次学习游泳，你就算不会任何泳姿，你也不应该让你的每一个关节以各向同性均匀高斯分布来转动和发力。如果你有这样的经历，你会发现你实际上大概会在水里“扑腾”，虽然用的力不一定在这一新的环境里特别有效，但至少你**recall了过去的经历**，并利用了它们。
153 |     - 因此，[这篇论文](https://arxiv.org/abs/2011.10024)就做了这样的尝试。它让机器人在开始训练之前有一个prior，里面的action不一定有利于完成任务，但至少是有意义的（比如，随机的动作就属于没有意义的）。实验上可以发现，一开始在这样的action prior中sample并explore，可以让学习的过程变得更加高效。
154 | 
155 | > “是的！”，小B说：“只有我们RL训练的环境越来越真实，我们才能期待RL agent进行emerge，最后出现我们认可的‘智能’”。
156 | 
157 | ### RL as the Universal Learning Method
158 | 
159 | > “我们可以把RL理解为一种根本性、实质性的理论。”，小C说。
160 | 
161 | > “如同研究神经科学的生物学家Daniel Wolpert所说，‘我们拥有大脑只为了一件事——那就是**作出复杂的且适应性强**的动作’。我们也可以把它搬到learning这一领域：我们需要学习，只为了作出复杂的且适应性强的**决策**。”
162 | 
163 | > “你也许会说，在DL里，我们并没有做决策啊？但DL难道不是一种学习吗？但是，实际上你想错了。DL的模型也在做决策——并不是说它们决策图片的label是什么，而是它们决定图片的label之后，发生了什么。比如，它受到了cross entropy loss的惩罚，它才根据这个惩罚来调整自己的决策。”
164 | 
165 | > “在DL里，我们通常在很大但质量不高的‘垃圾’数据集上做pretrain，而在少数高质量的数据集上finetune。你会发现，其实垃圾的数据集就像是环境，我们从中不学习决策，只是学习dynamics；而高质量的数据集如同human给出的指示，比如reward，我们在上面利用过去的知识学会任务。”
166 | 
167 | > “我的梦想是，有朝一日我们会有一个world dataset，包含了世界上所有有意义的行为。比如，做饭炒菜，扫地，开车，甚至是写代码，都在里面。当然，对于一个指定的task，这里面的大部分数据都属于垃圾；但我们的approach是，我们在这个world dataset上pretrain，做unsupervised的offline RL，然后让agent和真正目标的环境进行交互，完成downstream task。”
168 | 
169 | 事实上，也有一系列研究关注这一问题，它们主要都关注于如何对大的数据集进行unsupervised learning。比如，之前提到的，我们可以做goal conditioned learning；或者，可以让模型自己生成一些有意义的数据，然后把它们当作demo来学习（这篇[文章](https://arxiv.org/abs/2311.05584)）。
170 | 
171 | ## The Final Question, and the Future
172 | 
173 | 我们最后来思考这样一个问题：RL为什么能学会？我们又为什么能学会？
174 | 
175 | 后者很难回答，只不过（也许）这不是搞AI的人们特别要担心的事情。但是，我们的学习方式大概可以归结为两个方面：
176 | - imitation：人类生活在社会里，在遇到未接触的事物时，我们倾向于会模仿他人的行为，从而避免一开始就犯错误——毕竟，人生无法reset。
177 | - understanding：这是更难刻画的了。我们还不知道我们的大脑如何完成图像识别，如何理解语言，更不知道大脑如何控制我们的行为。但是，通过某种方式，我们随着对问题的理解，可以理解他人的意图，而逐渐脱离简单的模仿，来拥有自己的价值观和理念（**Value**）。
178 | 
179 | 前者也难回答。为什么DL能学会？我们甚至答不出来。但是，如果把DL当作一个oracle，那么我们就可以大致地回答：Deep Reinforcement Learning能学会，是因为RL算法可以做decision making，而DL的大模型保证了end-to-end training可以work。我们可以进一步归结到几个方面：
180 | - unsupervised learning：在和环境（或者更准确地说，reward）交互之前，我们可能通过offline的数据集进行goal conditioned之类的学习。
181 | - model learning：我们建立对于环境的模型，这某种程度上也模拟了人类的行为（猜测别人的心思，或者预测可能的后果）。
182 | - policy learning：看似reward只是一个标量的数值，给出的信息远不如DL中图片/label那样丰富；但是，RL的算法实际上带来了巨量信息。比如，就想一想Value backup，它能某种程度上“打通”整个MDP的结构。
183 | 
184 | 我们该究竟如何尝试回答这两个问题呢？也许，现在还没有人知道。如同Sergey Levine教授说的那样，我们应该“Think Big and Start Small”。相信最终有一天，人们会给出一个令人信服的答案。


--------------------------------------------------------------------------------
/lecture/notes-zh/3-pytorch.md:
--------------------------------------------------------------------------------
 1 | # For a full pytorch tutorial, see [notebook](./CS_285_Fa23_PyTorch_Tutorial.ipynb)
 2 | 
 3 | # Detach
 4 | 
 5 | - `t.detach()` returns a new tensor that is detached from the computation graph. However, this tensor shares the same underlying storage as the original tensor `t`, so an in-place modification of one is visible in the other.
 6 | - just calling `detach()` won't destroy the computational graph.
 7 | 
 8 | ```python
 9 | x = torch.tensor([1.,2.],requires_grad=True)
10 | xfang = x * x
11 | xlifang = x * xfang
12 | xfang_detached = xfang.detach()
13 | loss = xlifang.sum()
14 | loss.backward()
15 | print(x.grad) # Not None
16 | ```
17 | 
18 | # Clone
19 | 
20 | - If you want to mutate `t` after detaching it from the graph, you should use `t.detach().clone()`, so that the mutation won't affect `t` in the graph.
21 | 
22 | # Backward
23 | 
24 | - Can backward twice for one leaf tensor `x`, but can't backward for one non-leaf tensor `y` twice. For example, this is possible
25 | ```python
26 | x = torch.tensor([1.,2.],requires_grad=True)
27 | y = (x * x).sum()
28 | z = (x * x).sum()
29 | y.backward()
30 | z.backward()
31 | ```


--------------------------------------------------------------------------------
/lecture/notes-zh/4-intro2RL.md:
--------------------------------------------------------------------------------
  1 | # Markov Decision Process (MDP)
  2 | 
  3 | ## Basic Concepts
  4 | 
  5 | **Markov Chain**：
  6 | 
  7 | $$
  8 | \mathcal{M}=\{S,T\}
  9 | $$
 10 | 
 11 | 其中， $S$ 被称为**state space**； $T$ 被称为**transition matrix**。 我们可以写出状态分布随时间的演化：
 12 | $$
 13 | p_{t+1}(s')=\sum_{s\in S}T(s',s)p_t(s)
 14 | $$
 15 | 
 16 | 这就是一个简单的矩阵乘法。
 17 | 
 18 | **Markov Decision Process**：
 19 | 
 20 | $$
 21 | \mathcal{M}=\{S,A,T,r\}
 22 | $$
 23 | 
 24 | 其中， $S$ 是**state space**， $A$ 是**action space**， $T$ 是**transition matrix**。我们有
 25 | 
 26 | $$
 27 | p_{t+1}(s')=\sum_{s,a}T(s',s,a)p_t(s)p^{(A)}_t(a)
 28 | $$
 29 | 
 30 | 这里 $p^{(A)}_t(a)$ 代表在时刻 $t$ 来take action $a$ 的概率。
 31 | 
 32 | 
 33 | $r$ 是 **reward function**，对于任意(state,action)对，它给出
 34 | 
 35 | $$
 36 | r(s,a)\in \mathbb{R}
 37 | $$
 38 | 
 39 | 在 **Partially Observed MDP (POMDP)** 中，我们还必须引入**observation** $o_t$ 。但在接下来，我们均不考虑这一情况。
 40 | 
 41 | ## Goal
 42 | 
 43 | 我们的目标是最大化 **cumulative reward**:
 44 | $$
 45 | \theta^\star=\arg\max_\theta \mathbb{E}_{\tau\sim p_\theta(\tau)}\left[\sum_{t=0}^T r(s_t,a_t)\right]
 46 | $$
 47 | 
 48 | 其中 $\tau$ 代表轨迹 $(s_0,a_0,s_1,a_1,\ldots)$ , $p_\theta(\tau)$ 代表在策略 $\pi_\theta$ 下得到该轨迹的概率。
 49 | 
 50 | 我们还希望把这一表达式扩展到 $T=\infty$ 。(注意现在的情况中有些问题，因为 $T\to \infty$ 时 $\tau$ 的probability space趋于无限大。) 为此，定义 $p_{\theta}(s_t,a_t)$ ，它是在时间 $t$ 获得 $(s_t,a_t)$ 的概率。 （注：参见第二讲**Notation**部分的重要注意事项， $p_\theta$ 对于不同的 $t$ 不是一个分布）
 51 | 
 52 | 我们有
 53 | $$
 54 | p_{\pi_\theta}((s_{t+1},a_{t+1})|(s_t,a_t))=p(s_{t+1}|s_t,a_t){\pi_\theta}(a_{t+1}|s_{t+1})
 55 | $$
 56 | 
 57 | 因此，可以重写goal为
 58 | $$
 59 | \theta^\star=\arg\max_\theta \sum_{t=0}^T\mathbb{E}_{(s,a)\sim p_{\pi_\theta}(s_t,a_t)}\left[ r(s,a)\right]
 60 | $$
 61 | 
 62 | 这一表达就很容易扩展到 $T\to \infty$ 的时候了。
 63 | 
 64 | 
 65 | ## Value Function and Q Function
 66 | 
 67 | Value function是一个很重要的概念。它定义了一个state的价值，即从这个state出发、按照当前policy行动，所能获得的期望累积reward。为了引出它，我们先把目标 $J$ 展开：
 68 | 
 69 | $$
 70 | J=\sum_{t=0}^T\mathbb{E}_{(s_t,a_t)\sim p_{\pi_\theta}(s_t,a_t)}\left[ r(s_t,a_t)\right]
 71 | $$
 72 | 
 73 | $$
 74 | =\mathbb{E}_{s_0\sim p(s_0)}\left[\mathbb{E}_{a_0\sim \pi_\theta(a_0|s_0)}\left[r(s_0,a_0)+\sum_{t= 1}^T\mathbb{E}_{(s_t,a_t)\sim p_{\pi_\theta}(s_t,a_t|s_0,a_0)}\left[ r(s_t,a_t)\right]\right]\right]
 75 | $$
 76 | 
 77 | 我们也可以从中定义著名的**Q-Function**：
 78 | 
 79 | $$
 80 | Q^{\pi_\theta}(s_t,a_t)=r(s_t,a_t)+\sum_{i={t+1}}^T\mathbb{E}_{(s_i,a_i)\sim p_{\pi_\theta}(s_i,a_i|s_{t},a_{t})}\left[ r(s_i,a_i)\right]
 81 | $$
 82 | 
 83 | （注意，这里还是一样的问题：如果 $T$ 有限，那么 $Q^{\pi_\theta}(\cdot,\cdot)$ 对于不同的 $t$ 很可能不是一个函数，但这一点从记号上没有显示出来。可以发现很多RL的记号都存在这种问题，需要自己意会。）
 84 | 
 85 | 和**Value Function**：
 86 | 
 87 | $$
 88 | V^{\pi_\theta}(s_t)=\mathbb{E}_{a_{t}\sim \pi_\theta(a_t|s_t)}\left[Q^{\pi_\theta}(s_t,a_t)\right]
 89 | $$
 90 | 
 91 | 这样我们的目标就变成了
 92 | 
 93 | $$
 94 | J=\mathbb{E}_{s_0\sim p(s_0)}\left[V^{\pi_\theta}(s_0)\right]=\mathbb{E}_{(s_0,a_0)\sim p_{\pi_\theta}(s_0,a_0)}\left[Q^{\pi_\theta}(s_0,a_0)\right]
 95 | $$
 96 | 
 97 | 此外，我们还有一个重要的，联系 $Q$ 和 $V$ 的关系：
 98 | 
 99 | $$
100 | Q^{\pi_\theta}(s_t,a_t)=r(s_t,a_t)+\mathbb{E}_{s_{t+1}\sim p(s_{t+1}|s_t,a_t)}\left[V^{\pi_\theta}(s_{t+1})\right]
101 | $$
102 | 
103 | 这使得我们可以使用**Dynamic Programming**的方法计算 $Q$ 和 $V$ 。之后，我们会讨论这一方法。
104 | 
105 | 
106 | ### Planing with Q and V
107 | 
108 | $Q,V$ 的重要性在于，它们可以很好地表达出我们的goal。换句话说，如果我们有了 $Q,V$ ，我们就可以优化policy。
109 | 
110 | 比如说，如果我们有 $Q^{\pi}(s,a)$ ，我们就有一个最好的策略：
111 | 
112 | $$
113 | \pi(a^\star|s)\leftarrow 1,\quad a^\star=\arg\max_a Q^{\pi}(s,a)
114 | $$
115 | 
116 | 除此之外，由于 $V$ 是 $Q$ 的期望值，我们也就知道我们的policy应该选择一个好的 $a$ ，使得 $Q(s,a)\ge V(s)$ 。这些直觉是很重要的，之后会讨论。
117 | 
118 | 
119 | # RL Algorithms Overview
120 | 
121 | 所有的RL算法都遵循这张图的结构：
122 | 
123 | ![](./assets/4-1.png)
124 | 
125 | "Generating Samples" 通常是比较容易的，因为只需要环境完成。最关键的部分是绿色和蓝色的部分：**Reward Evaluation** 和 **Policy Improvement**。
126 | 
127 | 下面是一些常见的RL算法：（我们会在接下来的几讲介绍它们）
128 | 
129 | | Algorithm | Reward Evaluation | Policy Improvement |
130 | | --- | --- | --- |
131 | | Policy Gradients| $J$ is sum of rewards | Gradient Ascent on $J$ |
132 | | Value-based | Learn $V,Q$ of the **optimal policy** | Improve policy using $V,Q$ (with the intuition discussed above)|
133 | | Actor-Critic | Learn $V,Q$ of the **current policy** | Improve policy using $V,Q$ |
134 | | Model-based | Learn $f_\phi$ , $s_{t+1}=f_\phi(s_t,a_t)$ (Simulate the env) | Backprop onto $\pi_\theta$ using $f_\phi$ |
135 | 
136 | ## Tradeoff
137 | 
138 | - **sample efficiency**: 
139 |     - 这也被称为是否 "off-policy" ？"off-policy" 指的是，就算policy改变了，我们也可以利用之前的数据。这样的算法更加sample efficient。
140 |     - 我们什么时候要care sample efficiency呢？可以发现，如果我们的环境是一个真实的物理环境，那么sample efficiency就很重要了。但如果我们的环境是一个模拟环境，我们就不用管这个。
141 |     - 可以发现，Policy Gradient 方法是 on-policy 的：当 policy 改变的时候，我们需要重新获取若干 sample 才能正确获取 reward 的梯度。反之，Value-based 和 Model-based 方法则是 off-policy 的，因为可以利用之前的训练数据，故有更高的 sample efficiency。
142 | - **wall clock time**:这是计算复杂度的一个简称。如果我们的模型要高频地与环境交互，那么计算复杂度就很重要了。
143 | - **stability**:这是指算法的稳定性。policy gradient作为一个梯度下降算法，总是可以保证convergence；但其余不采用 GD 的 RL 方法，比如 Value function fitting，在最坏情况下没有任何的理论收敛保证。
144 | 
145 | ## Common Assumptions
146 | 
147 | - **Full Observability**. 经常出现在 value function fitting 方法中。假设模型能够看到环境中的一切信息，i.e. $s_t=o_t$. 可以用循环模型架构缓解 partial observation。
148 | - **Episodic Learning**. 经常出现在纯粹的 policy gradient，以及一些 model-based 方法中。这意味着 agent 与环境的交互由一个个 episode 组成，每个 episode 有始有终，并有明确的目标。
149 | - **Continuity or Smoothness**. 在 model-based 方法中经常出现。


--------------------------------------------------------------------------------
/lecture/notes-zh/5-policy_grad.md:
--------------------------------------------------------------------------------
  1 | # Policy Gradient Algorithm
  2 | 
  3 | 我们直接优化目标
  4 | 
  5 | $$
  6 | J(\theta)=\mathbb{E}_{\tau\sim p_{\pi_\theta}(\tau)}\left[\sum_t r(s_t,a_t)\right]
  7 | $$
  8 | 
  9 | 我们计算梯度，并使用一个小的数学技巧：
 10 | 
 11 | $$
 12 | \nabla_\theta J(\theta)=\nabla_\theta\left[\int p_{\pi_\theta}(\tau)\sum_t r(s_t,a_t)d\tau\right]
 13 | $$
 14 | 
 15 | $$
 16 | =\int p_{\pi_\theta}(\tau) \nabla_\theta \log p_{\pi_\theta}(\tau)\sum_t r(s_t,a_t)d\tau
 17 | $$
 18 | 
 19 | $$
 20 | =\mathbb{E}_{\tau\sim p_{\pi_\theta}(\tau)}\left[\nabla_\theta \log p_{\pi_\theta}(\tau)\sum_t r(s_t,a_t)\right]
 21 | $$
 22 | 
 23 | $$
 24 | =\mathbb{E}_{\tau\sim p_{\pi_\theta}(\tau)}\left[\left(\sum_t \nabla_\theta\log \pi_\theta(a_t|s_t)\right)\left(\sum_t r(s_t,a_t)\right)\right]
 25 | $$
 26 | 
 27 | 当然，在实际上，我们一般通过采样 $N$ 条轨迹来估计梯度：
 28 | 
 29 | $$
 30 | \nabla_\theta J(\theta)\approx \frac{1}{N}\sum_{n=1}^N \left(\sum_t \nabla_\theta\log \pi_\theta(a_t|s_t)\right)\left(\sum_t r(s_t,a_t)\right)
 31 | $$
 32 | 
 33 | 这一算法也叫做**REINFORCE** algorithm。 它包含三步（就像第四讲提到的那样）：
 34 | 
 35 | 1. Sample $N$ trajectories $\tau_n$ by $\pi_\theta$ .
 36 | 2. Compute the gradient of $J(\theta)$ approximately
 37 | 3. Update $\theta$ by gradient ascent (we are maximizing $J$).
 38 | 
 39 | *Comments.* 直观上看，对于使得reward $\sum r(s_t,a_t)$ 很大的 $a_t$ ，梯度 $\nabla \log \pi_\theta(a_t|s_t)$ 的"learning rate"更大，所以模型会倾向于把更大的概率给这个action。
 40 | 
 41 | 当只有 partial observability 时，也可以直接修改 $J(\theta)$ 的梯度为
 42 | $$
 43 | \nabla_\theta J(\theta)\approx \frac{1}{N}\sum_{n=1}^N \left(\sum_t \nabla_\theta\log \pi_\theta(a_t|o_t)\right)\left(\sum_t r(s_t,a_t)\right).
 44 | $$
 45 | 
 46 | ## Issues with the vanilla Policy Gradient
 47 | 
 48 | 这个算法听起来很直观并且很好实现，但是实际上会发现有一定的问题：**reward出现的方式不对**。
 49 | 
 50 | 设想现在我们把每一个reward加一个常数，这个体系本身不应该受影响；但现在的形式中reward作为一个类似于"learning rate"的形式乘在上面，这就导致了可能的问题。
 51 | 
 52 | - 举个例子：假设现在我们取3条轨迹，他们的reward分别为-2,1,1，那么（粗略来说）第一条的概率会下降，第二、三条的概率会上升。但是假设现在reward全局增加一个常数，导致它们变成了100,103,103,那么可以想到它们三条轨迹的概率都会上升，这明显是不对的（因为第一条轨迹在取样的三条中都是最差的，很难说他是一条很好的轨迹）！
 53 | - 但是另一方面，如果样本足够多( $N\to\infty$ )，根据理论我们知道整体reward的平移不会影响policy的gradient。
 54 |     - 注意上面的解释只不过是intuition；假设现在我们每一个reward都加了一个100导致全部是正的了，这并不会导致所有概率都增加（因为最后的梯度并不是传到概率上，而是传到概率分布的parameters上！）。可以想象，如果所有reward都等于100，那么就像所有都是0一样不会有任何update发生。
 55 |     - 因此，这实际上是policy gradient的一个典型问题：**估计本身是unbiased的，但在sample数量不够的时候会产生很大的variance，从而导致单次估计具有很大的误差**。
 56 | 
 57 | 为了解决这个问题，引入了重要的两个方法：**baseline**和**causality**。
 58 | 
 59 | ### Baseline
 60 | 
 61 | Baseline直接对上面的问题开刀：既然reward上叠加的uniform constant不影响gradient，那我们不妨修改一下梯度的表达式:
 62 | 
 63 | $$
 64 | \nabla_\theta J(\theta)\approx \frac{1}{N}\sum_{n=1}^N \nabla_\theta\log p_{\pi_\theta}(\tau_n)\left(r(\tau_n)-b\right)
 65 | $$
 66 | 
 67 | 这里的 $b$ （称为**baseline**）应该是任意的，因为
 68 | $$
 69 | \mathbb{E}_{\tau\sim p_{\pi_\theta}(\tau)}\left[\nabla_\theta\log p_{\pi_\theta}(\tau)\right]=0.
 70 | $$
 71 | 注意到这里的 $b$ 可以任意选取，所以我们可以选取使得方差变小的 $b$，比如
 72 | $$b=\dfrac{1}{N}\sum\limits_{n=1}^N r(\tau_n).$$
 73 | 如果直接令 $\nabla_\theta J(\theta)$ 的方差最小化，可以解得
 74 | $$
 75 | b=\dfrac{\mathbb{E}[(\nabla_\theta \log p_{\pi_\theta}(\tau))^2r(\tau)]}{\mathbb{E}[(\nabla_\theta \log p_{\pi_\theta}(\tau))^2]}.
 76 | $$
 77 | ### Causality
 78 | 
 79 | Causality关注到了一个不易察觉的问题：我们原来的表达式其含义是， $\pi_\theta(a_t|s_t)$ 的梯度依赖于整个过程的reward大小 $\sum_{t}r(s_t,a_t)$ ；但实际上这并不合理——根据因果性， $t$ 时刻的决策不会影响 $t'<t$ 时刻的reward。因此，即使前面几步走的比较差，你不能让后面走的很好的步骤也蒙受这个惩罚。
 80 | 
 81 | 按照这个思路，我们直接改写
 82 | 
 83 | $$
 84 | \nabla_\theta J(\theta)\approx \frac{1}{N}\sum_{n=1}^N \sum_{t=1}^T \nabla_\theta\log \pi_\theta(a_t|s_t)\left(\sum_{t'=t}^T r(s_{t'},a_{t'})\right)
 85 | $$
 86 | $$
 87 | =\frac{1}{N}\sum_{n=1}^N \sum_{t=1}^T \nabla_\theta\log \pi_\theta(a_t|s_t)\hat{Q}^{\pi_\theta}_{n,t}
 88 | $$
 89 | 
 90 | 其中的 $\hat{Q}^{\pi_\theta}_{n,t}$ 有点类似于Q-function但并不是——它是依赖于路径的，“未来所有reward之和”。
 91 | 
 92 | 数学上，也可以直接证明这样的causality表达式和原先的表达式在 $N\to\infty$ 时是等价的，因为
 93 | 
 94 | $$
 95 | \mathbb{E}_{\tau\sim p_{\pi_\theta}(\tau)}\left[\nabla \log \pi_\theta(a_t|s_t)\left(\sum_{t'<t}r(s_{t'},a_{t'})\right)\right]=0
 96 | $$ 
 97 | 
 98 | 只不过，我们用取样 $N$ 次来估计期望，所以这些看似是 $0$ 的项变成非 $0$ 的了，会贡献一部分方差。将这些部分去掉就能显著减少训练方差，稳定训练。
 99 | 
100 | # Off-Policy Policy Gradients
101 | 
102 | 为了计算 $J$ 的梯度，我们需要取样 $\tau\sim p_{\pi_\theta}(\tau)$，所以 policy gradient 是 on-policy 的，造成**采样效率**的问题。
103 | 
104 | 当然，一般policy gradient都是用在sample efficiency不需要特别考虑的地方，但是我们还是讨论一类特别的方式，把off-policy的思想引入policy gradient。
105 | 
106 | 关键在于采用 **Importance Sampling**。假设现在我们有 $p_{\pi_{\bar{\theta}}}$ 这个分布中取样的若干样本，那么我们就可以用importance sampling利用这些样本来估计 $p_{\pi_\theta}$ 分布中某些东西的期望。具体地，我们写出
107 | 
108 | $$
109 | \nabla_{\theta}J(\theta)=\mathbb{E}_{\tau\sim p_{\pi_\theta}(\tau)}\left[\nabla_{\theta}\log p_{\pi_\theta}(\tau)\sum_t r(s_t,a_t)\right]
110 | $$
111 | 
112 | $$
113 | =\mathbb{E}_{\tau\sim p_{\pi_{\bar{\theta}}}(\tau)}\left[\frac{p_{\pi_\theta}(\tau)}{p_{\pi_{\bar{\theta}}}(\tau)}\nabla_{\theta}\log p_{\pi_\theta}(\tau)\sum_t r(s_t,a_t)\right]
114 | $$
115 | 
116 | $$
117 | =\mathbb{E}_{\tau\sim p_{\pi_{\bar{\theta}}}(\tau)}\left[\prod_{t=1}^T\frac{{\pi_\theta}(a_t|s_t)}{{\pi_{\bar{\theta}}}(a_t|s_t)}
118 | \left(\sum_{t=1}^T \nabla_{\theta}\log \pi_\theta(a_t|s_t) \right)\left(\sum_{t=1}^T r(s_t,a_t)\right)\right]
119 | $$
120 | 
121 | 这样，在连续几轮对policy的训练中，我们只需要在一开始对策略 $\pi_{\bar{\theta}}$ 进行一个采样，就可以用于几轮的训练。我们成功把off-policy的思想引入了policy gradient。
122 | 
123 | 但是这样仍然存在问题：我们注意到，这里有一个 $T$ 个数的连乘积，如果网络的输出比较大或者小，这就很容易导致等式左边（也就是梯度）过大或过小。因此，我们需要进行一些近似（或严格变形）。
124 | 
125 | ## With Causality
126 | 
127 | 这个计算有些复杂。我们首先从旧的causality表达式开始（不做importance sampling）：
128 | 
129 | $$
130 | \nabla_\theta J(\theta)=\mathbb{E}_{\tau\sim p_\theta(\tau)}\left[ \sum_{t=1}^T \nabla_\theta\log \pi_\theta(a_t|s_t)\left(\sum_{t'=t}^T r(s_{t'},a_{t'})\right)\right]
131 | $$
132 | 
133 | $$
134 | =\sum_{t=1}^T\sum_{t'=t}^T \mathbb{E}_{\tau\sim p_\theta(\tau)}\left[r(s_{t'},a_{t'}) \nabla_\theta\log \pi_\theta(a_t|s_t) \right]
135 | $$
136 | 
137 | $$
138 | =\sum_{t=1}^T\sum_{t'=t}^T \sum_{s_1,a_1,\cdots,s_T,a_T}p(s_1)\pi_\theta(a_1|s_1)\cdots p(s_T|a_{T-1},s_{T-1})\pi_\theta (a_T|s_T)r(s_{t'},a_{t'}) \nabla_\theta\log \pi_\theta(a_t|s_t)
139 | $$
140 | 
141 | （接下来这一步的思想是对概率分布使用causality，但数学上是严格等的）
142 | 
143 | $$
144 | =\sum_{t=1}^T\sum_{t'=t}^T \sum_{s_1,a_1,\cdots,s_{t'},a_{t'}}p(s_1)\pi_\theta(a_1|s_1)\cdots p(s_{t'}|a_{{t'}-1},s_{{t'}-1})\pi_\theta (a_{t'}|s_{t'})r(s_{t'},a_{t'}) \nabla_\theta\log \pi_\theta(a_t|s_t)
145 | $$
146 | 
147 | $$
148 | =\sum_{t=1}^T\sum_{t'=t}^T\mathbb{E}_{\tau_{\le t'}\sim p_{\bar{\theta}}(\tau_{\le t'})}\left[\prod_{t''=1}^{t'}\frac{{\pi_\theta}(a_{t''}|s_{t''})}{{\pi_{\bar{\theta}}}(a_{t''}|s_{t''})}r(s_{t'},a_{t'})\nabla_{\theta}\log \pi_\theta(a_t|s_t)\right]
149 | $$
150 | 
151 | $$
152 | =\sum_{t=1}^T\sum_{t'=t}^T\mathbb{E}_{\tau \sim p_{\bar{\theta}}(\tau)}\left[\prod_{t''=1}^{t'}\frac{{\pi_\theta}(a_{t''}|s_{t''})}{{\pi_{\bar{\theta}}}(a_{t''}|s_{t''})}r(s_{t'},a_{t'})\nabla_{\theta}\log \pi_\theta(a_t|s_t)\right]
153 | $$
154 | 
155 | $$
156 | =\sum_{t=1}^T\mathbb{E}_{\tau \sim p_{\bar{\theta}}(\tau)}\left[\nabla_{\theta}\log \pi_\theta(a_t|s_t)\cdot \prod_{t''=1}^{t}\frac{{\pi_\theta}(a_{t''}|s_{t''})}{{\pi_{\bar{\theta}}}(a_{t''}|s_{t''})}\sum_{t'=t}^T\left(r(s_{t'},a_{t'})\prod_{t''=t+1}^{t'}\frac{{\pi_\theta}(a_{t''}|s_{t''})}{{\pi_{\bar{\theta}}}(a_{t''}|s_{t''})}\right)\right]
157 | $$
158 | 
159 | ## With First-order Approximation
160 | 
161 | First-order approximation是指，我们丢掉除 $t$ 时刻以外（即 $t''\ne t$ ）的所有importance weight项，给出
162 | 
163 | $$
164 | \nabla_\theta J(\theta)\approx\sum_{t=1}^T\mathbb{E}_{\tau \sim p_{\bar{\theta}}(\tau)}\left[\frac{{\pi_\theta}(a_{t}|s_{t})}{{\pi_{\bar{\theta}}}(a_{t}|s_{t})}\nabla_{\theta}\log \pi_\theta(a_t|s_t)\sum_{t'=t}^Tr(s_{t'},a_{t'})\right]
165 | $$
166 | 
167 | 乍一看，这毫无道理：为什么我们可以丢掉这么多项？实际上，这一方法更多地来自一个更高的视角。我们会在第9讲进行介绍。不管如何，这样显然减少了巨量的计算量——就算它没有什么道理，如果它work了，人们也想必会十分器重它。
168 | 
169 | 但在接受后面更高级的知识之前，不妨让我们先来从直觉上理解一下上面的表达式的原理。以下内容是来自[jzc](https://github.com/szjzc2018)的一个非常不错的理解
170 |  
171 | > 可以看到，上面的表达式相比于原先最开始的数学上严格的表达，相当于去除了除了 $t$ 时刻之外的所有importance weight。为什么这是合理的呢？
172 | 
173 | ![](./assets/5-1.jpeg)
174 | 
175 | > 我们考虑这样一张图：假设环境是决定性的，而红色标出的是我们的policy的概率。那么根据两步得到的正确概率，我们policy gradient导致的policy概率分布的梯度就可以用图中的蓝线的表示（这里蓝线越粗代表梯度越大）。比如说，从上到下第二个点的reward就比第一个点对policy的梯度影响更大。
176 | > 
177 | > 那么现在我们移去前面（也就是 $t'<t$ 的）importance sampling的项，相当于丧失了第一步的信息（标为灰色，这里假设proposal就是简单的均匀分布，50%概率），也就是训练的数据集中第一步的概率分布完全由proposal $\pi_{\bar{\theta}}$ 提供。但是即便如此，我们可以发现这个模型也可以学会第二步的正确操作：每一个点处policy的选取还是决定于概率更大的那个的reward。
178 | >
179 | > 类似地，前面移除后面的importance weight的影响也只是在于更久远的reward相差了一定的倍数。（原来的时候对于不同的 $t'>t$ ， $r(s_{t'},a_{t'})$ 有着不同的importance weight，但现在直接是加起来）。只要 $t$ 时刻的importance weight还在，整体的优化方向就大概仍然是对的。
180 | > 
181 | > 你可能会argue，这张图片里的情况是因为数字上的巧合（更改这几个概率的数值会使得结论变的不是这样）。但是我**并不是试图论证它是“完全正确的”**；相反，我们却只需要说明这个新的loss是**合理的**：这个新的loss可以理解为**前面和后面都按照importance sampling的方式走，只有第t步按照现有的policy来走**，这样一个**新的**"hybridized policy"。直观上可以想象，优化这个policy也可以带着原来的policy往正确的方向前进。
182 | 
183 | 这里，还透露出一个至深的思想：
184 | 
185 | > **小贴士**
186 | > 
187 | > RL的一个重要观点是，和DL不同，我们没有一个绝对的“目标”的概念。value只是一个proposed的目标，关键还是要我们的模型能够work。
188 | > 
189 | > 这就是为什么我们很多时候为了减少计算量可以做一些数学上不完全正确的事情：你可以理解为我们换了一个新的目标（比如上面，改成在这样的一个“hybridized”的policy下面最优化reward），只要这个目标依然合理（不能有明显的情况使得它负优化），并且数学上使得计算更简单，那我们就是稳赚不赔。
190 | 
191 | # Policy Gradient In Practice
192 | 
193 | ## Fake Loss Function for Autograd
194 | 
195 | 我们先回到只考虑causality的policy gradient的表达式：
196 | 
197 | $$
198 | \nabla_\theta J(\theta)\approx \frac{1}{N}\sum_{n=1}^N \sum_{t=1}^T \nabla_\theta\log \pi_\theta(a_t|s_t)\left(\sum_{t'=t}^T r(s_{t'},a_{t'})\right)=\frac{1}{N}\sum_{n=1}^N \sum_{t=1}^T \nabla_\theta\log \pi_\theta(a_t|s_t)\hat{Q}^{\pi_\theta}_{n,t}
199 | $$
200 | 
201 | 为了避免手动计算梯度，我们可以构造一个没有实际意义的loss，但这个loss的梯度就是 $\nabla_\theta J(\theta)$ 。它可以是
202 | 
203 | $$
204 | \tilde{J}(\theta)=\frac{1}{N}\sum_{n=1}^N \sum_{t=1}^T \log \pi_\theta(a_t|s_t)\hat{Q}^{\pi_\theta}_{n,t}
205 | $$
206 | 
207 | （注意 $\hat{Q}^{\pi_\theta}_{n,t}$ 和 $\theta$ 无关，只和路径有关）
208 | 
209 | ## Tune the Hyperparameters
210 | 
211 | - Learning rate: hard to tune, so better use Adam.
212 | - Batch size: as large as possible, since the variance is large.
213 | 
214 | # Further Policy Gradients
215 | 
216 | 我们会在第9讲继续介绍有关policy gradient的知识。届时，我们会从另外一个角度理解policy gradient，并对其提出一个重要的改进。
217 | 
218 | # Reference Papers
219 | 
220 | 1. [Guided Policy Search](https://proceedings.mlr.press/v28/levine13.html)（使用importance sampling的policy gradient方法）
221 | 2. [Infinite-horizon policy-gradient estimation](https://arxiv.org/abs/1106.0665) （介绍了temporally decomposed policy gradient）
222 | 3. [Reinforcement learning of motor skills with policy gradients](https://www.ias.informatik.tu-darmstadt.de/uploads/Publications/Publications/Neural-Netw-2008-21-682_4867[0].pdf)（介绍了optimal baselines 和 natural gradient）


--------------------------------------------------------------------------------
/lecture/notes-zh/8-Q_learning.md:
--------------------------------------------------------------------------------
  1 | # Notice
  2 | 为了前后的连贯性，本讲的一部分内容（包括Q-learning的改进等）被搬迁到了前面一讲（第七讲）。
  3 | 
  4 | ## Review: General Framework of Q-Learning
  5 | 
  6 | Q-Learning 中分为三个 process:
  7 | 
  8 | - Data Collection (online; replay buffer)。通过和环境的交互获取数据存在 buffer 中；同时可能会去除旧数据。
  9 | - Target update。更新 target $\phi_0\leftarrow f(\phi_0,\phi)$。之前介绍过 $f=\phi$ 和 $f=0.999\cdot \phi_0+0.001\cdot \phi$ 两种方法。
 10 | - $Q$-function regression。通过 gradient step 更新 $Q$。
 11 | 
 12 | 不同的算法，对每个 process 有不同的处理方式；并且进行它们的速度也有区别。
 13 | 
 14 | - Online Q-Learning: 三个 process 同步进行。
 15 | - DQN: Process 1,3 同步进行；Process 2 更慢（在充分更新 $\phi$ 后 $\phi_0\leftarrow \phi$）。
 16 | - Fitted Q-iteration: Process 3 是 Process 2 的 inner-loop，Process 2 是 Process 1 的 inner-loop。
 17 | 
 18 | 
 19 | # Advanced Q-Learning
 20 | 
 21 | 前面介绍的DQN看起来已经解决了大部分算法中的不合理之处或可能的问题。实际上，DQN也有极好的应用效果。但是它还有一定的改进空间。
 22 | 
 23 | ## Double Q-learning
 24 | 
 25 | ### Over-confidence of Q values
 26 | 
 27 | 大量的实验结果表明，我们的模型学习出的Q值往往在多次训练后高于定义值
 28 | 
 29 | $$
 30 | Q(s_t,a_t)=r(s_t,a_t)+\mathbb{E}_{s_{t+1},a_{t+1},\cdots}\left[\sum_{t'>t}\gamma^{t'-t} r(s_{t'},a_{t'})\right]
 31 | $$
 32 | 
 33 | 这看似很奇怪，但因为它是系统误差而不是随机误差，所以我们必须要给出一个解释。实际上，也相对直接。我们从模型迭代的表达式出发
 34 | 
 35 | $$
 36 | Q_\phi(s_t,a_t)\leftarrow r(s_t,a_t)+\gamma\max_{a'}Q_\phi(s_{t+1},a')
 37 | $$
 38 | 
 39 | 首先注意到以下的事实：
 40 | 
 41 | $$
 42 | \mathbb{E}[\max\{x_1,x_2,\cdots,x_n\}]\ge \max\{\mathbb{E}[x_1],\mathbb{E}[x_2],\cdots,\mathbb{E}[x_n]\}
 43 | $$
 44 | 
 45 | 所以，我们可以做如下的想象：假设开始的时候 $Q_\phi$ 值接近于真值，但具有一个随机误差 $\epsilon(s,a)$ ，满足其对 $(s,a)$ 的期望为零：
 46 | 
 47 | $$
 48 | Q_\phi(s,a)=Q^\star (s,a)+\epsilon(s,a)
 49 | $$
 50 | 
 51 | 那么，我们就会有
 52 | 
 53 | $$
 54 | \max_{a}Q_\phi(s,a)=\max_{a}Q^\star(s,a)+\bar{\epsilon}(s)
 55 | $$
 56 | 
 57 | 并且 $\mathbb{E}[\bar{\epsilon}(s)]\ge 0$ 。这样，在下一次迭代的时候，位于等式左边的 $Q_\phi(s_t,a_t)$ 就会有一个期望值非负的误差。这样反复迭代，我们的 $Q$ 就会偏离得越来越大。
 58 | 
 59 | 这一问题实际上是上一次介绍的DQN在实践中的**主要问题**。为了解决它，引入了**Double Q-learning**。
 60 | 
 61 | ### Double Q-learning
 62 | 
 63 | 如何解决这个问题呢？一个比较关键的步骤是想清楚为什么
 64 | $\mathbb{E}[\max\{x_1,x_2,\cdots,x_n\}]\ge \max\{\mathbb{E}[x_1],\mathbb{E}[x_2],\cdots,\mathbb{E}[x_n]\}$ 。我们会发现，实际上是因为左边每一个点取出argmax再evaluate，这就会导致有一个偏大的bias。同样地，应用到我们的场景里，就是
 65 | 
 66 | $$
 67 | \max_{a}Q(s,a)=Q(s,\arg\max_a Q(s,a))
 68 | $$
 69 | 
 70 | 可以看到第二个表达式出现了两个 $Q$ ；二者实现了类似“正反馈”的关系，导致误差总是偏大。一个自然的想法就是我们引入**两个网络**：
 71 | 
 72 | $$
 73 | Q^{A}_\phi(s_t,a_t)\leftarrow r(s_t,a_t)+\gamma Q^B_\phi(s_{t+1},\arg\max_a Q^A_\phi(s_{t+1},a))
 74 | $$
 75 | 
 76 | $$
 77 | Q^{B}_\phi(s_t,a_t)\leftarrow r(s_t,a_t)+\gamma Q^A_\phi(s_{t+1},\arg\max_a Q^B_\phi(s_{t+1},a))
 78 | $$
 79 | 
 80 | 这样就避免了“正反馈”的问题。更进一步地，我们发现这个方法甚至可以实现“负反馈”，也就是“自己给自己纠错”：如果对于某一个 $a_t$ ， $Q^A$ 的计算偏大而 $Q^B$ 的计算相对准确，那么 $Q^A$ 在更新的时候就会被 $Q^B$ 纠正（注意 $Q^A$ 本身的数值在第一个update中并不重要，因为只是求argmax）；另一方面，对于 $Q^B$ 的更新而言，其并不会完全受到 $Q^A$ 的影响，因为在第二个update中，取哪个action是 $Q^B$ 决定的，从而可以大概率避开 $Q^A$ 的错误点。这样，我们就避免了过度估计的问题。
 81 | 
 82 | 事实上，我们可以证明，如果 $Q^A$ 和 $Q^B$ 是完全无关的，并且 $\mathbb{E} Q^A_\phi(s,a)=\mathbb{E} Q^B_\phi(s,a)=Q(s,a)$ (它们是无偏估计)，就有
 83 | $$\mathbb{E}[Q^A_\phi(s,\arg\max_a Q^B_\phi(s,a))]\le \max_a Q(s,a).$$
 84 | 所以，相比普通 Q-learning 的 $Q^A=Q^B$, Double Q-learning 相当于减少了 $Q^A,Q^B$ 之间的**相关性**，通过上面的不等式，直观上就能缓解 overestimation。
 85 | 
 86 | 当然，在实践上，我们并不会多去训练一个网络（训练两个网络干一件事，听起来就很逆天）。相反，我们刚好利用两组参数 $\phi_0$ （老的参数）和 $\phi$ ，分别对应 $Q^A$ 和 $Q^B$ 。当然，为了保证稳定性，我们就不对 $\phi_0$ 更新了，从而只做
 87 | 
 88 | $$
 89 | Q_\phi(s_t,a_t)\leftarrow r(s_t,a_t)+\gamma Q_{\phi_0}(s_{t+1},\arg \max_a Q_\phi(s_{t+1},a))
 90 | $$
 91 | 
 92 | （虽然理论上，这个方法不是完整的；但实验上跑的还不错，所以就不管了！）这样，再结合fitting的部分，我们就可以写出
 93 | $$
 94 | [Q(s_t,a_t)]^{\star}=r(s_t,a_t)+\gamma Q_{\phi_0}(s_{t+1},\arg \max_a Q_{\textcolor{red}{\phi}}(s_{t+1},a))
 95 | $$
 96 | 
 97 | $$
 98 | \phi\leftarrow \arg\min_{\phi}\left([Q(s_t,a_t)]^{\star}-Q_\phi(s_t,a_t)\right)^2
 99 | $$
100 | 
101 | 对比原来的算法，可以看到这里的唯一改动就是，计算argmax并不使用 $\phi_0$ ，而是使用 $\phi$ 。只需要在原来的代码里稍微修改一点点，就立刻会有巨大的提升。Double Q-learning的力量就在这里。
102 | 
103 | ## Multi-step returns
104 | 
105 | 类似policy gradient，我们这里也遇到了采样的variance和引入模型的bias的tradeoff。因此同样，我们可以利用多步的力量，减少我们的模型带来的bias。
106 | 
107 | $$
108 | [Q(s_t,a_t)]^{\star}=\mathbb{E}_{s_{t+1},a_{t+1},\cdots}\left[\sum_{t'=t}^{t+N-1}\gamma^{t'-t}r(s_{t'},a_{t'})\right]+\gamma^N \max_a Q_{\phi}(s_{t+N},a)
109 | $$
110 | 
111 | 但必须注意，在 $N>1$ 的时候，这个表达式就变成**on-policy**的了！（因为 $a_{t+1}$ 从 $s_{t+1}$ 中采样的方式是由 $\pi$ 决定的）但如果要on-policy地训练，又和之前的replay buffer的优化冲突了。
112 | 
113 | 一种解决方法是直接在 replay buffer 里面获取数据，强行当作 off-policy 来训练。事实上效果还不错。另一种可能的方法：在 DQN 中用 fixed policy $\phi_0$ 做 importance sampling。
114 | 
115 | ## Q learning in continuous action space
116 | 
117 | 和之前的policy gradient不同，Q-learning算法延伸到continuous action space的难度略高。这是因为update过程有一个max操作。在连续的空间如何实现这个max呢？有三种方法。
118 | 
119 | 1. optimization：在每一次需要取max的时候，我们用一个optimizer（比如GD或者不使用梯度的stochastic optimization方法等）来选取 $a$ 使得最大化 $Q(s,a)$ 。问题也很明显——这会导致计算太慢了。
120 | 
121 | 2. **NAF(Normalized Advantage Functions)**: 我们选取 $Q$ 函数必须为指定的形式： $Q(s,a)=-\frac{1}{2}(a-\mu_\phi(s))^TM_\phi(s)(a-\mu_\phi(s))+V_\phi(s)$ ，其中包含了若干神经网络作为参数。这样计算确实可以很快，但是这个方法限制了模型的表达能力：$Q$ 只能是 $a$ 的二次函数！
122 | 
123 | 3. Learn an approximate maximizer: 我们在训练 $Q$ 的同时还训练另外一个网络 $\pi_\theta$ ，使得 $\pi_\theta(s)\approx \arg\max_a Q(s,a)$ 。训练方法也很直接：对 $\theta$ 最大化 $Q(s,\pi_\theta(s))$ 即可。可以发现这个方法实际上是最好的。同时，我们还能看到，我们似乎又再一次引入了policy $\pi$ ！这一方法也被称为“deterministic actor critic method”。
124 | 
125 | 使用上面的方法3,人们得到了著名的**DDPG Algorithm**：
126 | 
127 | > **DDPG Algorithm**
128 | 
129 | 重复：
130 | 1. 从环境中根据某种policy采样一个 $(s,a,s',r)$ ，加入replay buffer $B$ ；
131 | 2. 从replay buffer取出一个batch(相当于 $K=1$ )，计算目标 $[Q(s,a)]^\star=r(s,a)+\gamma Q_{\phi_0}(s',\pi_{\theta_0}(s'))$ ；
132 | 3. 对 $L(\phi)=\sum_{(s,a)}([Q(s,a)]^\star-Q_\phi(s,a))^2$ 作一步梯度下降；
133 | 4. 对 $L(\theta)=-\sum_s Q_\phi(s,\pi_\theta(s))$ 做一步梯度下降；
134 | 5. 更新 $\phi_0,\theta_0$ ，可以使用隔 $N$ 次更新一次的方法，也可以使用Polyak平均的方法。
135 | 
136 | 而类似地，我们还可以学习一个不是deterministic的maximizer。这就是 **SAC (Soft Actor Critic)** 算法。
137 | 
138 | ## SAC Algorithm
139 | 
140 | 首先，相比传统的 RL objective, SAC 使用了 **maximum entropy objective**:
141 | $$\mathcal{J}=\sum\limits_t \mathbb{E}_{(s_t,a_t)\sim \pi} [R(s_t,a_t)+\alpha\cdot \mathcal{H}(\pi(\cdot|s_t))],$$
142 | 其中 $\alpha\ge 0$, $\mathcal{H}$ 表示**信息熵**。额外加入 $\alpha\cdot \mathcal{H}(\pi(\cdot|s_t))$ 这一项的好处是：能鼓励模型做更多的 exploration；找到 sub-optimal solutions，从而能够最终收敛于在众多可行的方案中的随机分布。
143 | 
144 | 因此，我们需要改进由此得到的 value function / Q-function（为了记号简洁，下面取 $\alpha=1$ ；一般情形只需把 $\log \pi$ 换成 $\alpha\log \pi$ ）:
145 | $$V(s_t)=\mathbb{E}_{a_t\sim \pi}[Q(s_t,a_t)-\log \pi(a_t|s_t)],$$
146 | $$Q(s_t,a_t)=R(s_t,a_t)+\gamma \mathbb{E}_{s_{t+1}\sim \mathcal{T}}[V(s_{t+1})].$$
147 | 首先，我们需要 parameterize $V,Q\to V_{\psi},Q_\theta$; objective 就是上面两个式子的 Bellman divergence。在原论文中，我们需要把 $\theta$ 的 objective 中 $V$ 的部分改为 old target 来稳定训练，target 的更新方式采用 Polyak averaging
148 | $$\bar \psi \leftarrow \tau\cdot \bar \psi+(1-\tau)\cdot \psi.$$
149 | 
150 | 现在来看 $\pi$ 的学习。根据定义，$V,Q$ 都代表着从这一步开始 objective 的期望。假设 $V,Q$ 固定，我们需要更新 $\pi$, 也就是对每个 $s_t$ 最大化
151 | $$\mathbb{E}_{a_t\sim \pi}[Q(s_t,a_t)-\log \pi(a_t|s_t)].$$
152 | 求导后易得 $\pi(a_t|s_t)$ 正比于 $\exp\{Q(s_t,a_t)\}$. 现在考虑用一个模型 $\phi$ 学习 $\pi$: $\phi$ 能表示的空间是某个子空间 $\Pi$. 那么，最好的 $\pi$ 应该被表示为
153 | $$\pi^\star=\arg \min\limits_{\pi\in \Pi} \text{KL}\left(\pi(\cdot|s_t)\Big|\Big|\dfrac{\exp\{Q(s_t,\cdot)\}}{Z(s_t)}\right).$$
154 | 为了把 $\pi$ 变成 stochastic 的形式，我们需要进行 parameterization：
155 | $$a_t=f_\phi(s_t,\epsilon_t),\quad \epsilon_t\sim \mathcal{N}(0,I).$$
156 | 这样，对于 $\phi$，我们的 objective 就是
157 | $$\mathcal{J}_\pi(\phi)=\mathbb{E}_{s_t\sim \pi,\epsilon_t}[\log \pi_\phi(f_\phi(s_t,\epsilon_t)|s_t)-Q_\theta(s_t,f_\phi(s_t,\epsilon_t))].$$
158 | 至此，我们就可以写出 SAC 的 training process: 每一轮先根据 $\pi_\phi$ 和环境交互产生训练数据，再依次对 $V_\psi,Q_\theta,\pi_\phi$ 作一步 gradient step，然后更新 target $\bar \psi$。
159 | 
160 | 值得注意的是，原论文中还提出了一个优化：考虑到 $Q$ 的 over-estimation 问题，我们同时训练两个 $Q_1,Q_2$。训练 $V,\pi$ 时，定义 $Q=\min\{Q_1,Q_2\}$。
161 | 
162 | # Implementing Q learning
163 | 
164 | 我们已经知道，理论上（使用函数拟合的）Q-learning并不保证收敛。因此，参数的调整必须十分地小心。有以下的tip：
165 | - Large replay buffer makes training stable：这相当于训练的数据集很大，可以更好地收敛；
166 | - Gradually reduce exploration & learning rate：开始的时候应该多explore，以防止陷入一个局部最优解；但最后摸清了环境的套路后，应该专注在把最好的 $Q$ 值做的更精确上。
167 |     - **ADAM** optimizer can help!
168 | - **Huber loss** for training Q network：`huber_loss`是 $\dfrac{x^2}{2}$ 的“截断”版本，也就是当 $x$ 很大的时候loss是线性而非二次的。
169 |     - 为什么这样做有效？还是上一讲的那个例子：如果一个batch里面有一些Q为1,2,3的数据，还有一个Q为-1000000的数据，那么参数肯定会被后者的巨大梯度拽到使得那个很差的Q最精确的地方。但我们实际上并不关心那个很差的Q到底是-1000000还是-500000，因此这个截断版本的loss很有效。
170 |     - 另一个解决梯度过大的方法是 **clipping**。
171 | - **Always** use Double Q learning: it has no downsides
172 | - 用不同的 random seed 跑模型，因为每一次的结果十分 inconsistent！
173 | - Sometimes use multi-step returns (but it has a theoretical error)
174 | 
175 | # Reference Papers
176 | 
177 | 1. [Continuous control with deep reinforcement learning](https://arxiv.org/abs/1509.02971)（DDPG）
178 | 2. [Continuous deep Q-learning with model-based acceleration](https://arxiv.org/abs/1603.00748)（NAF）
179 | 3. [Dueling network architectures for deep reinforcement learning](https://arxiv.org/abs/1511.06581)


--------------------------------------------------------------------------------
/lecture/notes-zh/assets/10-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hidden-Hyperparameter/RL-notes/2329d7ae317c27c5f321bf2a7d9b352997c71a54/lecture/notes-zh/assets/10-1.png


--------------------------------------------------------------------------------
/lecture/notes-zh/assets/11-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hidden-Hyperparameter/RL-notes/2329d7ae317c27c5f321bf2a7d9b352997c71a54/lecture/notes-zh/assets/11-1.png


--------------------------------------------------------------------------------
/lecture/notes-zh/assets/12-1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hidden-Hyperparameter/RL-notes/2329d7ae317c27c5f321bf2a7d9b352997c71a54/lecture/notes-zh/assets/12-1.jpeg


--------------------------------------------------------------------------------
/lecture/notes-zh/assets/14-1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hidden-Hyperparameter/RL-notes/2329d7ae317c27c5f321bf2a7d9b352997c71a54/lecture/notes-zh/assets/14-1.jpeg


--------------------------------------------------------------------------------
/lecture/notes-zh/assets/14-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hidden-Hyperparameter/RL-notes/2329d7ae317c27c5f321bf2a7d9b352997c71a54/lecture/notes-zh/assets/14-2.png


--------------------------------------------------------------------------------
/lecture/notes-zh/assets/14-3.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hidden-Hyperparameter/RL-notes/2329d7ae317c27c5f321bf2a7d9b352997c71a54/lecture/notes-zh/assets/14-3.jpeg


--------------------------------------------------------------------------------
/lecture/notes-zh/assets/15-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hidden-Hyperparameter/RL-notes/2329d7ae317c27c5f321bf2a7d9b352997c71a54/lecture/notes-zh/assets/15-1.png


--------------------------------------------------------------------------------
/lecture/notes-zh/assets/16-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hidden-Hyperparameter/RL-notes/2329d7ae317c27c5f321bf2a7d9b352997c71a54/lecture/notes-zh/assets/16-1.png


--------------------------------------------------------------------------------
/lecture/notes-zh/assets/17-1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hidden-Hyperparameter/RL-notes/2329d7ae317c27c5f321bf2a7d9b352997c71a54/lecture/notes-zh/assets/17-1.jpeg


--------------------------------------------------------------------------------
/lecture/notes-zh/assets/18-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hidden-Hyperparameter/RL-notes/2329d7ae317c27c5f321bf2a7d9b352997c71a54/lecture/notes-zh/assets/18-1.png


--------------------------------------------------------------------------------
/lecture/notes-zh/assets/19-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hidden-Hyperparameter/RL-notes/2329d7ae317c27c5f321bf2a7d9b352997c71a54/lecture/notes-zh/assets/19-1.png


--------------------------------------------------------------------------------
/lecture/notes-zh/assets/19-2.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hidden-Hyperparameter/RL-notes/2329d7ae317c27c5f321bf2a7d9b352997c71a54/lecture/notes-zh/assets/19-2.jpeg


--------------------------------------------------------------------------------
/lecture/notes-zh/assets/19-3.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hidden-Hyperparameter/RL-notes/2329d7ae317c27c5f321bf2a7d9b352997c71a54/lecture/notes-zh/assets/19-3.jpeg


--------------------------------------------------------------------------------
/lecture/notes-zh/assets/2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hidden-Hyperparameter/RL-notes/2329d7ae317c27c5f321bf2a7d9b352997c71a54/lecture/notes-zh/assets/2-1.png


--------------------------------------------------------------------------------
/lecture/notes-zh/assets/21-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hidden-Hyperparameter/RL-notes/2329d7ae317c27c5f321bf2a7d9b352997c71a54/lecture/notes-zh/assets/21-1.png


--------------------------------------------------------------------------------
/lecture/notes-zh/assets/21-2.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hidden-Hyperparameter/RL-notes/2329d7ae317c27c5f321bf2a7d9b352997c71a54/lecture/notes-zh/assets/21-2.jpeg


--------------------------------------------------------------------------------
/lecture/notes-zh/assets/22-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hidden-Hyperparameter/RL-notes/2329d7ae317c27c5f321bf2a7d9b352997c71a54/lecture/notes-zh/assets/22-1.png


--------------------------------------------------------------------------------
/lecture/notes-zh/assets/22-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hidden-Hyperparameter/RL-notes/2329d7ae317c27c5f321bf2a7d9b352997c71a54/lecture/notes-zh/assets/22-2.png


--------------------------------------------------------------------------------
/lecture/notes-zh/assets/22-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hidden-Hyperparameter/RL-notes/2329d7ae317c27c5f321bf2a7d9b352997c71a54/lecture/notes-zh/assets/22-3.png


--------------------------------------------------------------------------------
/lecture/notes-zh/assets/22-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hidden-Hyperparameter/RL-notes/2329d7ae317c27c5f321bf2a7d9b352997c71a54/lecture/notes-zh/assets/22-4.png


--------------------------------------------------------------------------------
/lecture/notes-zh/assets/23-1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hidden-Hyperparameter/RL-notes/2329d7ae317c27c5f321bf2a7d9b352997c71a54/lecture/notes-zh/assets/23-1.jpeg


--------------------------------------------------------------------------------
/lecture/notes-zh/assets/23-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hidden-Hyperparameter/RL-notes/2329d7ae317c27c5f321bf2a7d9b352997c71a54/lecture/notes-zh/assets/23-2.png


--------------------------------------------------------------------------------
/lecture/notes-zh/assets/4-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hidden-Hyperparameter/RL-notes/2329d7ae317c27c5f321bf2a7d9b352997c71a54/lecture/notes-zh/assets/4-1.png


--------------------------------------------------------------------------------
/lecture/notes-zh/assets/5-1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hidden-Hyperparameter/RL-notes/2329d7ae317c27c5f321bf2a7d9b352997c71a54/lecture/notes-zh/assets/5-1.jpeg


--------------------------------------------------------------------------------
/lecture/notes-zh/assets/9-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hidden-Hyperparameter/RL-notes/2329d7ae317c27c5f321bf2a7d9b352997c71a54/lecture/notes-zh/assets/9-1.png


--------------------------------------------------------------------------------
/lecture/notes-zh/assets/not_implement.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hidden-Hyperparameter/RL-notes/2329d7ae317c27c5f321bf2a7d9b352997c71a54/lecture/notes-zh/assets/not_implement.png


--------------------------------------------------------------------------------
/lecture/notes-zh/change.py:
--------------------------------------------------------------------------------
 1 | import re,os
 2 | 
 3 | def change(in_file,out_file):
 4 |     lines = open(in_file).readlines()
 5 |     outlines = []
 6 |     for line in lines:
 7 |         line = line.strip('\n') # changed here
 8 |         if line == '':
 9 |             outlines.append('')
10 |             continue
11 |         if '$' in line:
12 |             splited = line.split('$')
13 |             out = []
14 |             for i,item in enumerate(splited):
15 |                 if i % 2 == 1:
16 |                     pre = '' if (splited[i-1] == '' or splited[i-1][-1] in '*~ ') else ' '
17 |                     post = '' if (i+1 == len(splited) or splited[i+1]=='' or splited[i+1][0] in '*~ ') else ' '
18 |                     out.append(pre + '$' + item + '$' + post)
19 |                 else:
20 |                     out.append(item)
21 |             outlines.append(''.join(out))
22 |         else:
23 |             outlines.append(line)
24 |     open(out_file, 'w').write('\n'.join(outlines))
25 | 
if __name__ == '__main__':
    # Rewrite, in place, every Markdown file found in the current working directory.
    mds = [name for name in os.listdir('.') if name.endswith('.md')]
    print(mds)
    for md in mds:
        change(md,md)


--------------------------------------------------------------------------------
/lecture/notes-zh/lecture:
--------------------------------------------------------------------------------
1 | ..


--------------------------------------------------------------------------------
/tutorials/assets/0-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hidden-Hyperparameter/RL-notes/2329d7ae317c27c5f321bf2a7d9b352997c71a54/tutorials/assets/0-1.png


--------------------------------------------------------------------------------
/tutorials/install.sh:
--------------------------------------------------------------------------------
1 | sudo apt install ffmpeg


--------------------------------------------------------------------------------
/tutorials/requirements.txt:
--------------------------------------------------------------------------------
1 | matplotlib
2 | gymnasium==0.29
3 | gymnasium[classic-control]
4 | 
5 | # fill in your torch version here, since installing torch takes a long time.
6 | # torch
7 | tqdm


--------------------------------------------------------------------------------
/tutorials/utils.py:
--------------------------------------------------------------------------------
 1 | import matplotlib.pyplot as plt
 2 | import matplotlib.animation as animation
 3 | import numpy as np
 4 | from IPython.display import Video, display, display_markdown, Markdown
 5 | from typing import List
 6 | 
 7 | import torch
 8 | torch.manual_seed(3407)
 9 | torch.backends.cudnn.deterministic = True
10 | device = 'cpu'
11 | # if torch.cuda.is_available(): device = 'cuda'
12 | # if torch.backends.mps.is_available(): device = 'mps'
13 | print('Your device is',device)
14 | 
15 | # display utils
def show_image(image:np.ndarray):
    """Show one image with matplotlib, axes hidden, then close the figure.

    Args:
        image: Array handed directly to ``plt.imshow``.
    """
    plt.imshow(image)
    plt.axis('off')  # hide ticks and frame
    plt.show()
    plt.close()  # close the current figure once it has been shown
21 | 
def show_video(frames:List[np.ndarray],title=None):
    """Encode a sequence of frames as an MP4 and embed it in the notebook output.

    Args:
        frames: Frames to animate, in order. ``frames[0]`` initialises the
            plot, so the list must be non-empty.
        title: Optional text; when truthy it is displayed as a level-3
            Markdown heading before the video.

    Note:
        The animation is saved with matplotlib's ``'ffmpeg'`` writer, so an
        ffmpeg installation is required. The file is written to
        ``video.mp4`` inside the platform temp directory and overwritten on
        every call.
    """
    # Function-scope imports keep this block self-contained.
    import os
    import tempfile

    fig = plt.figure(figsize=(6, 4))
    plt.axis('off')
    im = plt.imshow(frames[0])
    def update(frame):
        # Swap the displayed pixels; return the artist because blit=True
        # redraws only the artists returned here.
        im.set_array(frame)
        return [im]

    ani = animation.FuncAnimation(fig, update, frames=frames, interval=50, blit=True)
    # Use the platform's temp directory rather than a hard-coded '/tmp',
    # which does not exist on every OS.
    video_path = os.path.join(tempfile.gettempdir(), 'video.mp4')
    ani.save(video_path, writer='ffmpeg')
    plt.close(fig)
    if title: display_markdown(Markdown(f'### {title}'))
    display(Video(video_path, embed=True))
36 | 
37 | # ipynb utils
38 | 
def add_method_to_class(cls):
    """Decorator factory that attaches the decorated function to ``cls``.

    The function is stored on ``cls`` under its own ``__name__`` and is
    returned unchanged, so the decorated name still refers to the plain
    function.

    Args:
        cls: Class that receives the new attribute.

    Returns:
        A decorator taking the function to attach.
    """
    def decorator(func):
        method_name = func.__name__
        setattr(cls, method_name, func)
        return func
    return decorator
44 | 
def update_meth_to_obj(obj,meth_name:str):
    """Set ``obj.<meth_name>`` to a forwarder that calls the class's current method.

    The forwarder looks ``meth_name`` up on ``obj.__class__`` each time it is
    called, so it picks up whatever the class defines at call time. Positional
    and keyword arguments are both forwarded; the earlier lambda accepted
    positional arguments only.

    Args:
        obj: Instance that receives the forwarding attribute.
        meth_name: Name of the method to forward to.
    """
    def forward(*args, **kwargs):
        # Late lookup on the class: a method replaced on the class after this
        # call is still the one that runs.
        return getattr(obj.__class__, meth_name)(obj, *args, **kwargs)
    setattr(obj, meth_name, forward)


--------------------------------------------------------------------------------