├── .gitignore ├── .gitmodules ├── LICENSE.md ├── ch_0_rl_rl_in_cv ├── RL_0.ipynb ├── RL_0.slides.html ├── img │ ├── 1.png │ ├── 1_1.png │ ├── 1_10.png │ ├── 1_11.png │ ├── 1_12.png │ ├── 1_2.png │ ├── 1_3.png │ ├── 1_4.png │ ├── 1_5.png │ ├── 1_6.png │ ├── 1_7.png │ ├── 1_8.png │ ├── 1_9.png │ ├── 2.png │ ├── 2_1.png │ ├── 2_10.png │ ├── 2_11.png │ ├── 2_13.png │ ├── 2_2.png │ ├── 2_3.png │ ├── 2_4.png │ ├── 2_5.png │ ├── 2_7.png │ ├── 2_8.png │ ├── 2_9.png │ ├── 3.png │ ├── 4.png │ ├── 5.png │ ├── 6.png │ ├── 7.png │ ├── 8.png │ ├── a.png │ ├── adnet_1.PNG │ ├── adnet_2.PNG │ ├── adnet_3.PNG │ ├── adnet_4.PNG │ ├── adnet_5.PNG │ ├── adnet_6.PNG │ ├── adnet_7.PNG │ ├── adnet_8.PNG │ ├── b.png │ ├── c.png │ ├── d.png │ ├── e.png │ ├── f.png │ ├── x.png │ ├── y.png │ └── z.png └── readme.md ├── ch_10_func_approx_2 ├── RL_FA2.ipynb ├── RL_FA2.slides.html ├── img │ ├── 1.png │ ├── 2.png │ ├── 3.png │ ├── 4.png │ ├── 5.png │ ├── 6.png │ ├── 7.png │ ├── ex.png │ ├── fa2_ex1.JPG │ ├── fa2_ex2.JPG │ ├── fa2_ex3.JPG │ ├── fa2_slides1.JPG │ ├── fa2_slides10.JPG │ ├── fa2_slides11.JPG │ ├── fa2_slides12.JPG │ ├── fa2_slides2.JPG │ ├── fa2_slides3.JPG │ ├── fa2_slides4.JPG │ ├── fa2_slides5.JPG │ ├── fa2_slides6.JPG │ ├── fa2_slides7.JPG │ ├── fa2_slides8.JPG │ └── fa2_slides9.JPG └── readme.md ├── ch_11_policy_gradient ├── RL_11.ipynb ├── RL_11.slides.html ├── img │ ├── pg_1.JPG │ ├── pg_10.JPG │ ├── pg_11.JPG │ ├── pg_12.JPG │ ├── pg_13.JPG │ ├── pg_14.JPG │ ├── pg_15.JPG │ ├── pg_16.JPG │ ├── pg_17.JPG │ ├── pg_18.JPG │ ├── pg_19.JPG │ ├── pg_2.JPG │ ├── pg_20.JPG │ ├── pg_21.JPG │ ├── pg_22.JPG │ ├── pg_23.JPG │ ├── pg_24.JPG │ ├── pg_25.JPG │ ├── pg_26.JPG │ ├── pg_27.JPG │ ├── pg_28.JPG │ ├── pg_29.JPG │ ├── pg_3.JPG │ ├── pg_30.JPG │ ├── pg_34.JPG │ ├── pg_4.JPG │ ├── pg_5.JPG │ ├── pg_6.JPG │ ├── pg_7.JPG │ ├── pg_8.JPG │ ├── pg_9.JPG │ ├── sutton_1.JPG │ ├── sutton_2.JPG │ ├── sutton_3.JPG │ ├── sutton_4.JPG │ └── sutton_5.JPG └── readme.md ├── ch_1_rl_intro ├── .ipynb_checkpoints │ ├── RL_1-checkpoint.ipynb │ └── readme-checkpoint.ipynb ├── RL_1.ipynb ├── RL_1.slides.html ├── img │ ├── 1.png │ ├── 10.png │ ├── 11.png │ ├── 12.png │ ├── 13.png │ ├── 2.png │ ├── 3.png │ ├── 6.png │ ├── 7.png │ ├── 8.png │ ├── 9.png │ ├── e1.png │ ├── e2.png │ ├── e3.png │ └── e4.png ├── readme.ipynb └── readme.md ├── ch_2_rl_in_non_associative ├── .ipynb_checkpoints │ ├── RL_2-checkpoint.ipynb │ ├── tutorial-checkpoint.ipynb │ └── tutorial_solutions-checkpoint.ipynb ├── RL_2.ipynb ├── RL_2.slides.html ├── img │ ├── UCB.JPG │ ├── com_2.jpg │ ├── grad.jpg │ ├── greedyvs.jpg │ ├── mistake.JPG │ ├── mistake_1.JPG │ ├── mistake_2.JPG │ ├── multiarmedbandit.jpg │ ├── optinit.JPG │ └── testbed.JPG ├── readme.md ├── references.md ├── tutorial.ipynb └── tutorial_solutions.ipynb ├── ch_3_rl_finite_mdp ├── .ipynb_checkpoints │ ├── RL_3 - Copy-checkpoint.ipynb │ └── RL_3-checkpoint.ipynb ├── RL_3.ipynb ├── RL_3.slides.html ├── img │ ├── 1.png │ ├── 2.png │ ├── 3.png │ ├── 4.png │ ├── 5.png │ ├── 6.png │ ├── 61.png │ ├── 7.png │ ├── agent_env.PNG │ ├── b1.png │ ├── b2.png │ ├── b3.png │ ├── b4.png │ ├── b5.png │ ├── o.png │ ├── o1.png │ ├── o2.png │ ├── op1.png │ ├── op2.png │ ├── op3.png │ ├── op4.png │ ├── pic1.png │ ├── pic2.png │ ├── return.png │ ├── robot.jpg │ ├── slides_1.PNG │ ├── slides_10.PNG │ ├── slides_2.PNG │ ├── slides_3.PNG │ ├── slides_4.PNG │ ├── slides_5.PNG │ ├── slides_6.PNG │ ├── slides_7.PNG │ ├── slides_8.PNG │ ├── slides_9.PNG │ └── unified.png └── readme.md ├── ch_4_rl_dynamic_programming ├── 
.ipynb_checkpoints │ ├── RL_4-checkpoint.ipynb │ └── readme-checkpoint.ipynb ├── RL_4.ipynb ├── RL_4.slides.html ├── img │ ├── a.png │ ├── aaa.png │ ├── aaaa.png │ ├── async_1.PNG │ ├── async_2.PNG │ ├── async_3.PNG │ ├── async_ex_1.PNG │ ├── async_ex_2.PNG │ ├── async_ex_3.PNG │ ├── async_ex_4.PNG │ ├── b.png │ ├── b1.png │ ├── b2.png │ ├── contr_1.PNG │ ├── contr_2.PNG │ ├── contr_3.PNG │ ├── d.png │ ├── dp_ex_1.PNG │ ├── e.png │ ├── e1.png │ ├── e2.png │ ├── e3.png │ ├── p1.png │ ├── p2.png │ ├── s1.png │ ├── s11.png │ ├── s2.png │ ├── s3.png │ ├── sa.png │ ├── sb.png │ ├── v1.png │ ├── v2.png │ ├── v3.png │ ├── v4.png │ └── v5.png └── readme.md ├── ch_5_rl_mc_methods ├── .ipynb_checkpoints │ ├── RL_5-checkpoint.ipynb │ └── readme-checkpoint.ipynb ├── RL_5.ipynb ├── RL_5.slides.html ├── img │ ├── 1.png │ ├── 2.png │ ├── 3.png │ ├── 4.png │ ├── 5.png │ ├── 6.png │ ├── a1.png │ ├── a2.png │ ├── a3.png │ ├── a4.png │ ├── a5.png │ ├── a6.png │ ├── a7.png │ ├── c1.png │ ├── c2.png │ ├── c3.png │ ├── c4.png │ ├── c5.png │ ├── c6.png │ ├── c7.png │ ├── imp_sam_1.PNG │ ├── imp_sam_2.PNG │ ├── imp_sam_3.PNG │ ├── imp_sam_4.PNG │ └── imp_sam_5.PNG ├── readme.ipynb └── readme.md ├── ch_6_td_methods ├── RL_6.ipynb ├── RL_6.slides.html ├── img │ ├── 10_2.png │ ├── DPback.JPG │ ├── MCback.JPG │ ├── TDback.JPG │ ├── TDex1.JPG │ ├── backup_q.JPG │ ├── batch_td.PNG │ ├── bootsam.JPG │ ├── cliff.jpg │ ├── doubleq.JPG │ ├── ex_sarsa.JPG │ ├── maxbias.JPG │ ├── mcvstd.PNG │ ├── mcvstd_2.PNG │ ├── qvssarsa.JPG │ ├── tdex2.JPG │ └── tdmarkov.jpg └── readme.md ├── ch_7_rl_eligibility_traces ├── .ipynb_checkpoints │ └── RL_7-checkpoint.ipynb ├── RL_7.ipynb ├── RL_7.slides.html ├── img │ ├── 1.png │ ├── 10.png │ ├── 10_1.png │ ├── 10_2.png │ ├── 11.png │ ├── 12.png │ ├── 13.png │ ├── 14.png │ ├── 15.png │ ├── 16.png │ ├── 17.png │ ├── 18.png │ ├── 19.png │ ├── 2.png │ ├── 20.png │ ├── 21.png │ ├── 3.png │ ├── 4.png │ ├── 5.png │ ├── 6.png │ ├── 7.png │ ├── 8.png │ ├── 9.png │ ├── bp.png │ ├── eg1.png │ ├── eq1.png │ ├── eq2.png │ ├── importancesampling.png │ ├── nStepOnline.png │ ├── prediction.png │ ├── return.png │ ├── return1.png │ ├── sc1.png │ ├── sc2.png │ ├── update.png │ └── update1.png └── readme.md ├── ch_8_model_based ├── RL_8.ipynb ├── RL_8.slides.html ├── img │ ├── dyna.jpeg │ ├── dyna_eq.JPG │ ├── dyna_perf.JPG │ ├── dyna_perf2.JPG │ ├── dynaenvchange.JPG │ ├── dynaenvchange2.JPG │ ├── mcts0.JPG │ ├── mcts1.JPG │ ├── mcts2.JPG │ ├── mcts3.JPG │ ├── mcts4.JPG │ ├── mcts5.JPG │ ├── mctssearch1.JPG │ ├── mctssearch2.JPG │ ├── mctssteps.JPG │ ├── modelbasedplanning.JPG │ ├── psweep.JPG │ ├── psweep_ex.JPG │ ├── simmontesearch.JPG │ └── sslearning.JPG └── readme.md ├── ch_9_func_approx_1 ├── RL_FA1.ipynb ├── RL_FA1.slides.html ├── img │ ├── fa_avg1.JPG │ ├── fa_avg2.JPG │ ├── fa_avg3.JPG │ ├── fa_avg4.JPG │ ├── fa_avg5.JPG │ ├── fa_avg6.JPG │ ├── fa_avg7.JPG │ ├── fa_avg8.JPG │ ├── fa_avg9.JPG │ ├── fa_prob1.JPG │ ├── fa_prob2.JPG │ ├── fa_prob3.JPG │ ├── fa_prob4.JPG │ ├── fa_prob5.JPG │ ├── fa_prob6.JPG │ ├── fa_prob7.JPG │ ├── fa_prob8.JPG │ ├── fa_slide1.JPG │ ├── fa_slides10.JPG │ ├── fa_slides11.JPG │ ├── fa_slides12.JPG │ ├── fa_slides13.JPG │ ├── fa_slides14.JPG │ ├── fa_slides2.JPG │ ├── fa_slides3.JPG │ ├── fa_slides4.JPG │ ├── fa_slides5.JPG │ ├── fa_slides6.JPG │ ├── fa_slides7.JPG │ ├── fa_slides8.JPG │ ├── fa_slides9.JPG │ └── func_approx.JPG └── readme.md ├── img ├── break_1.png ├── motivation.png └── statement_hinton_bengio_lecun.png └── readme.md /.gitignore: 
-------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | */.ipynb_checkpoints/ 5 | # C extensions 6 | *.so 7 | ## for the website 8 | _site/ 9 | .sass-cache/ 10 | .jekyll-metadata 11 | .DS_Store 12 | Gemfile.lock 13 | 14 | # Distribution / packaging 15 | .Python 16 | env/ 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *,cover 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | 59 | # Sphinx documentation 60 | docs/_build/ 61 | 62 | # PyBuilder 63 | target/ 64 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "reveal.js"] 2 | path = reveal.js 3 | url = https://github.com/hakimel/reveal.js.git 4 | branch = master 5 | [submodule "ch_2_rl_in_non_associative/multi_arm_bandits"] 6 | path = ch_2_rl_in_non_associative/multi_arm_bandits 7 | url = https://github.com/BardOfCodes/multi_arm_bandits.git 8 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Aditya Ganeshan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/1.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/1_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/1_1.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/1_10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/1_10.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/1_11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/1_11.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/1_12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/1_12.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/1_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/1_2.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/1_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/1_3.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/1_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/1_4.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/1_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/1_5.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/1_6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/1_6.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/1_7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/1_7.png 
-------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/1_8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/1_8.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/1_9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/1_9.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/2.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/2_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/2_1.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/2_10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/2_10.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/2_11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/2_11.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/2_13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/2_13.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/2_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/2_2.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/2_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/2_3.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/2_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/2_4.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/2_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/2_5.png 
-------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/2_7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/2_7.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/2_8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/2_8.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/2_9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/2_9.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/3.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/4.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/5.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/6.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/7.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/8.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/a.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/adnet_1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/adnet_1.PNG 
-------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/adnet_2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/adnet_2.PNG -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/adnet_3.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/adnet_3.PNG -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/adnet_4.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/adnet_4.PNG -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/adnet_5.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/adnet_5.PNG -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/adnet_6.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/adnet_6.PNG -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/adnet_7.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/adnet_7.PNG -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/adnet_8.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/adnet_8.PNG -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/b.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/c.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/c.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/d.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/e.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/e.png 
-------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/f.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/f.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/x.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/y.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/y.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/z.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/z.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/readme.md: -------------------------------------------------------------------------------- 1 | #
Deep Reinforcement Learning in CV (3 papers)
2 | 3 | ## Contents: 4 | 5 | 1) **We will look at the following tasks:** 6 | * Object Detection 7 | * Paper: Caicedo, Juan C., and Svetlana Lazebnik. "Active object localization with deep reinforcement learning." Proceedings of the IEEE International Conference on Computer Vision. 2015. 8 | * Action Detection 9 | * Paper: Huang, Jingjia, et al. "A Self-Adaptive Proposal Model for Temporal Action Detection based on Reinforcement Learning." arXiv preprint arXiv:1706.07251 (2017). 10 | * Visual Tracking 11 | * Paper: Yun, Sangdoo, Jongwon Choi, Youngjoon Yoo, Kimin Yun, and Jin Young Choi. "Action-Decision Networks for Visual Tracking with Deep Reinforcement Learning." Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 2017. 12 | 13 | 2) **For each task, we answer the following:** 14 | * What is the task? 15 | * Can we identify the RL components: 16 | * State Space 17 | * Action Space 18 | * Reward System 19 | * Network Architecture 20 | * Why use RL for this task? 21 | 22 | -------------------------------------------------------------------------------- /ch_10_func_approx_2/RL_FA2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "#
Value Function Approximation
\n", 12 | "##
Part II
\n", 13 | "##
Reference: Sutton and Barto, Chapters 9-11
\n" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "slideshow": { 20 | "slide_type": "slide" 21 | } 22 | }, 23 | "source": [ 24 | "##
Table of Contents
\n", 25 | "
\n", 26 | "\n", 27 | "* **Batch Reinforcement Methods**

\n", 28 | "\n", 29 | "* **Least Squares Policy Iteration(LSPI)**

\n", 30 | "\n" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": { 36 | "slideshow": { 37 | "slide_type": "slide" 38 | } 39 | }, 40 | "source": [ 41 | "#
Batch Reinforcement Methods
" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": { 47 | "slideshow": { 48 | "slide_type": "slide" 49 | } 50 | }, 51 | "source": [ 52 | "##
Batch Reinforcement Methods
\n", 53 | "
\n", 54 | "* Gradient descent is simple and appealing

\n", 55 | "* But it is not sample efficient

\n", 56 | "* Batch methods seek to find the best fitting value function

\n", 57 | "* Given the agent’s experience (“training data”)

" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": { 63 | "slideshow": { 64 | "slide_type": "subslide" 65 | } 66 | }, 67 | "source": [ 68 | "##
Least Squares Prediction
\n", 69 | "\n", 70 | "
\"Multi-armed
" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": { 76 | "slideshow": { 77 | "slide_type": "subslide" 78 | } 79 | }, 80 | "source": [ 81 | "##
Stochastic Gradient Descent with Experience Replay
\n", 82 | "\n", 83 | "
\"Multi-armed
" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": { 89 | "slideshow": { 90 | "slide_type": "subslide" 91 | } 92 | }, 93 | "source": [ 94 | "##
Experience Replay in Deep Q-Networks (DQN)
\n", 95 | "\n", 96 | "
\"Multi-armed
" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": { 102 | "slideshow": { 103 | "slide_type": "slide" 104 | } 105 | }, 106 | "source": [ 107 | "#
DQN in ATARI
" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": { 113 | "slideshow": { 114 | "slide_type": "slide" 115 | } 116 | }, 117 | "source": [ 118 | "## The model\n", 119 | "\n", 120 | "
\"Multi-armed
" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": { 126 | "slideshow": { 127 | "slide_type": "subslide" 128 | } 129 | }, 130 | "source": [ 131 | "## Performance\n", 132 | "\n", 133 | "
\"Multi-armed
" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": { 139 | "slideshow": { 140 | "slide_type": "subslide" 141 | } 142 | }, 143 | "source": [ 144 | "## Benefits of Experience Replay and Double DQN\n", 145 | "\n", 146 | "
\"Multi-armed
" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": { 152 | "slideshow": { 153 | "slide_type": "slide" 154 | } 155 | }, 156 | "source": [ 157 | "## DQN Example and Code\n", 158 | "
\"Multi-armed
\n", 159 | "\n", 160 | "#### CartPole Example\n", 161 | "The agent has to decide between two actions - moving the cart left or right - so that the pole attached to it stays upright.\n", 162 | "\n", 163 | "##### State Space\n", 164 | "State is the difference between the current screen patch and the previous one. This will allow the agent to take the velocity of the pole into account from one image." 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": { 170 | "slideshow": { 171 | "slide_type": "subslide" 172 | } 173 | }, 174 | "source": [ 175 | "##### Q-network\n", 176 | "\n", 177 | "* Our model will be a convolutional neural network that takes in the difference between the current and previous screen patches. \n", 178 | "* It has two outputs, representing Q(s,left) and Q(s,right) (where s is the input to the network). \n", 179 | "* In effect, the network is trying to predict the quality of taking each action given the current input.\n", 180 | "
\"Multi-armed
" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": { 186 | "slideshow": { 187 | "slide_type": "subslide" 188 | } 189 | }, 190 | "source": [ 191 | "##### Replay Memory\n", 192 | "* Experience replay memory is used for training the DQN. \n", 193 | "* It stores the transitions that the agent observes, allowing us to reuse this data later. \n", 194 | "* By sampling from it randomly, the transitions that build up a batch are decorrelated. \n", 195 | "* It has been shown that this greatly stabilizes and improves the DQN training procedure.\n", 196 | "
\"Multi-armed
" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": { 202 | "slideshow": { 203 | "slide_type": "subslide" 204 | } 205 | }, 206 | "source": [ 207 | "##### Input Extraction\n", 208 | "\n", 209 | "How do we get the crop of the cart?\n", 210 | "\n", 211 | "
\"Multi-armed
" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": { 217 | "slideshow": { 218 | "slide_type": "subslide" 219 | } 220 | }, 221 | "source": [ 222 | "##### Selecting an Action\n", 223 | "\n", 224 | "This is done based on $\\epsilon$ greedy policy.\n", 225 | "
\"Multi-armed
" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": { 231 | "slideshow": { 232 | "slide_type": "subslide" 233 | } 234 | }, 235 | "source": [ 236 | "##### Training\n", 237 | "
\"Multi-armed
\n", 238 | "
\"Multi-armed
" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": { 244 | "slideshow": { 245 | "slide_type": "subslide" 246 | } 247 | }, 248 | "source": [ 249 | "
\"Multi-armed
" 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": { 255 | "slideshow": { 256 | "slide_type": "slide" 257 | } 258 | }, 259 | "source": [ 260 | "#
Linear Least Squares Prediction
" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": { 266 | "slideshow": { 267 | "slide_type": "slide" 268 | } 269 | }, 270 | "source": [ 271 | "##
Linear Least Squares Prediction
\n", 272 | "

\n", 273 | "* Experience replay finds least squares solution

\n", 274 | "* But it may take many iterations

\n", 275 | "* Using linear value function approximation $\\hat{v}(s, w) = x(s)^Tw$

\n", 276 | "* We can solve the least squares solution directly" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": { 282 | "slideshow": { 283 | "slide_type": "subslide" 284 | } 285 | }, 286 | "source": [ 287 | "##
Linear Least Squares Prediction
\n", 288 | "\n", 289 | "
\"Multi-armed
" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": { 295 | "slideshow": { 296 | "slide_type": "subslide" 297 | } 298 | }, 299 | "source": [ 300 | "##
Linear Least Squares Prediction Algorithms
\n", 301 | "\n", 302 | "
\"Multi-armed
" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": null, 308 | "metadata": { 309 | "collapsed": true 310 | }, 311 | "outputs": [], 312 | "source": [] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": { 317 | "slideshow": { 318 | "slide_type": "subslide" 319 | } 320 | }, 321 | "source": [ 322 | "##
Linear Least Squares Prediction Algorithms
\n", 323 | "\n", 324 | "
\"Multi-armed
" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": { 330 | "slideshow": { 331 | "slide_type": "subslide" 332 | } 333 | }, 334 | "source": [ 335 | "##
Least Squares Policy Iteration (LSPI)
\n", 336 | "\n", 337 | "
\"Multi-armed
" 338 | ] 339 | }, 340 | { 341 | "cell_type": "markdown", 342 | "metadata": { 343 | "slideshow": { 344 | "slide_type": "subslide" 345 | } 346 | }, 347 | "source": [ 348 | "##
Least Squares Action-Value Function Approximation
\n", 349 | "\n", 350 | "
\"Multi-armed
" 351 | ] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "metadata": { 356 | "slideshow": { 357 | "slide_type": "subslide" 358 | } 359 | }, 360 | "source": [ 361 | "##
Least Squares Control
\n", 362 | "\n", 363 | "
\"Multi-armed
" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": { 369 | "slideshow": { 370 | "slide_type": "subslide" 371 | } 372 | }, 373 | "source": [ 374 | "##
Least Squares Q-Learning
\n", 375 | "\n", 376 | "
\"Multi-armed
" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": { 382 | "slideshow": { 383 | "slide_type": "subslide" 384 | } 385 | }, 386 | "source": [ 387 | "\n", 388 | "##
Least Squares Policy Iteration (LSPI) Algorithm
\n", 389 | "\n", 390 | "
\"Multi-armed
" 391 | ] 392 | } 393 | ], 394 | "metadata": { 395 | "anaconda-cloud": {}, 396 | "celltoolbar": "Slideshow", 397 | "kernelspec": { 398 | "display_name": "Python [conda root]", 399 | "language": "python", 400 | "name": "conda-root-py" 401 | }, 402 | "language_info": { 403 | "codemirror_mode": { 404 | "name": "ipython", 405 | "version": 3 406 | }, 407 | "file_extension": ".py", 408 | "mimetype": "text/x-python", 409 | "name": "python", 410 | "nbconvert_exporter": "python", 411 | "pygments_lexer": "ipython3", 412 | "version": "3.5.2" 413 | }, 414 | "widgets": { 415 | "state": {}, 416 | "version": "1.1.2" 417 | } 418 | }, 419 | "nbformat": 4, 420 | "nbformat_minor": 2 421 | } 422 | -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/1.png -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/2.png -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/3.png -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/4.png -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/5.png -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/6.png -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/7.png -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/ex.png -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/fa2_ex1.JPG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/fa2_ex1.JPG -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/fa2_ex2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/fa2_ex2.JPG -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/fa2_ex3.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/fa2_ex3.JPG -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/fa2_slides1.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/fa2_slides1.JPG -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/fa2_slides10.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/fa2_slides10.JPG -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/fa2_slides11.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/fa2_slides11.JPG -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/fa2_slides12.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/fa2_slides12.JPG -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/fa2_slides2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/fa2_slides2.JPG -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/fa2_slides3.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/fa2_slides3.JPG -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/fa2_slides4.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/fa2_slides4.JPG -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/fa2_slides5.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/fa2_slides5.JPG 
-------------------------------------------------------------------------------- /ch_10_func_approx_2/img/fa2_slides6.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/fa2_slides6.JPG -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/fa2_slides7.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/fa2_slides7.JPG -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/fa2_slides8.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/fa2_slides8.JPG -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/fa2_slides9.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/fa2_slides9.JPG -------------------------------------------------------------------------------- /ch_10_func_approx_2/readme.md: -------------------------------------------------------------------------------- 1 | #
Value Function Approximation (Part II)
2 | 3 | ####
Reference: Chapters 9 to 11, Sutton and Barto
4 | 5 | ## Contents: 6 | 7 | 1) **Batch Reinforcement Methods** 8 | * Least Squares Prediction 9 | * SGD with Experience Replay 10 | * Experience Replay in Deep Q-Networks (DQN) 11 | * DQN in Atari Games 12 | 13 | 2) **Linear Least Squares Prediction** 14 | -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_1.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_1.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_10.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_10.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_11.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_11.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_12.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_12.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_13.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_13.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_14.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_14.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_15.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_15.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_16.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_16.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_17.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_17.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_18.JPG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_18.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_19.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_19.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_2.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_20.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_20.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_21.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_21.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_22.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_22.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_23.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_23.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_24.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_24.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_25.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_25.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_26.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_26.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_27.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_27.JPG -------------------------------------------------------------------------------- 
/ch_11_policy_gradient/img/pg_28.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_28.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_29.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_29.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_3.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_3.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_30.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_30.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_34.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_34.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_4.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_4.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_5.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_5.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_6.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_6.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_7.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_7.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_8.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_8.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_9.JPG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_9.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/sutton_1.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/sutton_1.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/sutton_2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/sutton_2.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/sutton_3.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/sutton_3.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/sutton_4.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/sutton_4.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/sutton_5.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/sutton_5.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/readme.md: -------------------------------------------------------------------------------- 1 | #
Policy Gradient Methods
2 | 3 | ## Contents: 4 | 5 | 1) **Introduction** 6 | * Policy Gradient vs Value Approximators 7 | * Advantages/Disadvantages 8 | 9 | 2) **REINFORCE: Simplest Policy Gradient Method** 10 | 11 | 3) **Actor-Critic Methods** 12 | 13 | 4) **Enhancements to Actor-Critic Method** 14 | -------------------------------------------------------------------------------- /ch_1_rl_intro/.ipynb_checkpoints/RL_1-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "#
Introduction to Reinforcement Learning
\n", 12 | "\n", 13 | "###
Reference: Chapter 1, Sutton and Barto
\n" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "slideshow": { 20 | "slide_type": "slide" 21 | } 22 | }, 23 | "source": [ 24 | "#
Contents
\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "slideshow": { 31 | "slide_type": "fragment" 32 | } 33 | }, 34 | "source": [ 35 | "* RL: Formal Definition" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": { 41 | "slideshow": { 42 | "slide_type": "fragment" 43 | } 44 | }, 45 | "source": [ 46 | "* RL vs Supervised Learning vs Unsupervised learning" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": { 52 | "slideshow": { 53 | "slide_type": "fragment" 54 | } 55 | }, 56 | "source": [ 57 | "* Important RL Perspectives\n", 58 | " * Goal (Reward Hypothesis)\n", 59 | " * Sequential Decision Making Problem\n", 60 | " * Interaction between Agent and Environment\n", 61 | " " 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": { 67 | "slideshow": { 68 | "slide_type": "fragment" 69 | } 70 | }, 71 | "source": [ 72 | "* Components of RL Agent" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": { 78 | "slideshow": { 79 | "slide_type": "fragment" 80 | } 81 | }, 82 | "source": [ 83 | "* RL Problems: Learning and Planning" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": { 89 | "slideshow": { 90 | "slide_type": "fragment" 91 | } 92 | }, 93 | "source": [ 94 | "* Prediction and Control" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": { 100 | "slideshow": { 101 | "slide_type": "slide" 102 | } 103 | }, 104 | "source": [ 105 | "#
RL: Formal Definition
" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": { 111 | "slideshow": { 112 | "slide_type": "subslide" 113 | } 114 | }, 115 | "source": [ 116 | " \"Reinforcement learning is the problem of getting an agent to act in the world so as to maximize its rewards. For example, consider teaching a dog a new trick: you cannot tell it what to do, but you can reward/punish it if it does the right/wrong thing. It has to figure out what it did that made it get the reward/punishment, which is known as the credit assignment problem.\"" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": { 122 | "slideshow": { 123 | "slide_type": "subslide" 124 | } 125 | }, 126 | "source": [ 127 | "
\"Example1\"/
" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": { 133 | "slideshow": { 134 | "slide_type": "subslide" 135 | } 136 | }, 137 | "source": [ 138 | "
\"Example2\"/
" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": { 144 | "slideshow": { 145 | "slide_type": "slide" 146 | } 147 | }, 148 | "source": [ 149 | "#
RL vs Supervised Learning vs Unsupervised Learning
" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": { 155 | "slideshow": { 156 | "slide_type": "subslide" 157 | } 158 | }, 159 | "source": [ 160 | "## Supervised Learning\n", 161 | "\n", 162 | "1) A human builds a classifier based on input and output data\n", 163 | "\n", 164 | "2) That classifier is trained with a training set of data\n", 165 | "\n", 166 | "3) That classifier is tested with a test set of data\n", 167 | "\n", 168 | "4) Deployment if the output is satisfactory\n", 169 | "\n", 170 | "To be used when, \"I know how to classify this data, I just need you(the classifier) to sort it.\"\n", 171 | "\n", 172 | "Point of method: To class labels or to produce real numbers" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": { 178 | "slideshow": { 179 | "slide_type": "subslide" 180 | } 181 | }, 182 | "source": [ 183 | "## Unsupervised Learning\n", 184 | "\n", 185 | "\n", 186 | "1) A human builds an algorithm based on input data\n", 187 | "\n", 188 | "2) That algorithm is tested with a test set of data (in which the algorithm creates the classifier)\n", 189 | "\n", 190 | "3) Deployment if the classifier is satisfactory\n", 191 | "\n", 192 | "To be used when, \"I have no idea how to classify this data, can you(the algorithm) create a classifier for me?\"\n", 193 | "Point of method: To class labels or to predict" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": { 199 | "slideshow": { 200 | "slide_type": "subslide" 201 | } 202 | }, 203 | "source": [ 204 | "## Reinforcement Learning\n", 205 | "\n", 206 | "\n", 207 | "1) A human builds an algorithm based on input data\n", 208 | "\n", 209 | "2) That algorithm presents a state dependent on the input data in which a user rewards or punishes the algorithm via the action the algorithm took, this continues over time\n", 210 | "\n", 211 | "3) That algorithm learns from the reward/punishment and updates itself, this continues\n", 212 | "\n", 213 | "4) It's always in production, it needs to learn real data to be able to present actions from states\n", 214 | "\n", 215 | "To be used when, \"I have no idea how to classify this data, can you classify this data and I'll give you a reward if it's correct or I'll punish you if it's not.\"\n" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": { 221 | "slideshow": { 222 | "slide_type": "subslide" 223 | } 224 | }, 225 | "source": [ 226 | "### RL vs Supervised Learning\n", 227 | "* Training Examples\n", 228 | " * Supervised Learning: Training Examples of the formNo training examples from a knowledgeable external supervisor (situation together with a label).\n", 229 | " * RL: No such training examples.\n", 230 | "* Objective Functions\n", 231 | " * Supervised Learning: Aim is to extrapolate, or generalize so that it acts correctly in situations not present in the training set. \n", 232 | " * In RL, it is often impractical to obtain examples of desired behavior that are both correct and representative of all the situations and an agent must be able to learn from its own experience." 
233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": { 238 | "slideshow": { 239 | "slide_type": "subslide" 240 | } 241 | }, 242 | "source": [ 243 | "### RL vs Unsupervised Learning\n", 244 | "* Unsupervised Learning is about finding structure hidden in collections of unlabeled data.\n", 245 | "* Uncovering structure in an agent’s experience can certainly be useful in reinforcement learning, but by itself does not address the reinforcement learning agent’s problem of maximizing a reward signal." 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": { 251 | "slideshow": { 252 | "slide_type": "subslide" 253 | } 254 | }, 255 | "source": [ 256 | "### Examples of RL\n", 257 | "* Fly stunt manoeuvres in a helicopter\n", 258 | "* Defeat the world champion at Backgammon\n", 259 | "* Manage an investment portfolio\n", 260 | "* Control a power station\n", 261 | "* Make a humanoid robot walk\n", 262 | "* Play many different Atari games better than humans" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": { 268 | "slideshow": { 269 | "slide_type": "slide" 270 | } 271 | }, 272 | "source": [ 273 | "#
Important RL Perspectives
" 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": { 279 | "slideshow": { 280 | "slide_type": "slide" 281 | } 282 | }, 283 | "source": [ 284 | "## Goal of RL (Reward Hypothesis)\n", 285 | "
\"RewardHypothesis\"
\n" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": { 291 | "slideshow": { 292 | "slide_type": "subslide" 293 | } 294 | }, 295 | "source": [ 296 | "## Reward Examples\n", 297 | "* stunt manoeuvres in a helicopter\n", 298 | " * +ve reward for following desired trajectory\n", 299 | " * −ve reward for crashing\n", 300 | "* Defeat the world champion at Backgammon\n", 301 | " * +/−ve reward for winning/losing a game\n", 302 | "* Manage an investment portfolio\n", 303 | " * +ve reward for each dollar in bank\n", 304 | "* Control a power station\n", 305 | " * +ve reward for producing power\n", 306 | " * −ve reward for exceeding safety thresholds\n", 307 | "* Make a humanoid robot walk\n", 308 | " * +ve reward for forward motion\n", 309 | " * −ve reward for falling over\n", 310 | "* Play many different Atari games better than humans\n", 311 | " * +/−ve reward for increasing/decreasing score" 312 | ] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": { 317 | "slideshow": { 318 | "slide_type": "slide" 319 | } 320 | }, 321 | "source": [ 322 | "## Sequential Decision Making Problem\n", 323 | "* Goal: select actions to maximise total future reward\n", 324 | "* Actions may have long term consequences\n", 325 | "* Reward may be delayed\n", 326 | "* It may be better to sacrifice immediate reward to gain more long-term reward\n", 327 | "* Examples:\n", 328 | " * A financial investment (may take months to mature)\n", 329 | " * Refuelling a helicopter (might prevent a crash in several hours)\n", 330 | " * Blocking opponent moves (might help winning chances many moves from now)\n" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": { 336 | "slideshow": { 337 | "slide_type": "slide" 338 | } 339 | }, 340 | "source": [ 341 | "## Interaction between Agent and Environment\n", 342 | "
\"RewardHypothesis\"
" 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "metadata": { 348 | "slideshow": { 349 | "slide_type": "subslide" 350 | } 351 | }, 352 | "source": [ 353 | "## Interaction between Agent and Environment\n", 354 | "
\"RewardHypothesis\"
" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": { 360 | "slideshow": { 361 | "slide_type": "subslide" 362 | } 363 | }, 364 | "source": [ 365 | "## History and State\n", 366 | "
\"HistoryandState\"
" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": { 372 | "slideshow": { 373 | "slide_type": "subslide" 374 | } 375 | }, 376 | "source": [ 377 | "## Environment State\n", 378 | "
\"HistoryandState\"
" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": { 384 | "slideshow": { 385 | "slide_type": "subslide" 386 | } 387 | }, 388 | "source": [ 389 | "## Agent State\n", 390 | "
\"HistoryandState\"
" 391 | ] 392 | }, 393 | { 394 | "cell_type": "markdown", 395 | "metadata": { 396 | "slideshow": { 397 | "slide_type": "subslide" 398 | } 399 | }, 400 | "source": [ 401 | "## Information State\n", 402 | "
\"HistoryandState\"
" 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": { 408 | "slideshow": { 409 | "slide_type": "subslide" 410 | } 411 | }, 412 | "source": [ 413 | "## Fully Observable Environment\n", 414 | "
\"HistoryandState\"
" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "metadata": { 420 | "slideshow": { 421 | "slide_type": "subslide" 422 | } 423 | }, 424 | "source": [ 425 | "## Partially Observable Environment\n", 426 | "
\"HistoryandState\"
" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "metadata": { 432 | "slideshow": { 433 | "slide_type": "slide" 434 | } 435 | }, 436 | "source": [ 437 | "## Major Components of a RL Agent\n", 438 | "An RL agent may include one or more of these components:\n", 439 | "* **Policy**: agent’s behaviour function\n", 440 | "* **Value function**: how good is each state and/or action\n", 441 | "* **Model**: agent’s representation of the environment" 442 | ] 443 | }, 444 | { 445 | "cell_type": "markdown", 446 | "metadata": { 447 | "slideshow": { 448 | "slide_type": "subslide" 449 | } 450 | }, 451 | "source": [ 452 | "## Policy\n", 453 | "* A policy is the agent’s behaviour\n", 454 | "* It is a map from state to action, e.g.\n", 455 | "* Deterministic policy: $a = π(s)$\n", 456 | "* Stochastic policy: $π(a|s) = P[A_t = a|S_t = s]$" 457 | ] 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "metadata": { 462 | "slideshow": { 463 | "slide_type": "subslide" 464 | } 465 | }, 466 | "source": [ 467 | "## Value function\n", 468 | "* Value function is a prediction of future reward\n", 469 | "* Used to evaluate the goodness/badness of states\n", 470 | "* And therefore to select between actions, e.g.\n", 471 | "$$v_π(s) = E_π[R _{t+1} + γ*R_{t+2} + γ^{2}*R_{t+3} + ... | S_t = s]$$" 472 | ] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": { 477 | "slideshow": { 478 | "slide_type": "subslide" 479 | } 480 | }, 481 | "source": [ 482 | "## Model\n", 483 | "* A model predicts what the environment will do next\n", 484 | "* **Transitions**: P predicts the next state (i.e. dynamics)\n", 485 | "* **Rewards**: R predicts the next (immediate) reward, e.g.\n", 486 | "$$ P_{ss'} = P[S_{t+1} = s' | S_t = s, A_t = a]$$\n", 487 | "$$ R^{a}_{s} = E[R_{t+1} | S_t = s, A_t = a]$$" 488 | ] 489 | }, 490 | { 491 | "cell_type": "markdown", 492 | "metadata": { 493 | "slideshow": { 494 | "slide_type": "subslide" 495 | } 496 | }, 497 | "source": [ 498 | "## Maze Example\n", 499 | "
\"HistoryandState\"
\n" 500 | ] 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "metadata": { 505 | "slideshow": { 506 | "slide_type": "subslide" 507 | } 508 | }, 509 | "source": [ 510 | "# Maze Example: Policy\n", 511 | "
\"HistoryandState\"
\n", 512 | "\n", 513 | "* Arrows represent policy $\\pi(s)$ for each state s. " 514 | ] 515 | }, 516 | { 517 | "cell_type": "markdown", 518 | "metadata": { 519 | "slideshow": { 520 | "slide_type": "subslide" 521 | } 522 | }, 523 | "source": [ 524 | "# Maze Example: Value Function\n", 525 | "
\"HistoryandState\"
" 526 | ] 527 | }, 528 | { 529 | "cell_type": "markdown", 530 | "metadata": { 531 | "slideshow": { 532 | "slide_type": "subslide" 533 | } 534 | }, 535 | "source": [ 536 | "# Maze Example: Model\n", 537 | "
\"HistoryandState\"
" 538 | ] 539 | }, 540 | { 541 | "cell_type": "markdown", 542 | "metadata": { 543 | "slideshow": { 544 | "slide_type": "slide" 545 | } 546 | }, 547 | "source": [ 548 | "## Learning and Planning\n", 549 | "Two fundamental problems in sequential decision making:\n", 550 | "* Reinforcement Learning:\n", 551 | " * The environment is initially unknown\n", 552 | " * The agent interacts with the environment\n", 553 | " * The agent improves its policy\n", 554 | "* Planning:\n", 555 | " * A model of the environment is known\n", 556 | " * The agent performs computations with its model (without any external interaction)\n", 557 | " * The agent improves its policy a.k.a. deliberation, reasoning, introspection, pondering, thought, search" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": null, 563 | "metadata": { 564 | "collapsed": true, 565 | "slideshow": { 566 | "slide_type": "slide" 567 | } 568 | }, 569 | "outputs": [], 570 | "source": [ 571 | "## Exploration and Exploitation\n", 572 | "\n", 573 | "* Reinforcement learning is like trial-and-error learning\n", 574 | "\n", 575 | "* The agent should discover a good policy,\n", 576 | " * From its experiences of the environment,\n", 577 | " * Without losing too much reward along the way\n", 578 | "\n", 579 | "* **Exploration** finds more information about the environment\n", 580 | "\n", 581 | "* **Exploitation** exploits known information to maximise reward\n", 582 | "\n", 583 | "* It is usually important to explore as well as exploit (In Detail => Chapter 2)" 584 | ] 585 | }, 586 | { 587 | "cell_type": "markdown", 588 | "metadata": { 589 | "slideshow": { 590 | "slide_type": "subslide" 591 | } 592 | }, 593 | "source": [ 594 | "## Examples of Explortion and Exploitation\n", 595 | "\n", 596 | "* Restaurant Selection\n", 597 | " * Exploitation Go to your favourite restaurant\n", 598 | " * Exploration Try a new restaurant\n", 599 | "* Online Banner Advertisements\n", 600 | " * Exploitation Show the most successful advert\n", 601 | " * Exploration Show a different advert\n", 602 | "* Oil Drilling\n", 603 | " * Exploitation Drill at the best known location\n", 604 | " * Exploration Drill at a new location\n", 605 | "* Game Playing\n", 606 | " * Exploitation Play the move you believe is best\n", 607 | " * Exploration Play an experimental move" 608 | ] 609 | }, 610 | { 611 | "cell_type": "markdown", 612 | "metadata": { 613 | "slideshow": { 614 | "slide_type": "slide" 615 | } 616 | }, 617 | "source": [ 618 | "## Summary\n", 619 | "\n", 620 | "* We got introduced to the basic terminologies of RL.\n", 621 | "\n", 622 | "\n", 623 | "* We got an intuition behind how an RL agent can solve problems.\n", 624 | "\n" 625 | ] 626 | } 627 | ], 628 | "metadata": { 629 | "celltoolbar": "Slideshow", 630 | "kernelspec": { 631 | "display_name": "Python 2", 632 | "language": "python", 633 | "name": "python2" 634 | }, 635 | "language_info": { 636 | "codemirror_mode": { 637 | "name": "ipython", 638 | "version": 2 639 | }, 640 | "file_extension": ".py", 641 | "mimetype": "text/x-python", 642 | "name": "python", 643 | "nbconvert_exporter": "python", 644 | "pygments_lexer": "ipython2", 645 | "version": "2.7.12" 646 | } 647 | }, 648 | "nbformat": 4, 649 | "nbformat_minor": 2 650 | } 651 | -------------------------------------------------------------------------------- /ch_1_rl_intro/.ipynb_checkpoints/readme-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": 
"markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction to Reinforcement Learning" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Reference: Chapter 1, Sutton and Barto" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Contents:" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "1) **RL: Formal Definition**\n", 29 | "* Recent Examples\n", 30 | "\n", 31 | "2) **RL vs Supervised Learning vs Unsupervised learning**\n", 32 | "\n", 33 | "3) **Important RL Perspectives**\n", 34 | "* Goal (Reward Hypothesis)\n", 35 | "* Sequential Decision Making Problem\n", 36 | "* Interaction between Agent and Environment\n", 37 | "\n", 38 | "4) **Components of RL Agent**\n", 39 | "* Policy\n", 40 | "* Value Function\n", 41 | "* Model\n", 42 | "\n", 43 | "5) **RL Problems: Learning and Planning**\n", 44 | "\n", 45 | "6) **Exploration vs Exploitation**\n", 46 | "\n", 47 | "7) **Prediction and Control**\n" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "## Summary" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "* We got introduced to the basic terminologies of RL.\n", 62 | "\n", 63 | "\n", 64 | "* We saw how Reinforcement learning is different from other forms of learning.\n", 65 | "\n" 66 | ] 67 | } 68 | ], 69 | "metadata": { 70 | "kernelspec": { 71 | "display_name": "Python 2", 72 | "language": "python", 73 | "name": "python2" 74 | }, 75 | "language_info": { 76 | "codemirror_mode": { 77 | "name": "ipython", 78 | "version": 2 79 | }, 80 | "file_extension": ".py", 81 | "mimetype": "text/x-python", 82 | "name": "python", 83 | "nbconvert_exporter": "python", 84 | "pygments_lexer": "ipython2", 85 | "version": "2.7.12" 86 | } 87 | }, 88 | "nbformat": 4, 89 | "nbformat_minor": 2 90 | } 91 | -------------------------------------------------------------------------------- /ch_1_rl_intro/RL_1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "#
Introduction to Reinforcement Learning
\n", 12 | "\n", 13 | "###
Reference: Chapter 1, Sutton and Barto
\n" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "slideshow": { 20 | "slide_type": "slide" 21 | } 22 | }, 23 | "source": [ 24 | "#
Contents
\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "slideshow": { 31 | "slide_type": "fragment" 32 | } 33 | }, 34 | "source": [ 35 | "* RL: Formal Definition" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": { 41 | "slideshow": { 42 | "slide_type": "fragment" 43 | } 44 | }, 45 | "source": [ 46 | "* RL vs Supervised Learning vs Unsupervised learning" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": { 52 | "slideshow": { 53 | "slide_type": "fragment" 54 | } 55 | }, 56 | "source": [ 57 | "* Important RL Perspectives\n", 58 | " * Goal (Reward Hypothesis)\n", 59 | " * Sequential Decision Making Problem\n", 60 | " * Interaction between Agent and Environment\n", 61 | " " 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": { 67 | "slideshow": { 68 | "slide_type": "fragment" 69 | } 70 | }, 71 | "source": [ 72 | "* Components of RL Agent" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": { 78 | "slideshow": { 79 | "slide_type": "fragment" 80 | } 81 | }, 82 | "source": [ 83 | "* RL Problems: Learning and Planning" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": { 89 | "slideshow": { 90 | "slide_type": "fragment" 91 | } 92 | }, 93 | "source": [ 94 | "* Prediction and Control" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": { 100 | "slideshow": { 101 | "slide_type": "slide" 102 | } 103 | }, 104 | "source": [ 105 | "#
RL: Formal Definition
" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": { 111 | "slideshow": { 112 | "slide_type": "subslide" 113 | } 114 | }, 115 | "source": [ 116 | " \"Reinforcement learning is the problem of getting an agent to act in the world so as to maximize its rewards. For example, consider teaching a dog a new trick: you cannot tell it what to do, but you can reward/punish it if it does the right/wrong thing. It has to figure out what it did that made it get the reward/punishment, which is known as the credit assignment problem.\"" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": { 122 | "slideshow": { 123 | "slide_type": "subslide" 124 | } 125 | }, 126 | "source": [ 127 | "
\"Example1\"/
" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": { 133 | "slideshow": { 134 | "slide_type": "subslide" 135 | } 136 | }, 137 | "source": [ 138 | "
\"Example2\"/
" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": { 144 | "slideshow": { 145 | "slide_type": "slide" 146 | } 147 | }, 148 | "source": [ 149 | "#
RL vs Supervised Learning vs Unsupervised Learning
" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": { 155 | "slideshow": { 156 | "slide_type": "subslide" 157 | } 158 | }, 159 | "source": [ 160 | "## Supervised Learning\n", 161 | "\n", 162 | "1) A human builds a classifier based on input and output data\n", 163 | "\n", 164 | "2) That classifier is trained with a training set of data\n", 165 | "\n", 166 | "3) That classifier is tested with a test set of data\n", 167 | "\n", 168 | "4) Deployment if the output is satisfactory\n", 169 | "\n", 170 | "To be used when, \"I know how to classify this data, I just need you(the classifier) to sort it.\"\n", 171 | "\n", 172 | "Point of method: To class labels or to produce real numbers" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": { 178 | "slideshow": { 179 | "slide_type": "subslide" 180 | } 181 | }, 182 | "source": [ 183 | "## Unsupervised Learning\n", 184 | "\n", 185 | "\n", 186 | "1) A human builds an algorithm based on input data\n", 187 | "\n", 188 | "2) That algorithm is tested with a test set of data (in which the algorithm creates the classifier)\n", 189 | "\n", 190 | "3) Deployment if the classifier is satisfactory\n", 191 | "\n", 192 | "To be used when, \"I have no idea how to classify this data, can you(the algorithm) create a classifier for me?\"\n", 193 | "Point of method: To class labels or to predict" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": { 199 | "slideshow": { 200 | "slide_type": "subslide" 201 | } 202 | }, 203 | "source": [ 204 | "## Reinforcement Learning\n", 205 | "\n", 206 | "\n", 207 | "1) A human builds an algorithm based on input data\n", 208 | "\n", 209 | "2) That algorithm presents a state dependent on the input data in which a user rewards or punishes the algorithm via the action the algorithm took, this continues over time\n", 210 | "\n", 211 | "3) That algorithm learns from the reward/punishment and updates itself, this continues\n", 212 | "\n", 213 | "4) It's always in production, it needs to learn real data to be able to present actions from states\n", 214 | "\n", 215 | "To be used when, \"I have no idea how to classify this data, can you classify this data and I'll give you a reward if it's correct or I'll punish you if it's not.\"\n" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": { 221 | "slideshow": { 222 | "slide_type": "subslide" 223 | } 224 | }, 225 | "source": [ 226 | "### RL vs Supervised Learning\n", 227 | "* Training Examples\n", 228 | " * Supervised Learning: Training Examples of the formNo training examples from a knowledgeable external supervisor (situation together with a label).\n", 229 | " * RL: No such training examples.\n", 230 | "* Objective Functions\n", 231 | " * Supervised Learning: Aim is to extrapolate, or generalize so that it acts correctly in situations not present in the training set. \n", 232 | " * In RL, it is often impractical to obtain examples of desired behavior that are both correct and representative of all the situations and an agent must be able to learn from its own experience." 
233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": { 238 | "slideshow": { 239 | "slide_type": "subslide" 240 | } 241 | }, 242 | "source": [ 243 | "### RL vs Unsupervised Learning\n", 244 | "* Unsupervised Learning is about finding structure hidden in collections of unlabeled data.\n", 245 | "* Uncovering structure in an agent’s experience can certainly be useful in reinforcement learning, but by itself does not address the reinforcement learning agent’s problem of maximizing a reward signal." 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": { 251 | "slideshow": { 252 | "slide_type": "subslide" 253 | } 254 | }, 255 | "source": [ 256 | "### Examples of RL\n", 257 | "* Fly stunt manoeuvres in a helicopter\n", 258 | "* Defeat the world champion at Backgammon\n", 259 | "* Manage an investment portfolio\n", 260 | "* Control a power station\n", 261 | "* Make a humanoid robot walk\n", 262 | "* Play many different Atari games better than humans" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": { 268 | "slideshow": { 269 | "slide_type": "slide" 270 | } 271 | }, 272 | "source": [ 273 | "#
Important RL Perspectives
" 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": { 279 | "slideshow": { 280 | "slide_type": "slide" 281 | } 282 | }, 283 | "source": [ 284 | "## Goal of RL (Reward Hypothesis)\n", 285 | "
\"RewardHypothesis\"
\n" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": { 291 | "slideshow": { 292 | "slide_type": "subslide" 293 | } 294 | }, 295 | "source": [ 296 | "## Reward Examples\n", 297 | "* stunt manoeuvres in a helicopter\n", 298 | " * +ve reward for following desired trajectory\n", 299 | " * −ve reward for crashing\n", 300 | "* Defeat the world champion at Backgammon\n", 301 | " * +/−ve reward for winning/losing a game\n", 302 | "* Manage an investment portfolio\n", 303 | " * +ve reward for each dollar in bank\n", 304 | "* Control a power station\n", 305 | " * +ve reward for producing power\n", 306 | " * −ve reward for exceeding safety thresholds\n", 307 | "* Make a humanoid robot walk\n", 308 | " * +ve reward for forward motion\n", 309 | " * −ve reward for falling over\n", 310 | "* Play many different Atari games better than humans\n", 311 | " * +/−ve reward for increasing/decreasing score" 312 | ] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": { 317 | "slideshow": { 318 | "slide_type": "slide" 319 | } 320 | }, 321 | "source": [ 322 | "## Sequential Decision Making Problem\n", 323 | "* Goal: select actions to maximise total future reward\n", 324 | "* Actions may have long term consequences\n", 325 | "* Reward may be delayed\n", 326 | "* It may be better to sacrifice immediate reward to gain more long-term reward\n", 327 | "* Examples:\n", 328 | " * A financial investment (may take months to mature)\n", 329 | " * Refuelling a helicopter (might prevent a crash in several hours)\n", 330 | " * Blocking opponent moves (might help winning chances many moves from now)\n" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": { 336 | "slideshow": { 337 | "slide_type": "slide" 338 | } 339 | }, 340 | "source": [ 341 | "## Interaction between Agent and Environment\n", 342 | "
\"RewardHypothesis\"
" 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "metadata": { 348 | "slideshow": { 349 | "slide_type": "subslide" 350 | } 351 | }, 352 | "source": [ 353 | "## Interaction between Agent and Environment\n", 354 | "
\"RewardHypothesis\"
" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": { 360 | "slideshow": { 361 | "slide_type": "subslide" 362 | } 363 | }, 364 | "source": [ 365 | "## History and State\n", 366 | "
\"HistoryandState\"
" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": { 372 | "slideshow": { 373 | "slide_type": "subslide" 374 | } 375 | }, 376 | "source": [ 377 | "## Environment State\n", 378 | "
\"HistoryandState\"
" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": { 384 | "slideshow": { 385 | "slide_type": "subslide" 386 | } 387 | }, 388 | "source": [ 389 | "## Agent State\n", 390 | "
\"HistoryandState\"
" 391 | ] 392 | }, 393 | { 394 | "cell_type": "markdown", 395 | "metadata": { 396 | "slideshow": { 397 | "slide_type": "subslide" 398 | } 399 | }, 400 | "source": [ 401 | "## Information State\n", 402 | "
\"HistoryandState\"
" 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": { 408 | "slideshow": { 409 | "slide_type": "subslide" 410 | } 411 | }, 412 | "source": [ 413 | "## Fully Observable Environment\n", 414 | "
\"HistoryandState\"
" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "metadata": { 420 | "slideshow": { 421 | "slide_type": "subslide" 422 | } 423 | }, 424 | "source": [ 425 | "## Partially Observable Environment\n", 426 | "
\"HistoryandState\"
" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "metadata": { 432 | "slideshow": { 433 | "slide_type": "slide" 434 | } 435 | }, 436 | "source": [ 437 | "## Major Components of a RL Agent\n", 438 | "An RL agent may include one or more of these components:\n", 439 | "* **Policy**: agent’s behaviour function\n", 440 | "* **Value function**: how good is each state and/or action\n", 441 | "* **Model**: agent’s representation of the environment" 442 | ] 443 | }, 444 | { 445 | "cell_type": "markdown", 446 | "metadata": { 447 | "slideshow": { 448 | "slide_type": "subslide" 449 | } 450 | }, 451 | "source": [ 452 | "## Policy\n", 453 | "* A policy is the agent’s behaviour\n", 454 | "* It is a map from state to action, e.g.\n", 455 | "* Deterministic policy: $a = π(s)$\n", 456 | "* Stochastic policy: $π(a|s) = P[A_t = a|S_t = s]$" 457 | ] 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "metadata": { 462 | "slideshow": { 463 | "slide_type": "subslide" 464 | } 465 | }, 466 | "source": [ 467 | "## Value function\n", 468 | "* Value function is a prediction of future reward\n", 469 | "* Used to evaluate the goodness/badness of states\n", 470 | "* And therefore to select between actions, e.g.\n", 471 | "$$v_π(s) = E_π[R _{t+1} + γ*R_{t+2} + γ^{2}*R_{t+3} + ... | S_t = s]$$" 472 | ] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": { 477 | "slideshow": { 478 | "slide_type": "subslide" 479 | } 480 | }, 481 | "source": [ 482 | "## Model\n", 483 | "* A model predicts what the environment will do next\n", 484 | "* **Transitions**: P predicts the next state (i.e. dynamics)\n", 485 | "* **Rewards**: R predicts the next (immediate) reward, e.g.\n", 486 | "$$ P_{ss'} = P[S_{t+1} = s' | S_t = s, A_t = a]$$\n", 487 | "$$ R^{a}_{s} = E[R_{t+1} | S_t = s, A_t = a]$$" 488 | ] 489 | }, 490 | { 491 | "cell_type": "markdown", 492 | "metadata": { 493 | "slideshow": { 494 | "slide_type": "subslide" 495 | } 496 | }, 497 | "source": [ 498 | "## Maze Example\n", 499 | "
\"HistoryandState\"
\n" 500 | ] 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "metadata": { 505 | "slideshow": { 506 | "slide_type": "subslide" 507 | } 508 | }, 509 | "source": [ 510 | "# Maze Example: Policy\n", 511 | "
\"HistoryandState\"
\n", 512 | "\n", 513 | "* Arrows represent policy $\\pi(s)$ for each state s. " 514 | ] 515 | }, 516 | { 517 | "cell_type": "markdown", 518 | "metadata": { 519 | "slideshow": { 520 | "slide_type": "subslide" 521 | } 522 | }, 523 | "source": [ 524 | "# Maze Example: Value Function\n", 525 | "
\"HistoryandState\"
" 526 | ] 527 | }, 528 | { 529 | "cell_type": "markdown", 530 | "metadata": { 531 | "slideshow": { 532 | "slide_type": "subslide" 533 | } 534 | }, 535 | "source": [ 536 | "# Maze Example: Model\n", 537 | "
\"HistoryandState\"
" 538 | ] 539 | }, 540 | { 541 | "cell_type": "markdown", 542 | "metadata": { 543 | "slideshow": { 544 | "slide_type": "slide" 545 | } 546 | }, 547 | "source": [ 548 | "## Learning and Planning\n", 549 | "Two fundamental problems in sequential decision making:\n", 550 | "* Reinforcement Learning:\n", 551 | " * The environment is initially unknown\n", 552 | " * The agent interacts with the environment\n", 553 | " * The agent improves its policy\n", 554 | "* Planning:\n", 555 | " * A model of the environment is known\n", 556 | " * The agent performs computations with its model (without any external interaction)\n", 557 | " * The agent improves its policy a.k.a. deliberation, reasoning, introspection, pondering, thought, search" 558 | ] 559 | }, 560 | { 561 | "cell_type": "markdown", 562 | "metadata": { 563 | "slideshow": { 564 | "slide_type": "slide" 565 | } 566 | }, 567 | "source": [ 568 | "## Exploration and Exploitation\n", 569 | "\n", 570 | "* Reinforcement learning is like trial-and-error learning\n", 571 | "\n", 572 | "\n", 573 | "* The agent should discover a good policy,\n", 574 | " * From its experiences of the environment,\n", 575 | " * Without losing too much reward along the way\n", 576 | "\n", 577 | "\n", 578 | "* **Exploration** finds more information about the environment\n", 579 | "\n", 580 | "\n", 581 | "* **Exploitation** exploits known information to maximise reward\n", 582 | "\n", 583 | "\n", 584 | "* It is usually important to explore as well as exploit (In Detail => Chapter 2)." 585 | ] 586 | }, 587 | { 588 | "cell_type": "markdown", 589 | "metadata": { 590 | "slideshow": { 591 | "slide_type": "subslide" 592 | } 593 | }, 594 | "source": [ 595 | "## Examples of Explortion and Exploitation\n", 596 | "\n", 597 | "* Restaurant Selection\n", 598 | " * Exploitation Go to your favourite restaurant\n", 599 | " * Exploration Try a new restaurant\n", 600 | "* Online Banner Advertisements\n", 601 | " * Exploitation Show the most successful advert\n", 602 | " * Exploration Show a different advert\n", 603 | "* Oil Drilling\n", 604 | " * Exploitation Drill at the best known location\n", 605 | " * Exploration Drill at a new location\n", 606 | "* Game Playing\n", 607 | " * Exploitation Play the move you believe is best\n", 608 | " * Exploration Play an experimental move" 609 | ] 610 | }, 611 | { 612 | "cell_type": "markdown", 613 | "metadata": { 614 | "slideshow": { 615 | "slide_type": "slide" 616 | } 617 | }, 618 | "source": [ 619 | "## Summary\n", 620 | "\n", 621 | "* We got introduced to the basic terminologies of RL.\n", 622 | "\n", 623 | "\n", 624 | "* We got an intuition behind how an RL agent can solve problems.\n", 625 | "\n" 626 | ] 627 | } 628 | ], 629 | "metadata": { 630 | "celltoolbar": "Slideshow", 631 | "kernelspec": { 632 | "display_name": "Python 2", 633 | "language": "python", 634 | "name": "python2" 635 | }, 636 | "language_info": { 637 | "codemirror_mode": { 638 | "name": "ipython", 639 | "version": 2 640 | }, 641 | "file_extension": ".py", 642 | "mimetype": "text/x-python", 643 | "name": "python", 644 | "nbconvert_exporter": "python", 645 | "pygments_lexer": "ipython2", 646 | "version": "2.7.12" 647 | } 648 | }, 649 | "nbformat": 4, 650 | "nbformat_minor": 2 651 | } 652 | -------------------------------------------------------------------------------- /ch_1_rl_intro/img/1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_1_rl_intro/img/1.png -------------------------------------------------------------------------------- /ch_1_rl_intro/img/10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_1_rl_intro/img/10.png -------------------------------------------------------------------------------- /ch_1_rl_intro/img/11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_1_rl_intro/img/11.png -------------------------------------------------------------------------------- /ch_1_rl_intro/img/12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_1_rl_intro/img/12.png -------------------------------------------------------------------------------- /ch_1_rl_intro/img/13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_1_rl_intro/img/13.png -------------------------------------------------------------------------------- /ch_1_rl_intro/img/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_1_rl_intro/img/2.png -------------------------------------------------------------------------------- /ch_1_rl_intro/img/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_1_rl_intro/img/3.png -------------------------------------------------------------------------------- /ch_1_rl_intro/img/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_1_rl_intro/img/6.png -------------------------------------------------------------------------------- /ch_1_rl_intro/img/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_1_rl_intro/img/7.png -------------------------------------------------------------------------------- /ch_1_rl_intro/img/8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_1_rl_intro/img/8.png -------------------------------------------------------------------------------- /ch_1_rl_intro/img/9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_1_rl_intro/img/9.png -------------------------------------------------------------------------------- /ch_1_rl_intro/img/e1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_1_rl_intro/img/e1.png 
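Editorial aside, placed between this chapter's image assets: the agent-environment loop that the ch_1 figures depict (the agent observes a state, selects an action, and the environment returns a reward and the next state) fits in a few lines. The toy environment, its states, and its rewards below are hypothetical and not part of the repository.

```python
# Illustrative only: one episode of the agent-environment interaction loop.
# A toy chain environment with states 0..3; reaching state 3 ends the
# episode with reward +1. The random agent stands in for a learned policy.
import random

def env_step(state, action):
    next_state = min(state + 1, 3) if action == "right" else max(state - 1, 0)
    reward = 1.0 if next_state == 3 else 0.0
    done = next_state == 3
    return next_state, reward, done

def agent_policy(state):
    return random.choice(["left", "right"])   # uniformly random behaviour

state, episode_return, done = 0, 0.0, False
while not done:
    action = agent_policy(state)
    state, reward, done = env_step(state, action)
    episode_return += reward
print("return of this episode:", episode_return)
```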
-------------------------------------------------------------------------------- /ch_1_rl_intro/img/e2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_1_rl_intro/img/e2.png -------------------------------------------------------------------------------- /ch_1_rl_intro/img/e3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_1_rl_intro/img/e3.png -------------------------------------------------------------------------------- /ch_1_rl_intro/img/e4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_1_rl_intro/img/e4.png -------------------------------------------------------------------------------- /ch_1_rl_intro/readme.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction to Reinforcement Learning" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Reference: Chapter 1, Sutton and Barto" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Contents:" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "1) **RL: Formal Definition**\n", 29 | "* Recent Examples\n", 30 | "\n", 31 | "2) **RL vs Supervised Learning vs Unsupervised learning**\n", 32 | "\n", 33 | "3) **Important RL Perspectives**\n", 34 | "* Goal (Reward Hypothesis)\n", 35 | "* Sequential Decision Making Problem\n", 36 | "* Interaction between Agent and Environment\n", 37 | "\n", 38 | "4) **Components of RL Agent**\n", 39 | "* Policy\n", 40 | "* Value Function\n", 41 | "* Model\n", 42 | "\n", 43 | "5) **RL Problems: Learning and Planning**\n", 44 | "\n", 45 | "6) **Exploration vs Exploitation**\n", 46 | "\n", 47 | "7) **Prediction and Control**\n" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "## Summary" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "* We got introduced to the basic terminologies of RL.\n", 62 | "\n", 63 | "\n", 64 | "* We saw how Reinforcement learning is different from other forms of learning.\n", 65 | "\n" 66 | ] 67 | } 68 | ], 69 | "metadata": { 70 | "kernelspec": { 71 | "display_name": "Python 2", 72 | "language": "python", 73 | "name": "python2" 74 | }, 75 | "language_info": { 76 | "codemirror_mode": { 77 | "name": "ipython", 78 | "version": 2 79 | }, 80 | "file_extension": ".py", 81 | "mimetype": "text/x-python", 82 | "name": "python", 83 | "nbconvert_exporter": "python", 84 | "pygments_lexer": "ipython2", 85 | "version": "2.7.12" 86 | } 87 | }, 88 | "nbformat": 4, 89 | "nbformat_minor": 2 90 | } 91 | -------------------------------------------------------------------------------- /ch_1_rl_intro/readme.md: -------------------------------------------------------------------------------- 1 | 2 | # Introduction to Reinforcement Learning 3 | 4 | ### Reference: Chapter 1, Sutton and Barto 5 | 6 | ## Contents: 7 | 8 | 1) **RL: Formal Definition** 9 | * Recent Examples 10 | 11 | 2) **RL vs Supervised Learning vs Unsupervised learning** 12 
| 13 | 3) **Important RL Perspectives** 14 | * Goal (Reward Hypothesis) 15 | * Sequential Decision Making Problem 16 | * Interaction between Agent and Environment 17 | 18 | 4) **Components of RL Agent** 19 | * Policy 20 | * Value Function 21 | * Model 22 | 23 | 5) **RL Problems: Learning and Planning** 24 | 25 | 6) **Exploration vs Exploitation** 26 | 27 | 7) **Prediction and Control** 28 | 29 | -------------------------------------------------------------------------------- /ch_2_rl_in_non_associative/img/UCB.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_2_rl_in_non_associative/img/UCB.JPG -------------------------------------------------------------------------------- /ch_2_rl_in_non_associative/img/com_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_2_rl_in_non_associative/img/com_2.jpg -------------------------------------------------------------------------------- /ch_2_rl_in_non_associative/img/grad.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_2_rl_in_non_associative/img/grad.jpg -------------------------------------------------------------------------------- /ch_2_rl_in_non_associative/img/greedyvs.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_2_rl_in_non_associative/img/greedyvs.jpg -------------------------------------------------------------------------------- /ch_2_rl_in_non_associative/img/mistake.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_2_rl_in_non_associative/img/mistake.JPG -------------------------------------------------------------------------------- /ch_2_rl_in_non_associative/img/mistake_1.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_2_rl_in_non_associative/img/mistake_1.JPG -------------------------------------------------------------------------------- /ch_2_rl_in_non_associative/img/mistake_2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_2_rl_in_non_associative/img/mistake_2.JPG -------------------------------------------------------------------------------- /ch_2_rl_in_non_associative/img/multiarmedbandit.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_2_rl_in_non_associative/img/multiarmedbandit.jpg -------------------------------------------------------------------------------- /ch_2_rl_in_non_associative/img/optinit.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_2_rl_in_non_associative/img/optinit.JPG 
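Editorial aside: the ch_2 figures listed here (greedy vs ε-greedy, optimistic initial values, the testbed) and the chapter readme that follows cover action-value methods for the k-armed bandit. Below is a minimal sketch of ε-greedy action selection with incremental sample-average estimates; the arm means, ε, and step count are made up, and the code is illustrative rather than taken from the repository.

```python
# Illustrative only: epsilon-greedy action selection with sample-average
# value estimates Q(a), updated incrementally as Q(a) += (R - Q(a)) / N(a).
import random

def run_bandit(true_means, epsilon=0.1, steps=1000):
    k = len(true_means)
    q = [0.0] * k                     # estimated action values Q(a)
    n = [0] * k                       # pull counts N(a)
    total_reward = 0.0
    for _ in range(steps):
        if random.random() < epsilon:
            a = random.randrange(k)                    # explore
        else:
            a = max(range(k), key=lambda i: q[i])      # exploit (greedy)
        reward = random.gauss(true_means[a], 1.0)      # noisy reward
        n[a] += 1
        q[a] += (reward - q[a]) / n[a]                 # incremental average
        total_reward += reward
    return q, total_reward / steps

estimates, avg_reward = run_bandit([0.2, 0.8, 0.5])
print(estimates, avg_reward)   # Q should roughly track the true means
```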
-------------------------------------------------------------------------------- /ch_2_rl_in_non_associative/img/testbed.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_2_rl_in_non_associative/img/testbed.JPG -------------------------------------------------------------------------------- /ch_2_rl_in_non_associative/readme.md: -------------------------------------------------------------------------------- 1 | #
Reinforcement Learning in the Non-Associative Setting
2 | 3 | ####
Reference: Chapter 2, Sutton and Barto
4 | 5 | ## Contents: 6 | 7 | 1) **Introduction** 8 | * Non-Associative Setting? 9 | * Examples? 10 | 11 | 2) **Multi-arm Bandit Problems** 12 | * k-armed bandit problem 13 | * expected reward 14 | * exploration vs exploitation 15 | 16 | 3) **Action Value Methods** 17 | * Sample Average method 18 | * greedy and $\epsilon$ greedy 19 | * The test-bed 20 | * Non-stationary problems 21 | 22 | 4) **Improving Exploration in Simple Bandit Problem** 23 | * Optimistic Initial Values 24 | * Upper-Confidence Bound Action Selection 25 | * Gradient Bandit Algorithm 26 | -------------------------------------------------------------------------------- /ch_2_rl_in_non_associative/references.md: -------------------------------------------------------------------------------- 1 | # Bibliography 2 | 3 | 1) Non-associative Learning: [wiki](https://www.wikiwand.com/en/Learning),[better than wiki](https://www.britannica.com/topic/animal-learning/Types-of-learning#ref320590). 4 | 5 | 2) Multi-arm Bandit Problem: [wiki](https://www.wikiwand.com/en/Multi-armed_bandit),[better than wiki](http://blog.thedataincubator.com/2016/07/multi-armed-bandits-2/). 6 | 7 | 3) Action Value methods:[Paper Comparing various methods(also evolutionary solutions)](https://link.springer.com/article/10.1007/s12351-008-0007-5). 8 | 9 | 4) Exploration vs exploitation:[David Silver's Slides(Includes formuation of regret)](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/XX.pdf) 10 | 11 | 5) More Bandit Algorithms:[Book/wesite by Csaba Szepesvari](http://banditalgs.com/) 12 | 13 | 6) Real life examples of MAB: [practo](https://www.youtube.com/watch?v=B1l_juzrw7Q),[Google Analytics A/B testing](https://support.google.com/analytics/answer/2844870?hl=en) 14 | 15 | 16 | ### Globally valuable content: 17 | 18 | 1) [Princeton Slides on Exploration & Exploitation in Reinforcement Learning](https://www.cs.princeton.edu/courses/archive/fall16/cos402/lectures/402-lec22.pdf). 19 | 20 | 2) [Michael Herrmann's Slides on Multi-arm Bandits(University of Edinburgh)](http://www.inf.ed.ac.uk/teaching/courses/rl/slides15/rl02.pdf). 21 | 22 | 3) [David Silver's Slides on Exploration and Exploitation](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/XX.pdf). 23 | 24 | 4) [The non-stochastic multi-armed bandit problem-Peter Auer](https://cseweb.ucsd.edu/~yfreund/papers/bandits.pdf). 25 | 26 | 5) [Chapter-2 of Reinforcement Learning, An Introduction](ufal.mff.cuni.cz/~straka/courses/npfl114/2016/sutton-bookdraft2016sep.pdf). -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/.ipynb_checkpoints/RL_3 - Copy-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "#
Finite Markov Decision Processes
\n", 12 | "\n", 13 | "###
Reference: Chapter 3, Sutton and Barto
" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "slideshow": { 20 | "slide_type": "slide" 21 | } 22 | }, 23 | "source": [ 24 | "#
Contents
\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "slideshow": { 31 | "slide_type": "fragment" 32 | } 33 | }, 34 | "source": [ 35 | "* Why MDPs?" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": { 41 | "slideshow": { 42 | "slide_type": "fragment" 43 | } 44 | }, 45 | "source": [ 46 | "* Markov Property" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": { 52 | "slideshow": { 53 | "slide_type": "fragment" 54 | } 55 | }, 56 | "source": [ 57 | "* Building Blocks of MDP\n", 58 | " * Episodic vs Continuous Tasks\n", 59 | " * State Transition Matrix\n", 60 | " * Return\n", 61 | " * Discount\n", 62 | " * Value Function\n", 63 | " " 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": { 69 | "slideshow": { 70 | "slide_type": "fragment" 71 | } 72 | }, 73 | "source": [ 74 | "* MDP Parameters\n", 75 | " * Policy in MDP notations\n", 76 | " * Value Functions in MDP notations" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": { 82 | "slideshow": { 83 | "slide_type": "fragment" 84 | } 85 | }, 86 | "source": [ 87 | "* Bellman Expectation Equations" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": { 93 | "slideshow": { 94 | "slide_type": "fragment" 95 | } 96 | }, 97 | "source": [ 98 | "* Bellman Optimal Equations\n" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": { 109 | "slideshow": { 110 | "slide_type": "slide" 111 | } 112 | }, 113 | "source": [ 114 | "##
The Agent-Environment Interface
\n", 115 | "\n", 116 | "\n", 117 | "
\"MarkovProperty
" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": { 123 | "slideshow": { 124 | "slide_type": "subslide" 125 | } 126 | }, 127 | "source": [ 128 | "* Markov decision processes formally **describe an environment** for reinforcement learning\n", 129 | "* Where the environment is **fully observable**\n", 130 | "* i.e. The **current state** completely characterises the process\n", 131 | "* Almost all RL problems can be formalised as MDPs, e.g.\n", 132 | " * Optimal control primarily deals with continuous MDPs\n", 133 | " * Partially observable problems can be converted into MDPs\n", 134 | " * Bandits are MDPs with one state" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": { 140 | "slideshow": { 141 | "slide_type": "slide" 142 | } 143 | }, 144 | "source": [ 145 | "#
Markov Property
" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": { 151 | "slideshow": { 152 | "slide_type": "subslide" 153 | } 154 | }, 155 | "source": [ 156 | "“The future is independent of the past given the present”\n", 157 | "
\"MarkovProperty
\n", 158 | "\n", 159 | "* The state captures all relevant information from the history\n", 160 | "* Once the state is known, the history may be thrown away\n", 161 | "* i.e. The state is a sufficient statistic of the future" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": { 167 | "slideshow": { 168 | "slide_type": "slide" 169 | } 170 | }, 171 | "source": [ 172 | "#
Building Blocks of MDP
" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": { 178 | "slideshow": { 179 | "slide_type": "subslide" 180 | } 181 | }, 182 | "source": [ 183 | "## Episodic vs Continuing Tasks\n", 184 | "\n", 185 | "### Episodic Tasks\n", 186 | "* Each episode ends in a special state called the terminal state, \n", 187 | "* Followed by a reset to a standard starting state or to a sample from a standard distribution of starting states. \n", 188 | "\n", 189 | "### Continuing Tasks\n", 190 | "\n", 191 | "* The agent–environment interaction does not break naturally into identifiable episodes.\n", 192 | "* It goes on continually without limit. " 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": { 198 | "slideshow": { 199 | "slide_type": "subslide" 200 | } 201 | }, 202 | "source": [ 203 | "## Unified Notation for Episodic and Continuous Tasks\n", 204 | "\n", 205 | "\n", 206 | "\n" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": { 212 | "slideshow": { 213 | "slide_type": "fragment" 214 | } 215 | }, 216 | "source": [ 217 | "#### Return for Episodic Tasks\n", 218 | "sum over a finite number of terms" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": { 224 | "slideshow": { 225 | "slide_type": "fragment" 226 | } 227 | }, 228 | "source": [ 229 | "#### Return for Continuous Tasks \n", 230 | "sum over an infinite number of terms" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": { 236 | "slideshow": { 237 | "slide_type": "fragment" 238 | } 239 | }, 240 | "source": [ 241 | "We need one convention to obtain a single notation that covers both episodic and continuing tasks.\n", 242 | "\n", 243 | "How to do that?" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": { 249 | "slideshow": { 250 | "slide_type": "subslide" 251 | } 252 | }, 253 | "source": [ 254 | "These can be unified by considering episode termination to be the entering\n", 255 | "of a **special absorbing state** that **transitions only to itself** and that **generates only\n", 256 | "rewards of zero**. For example, consider the state transition diagram -\n", 257 | "
\"Matrix\"
\n", 258 | "Hence, return can be written as-\n", 259 | "
\"Matrix\"
" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": { 265 | "slideshow": { 266 | "slide_type": "subslide" 267 | } 268 | }, 269 | "source": [ 270 | "## State Transition Matrix\n", 271 | "
\"Matrix\"
" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": { 277 | "slideshow": { 278 | "slide_type": "subslide" 279 | } 280 | }, 281 | "source": [ 282 | "## Return\n", 283 | "
\"Matrix\"
\n", 284 | "* The discount $γ ∈ [0, 1]$ is the present value of future rewards\n", 285 | "* The value of receiving reward R after k + 1 time-steps is $γ^k R$.\n", 286 | "* This values immediate reward above delayed reward.\n", 287 | " * $γ$ close to 0 leads to ”myopic” evaluation\n", 288 | " * $γ$ close to 1 leads to ”far-sighted” evaluation" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": { 294 | "slideshow": { 295 | "slide_type": "subslide" 296 | } 297 | }, 298 | "source": [ 299 | "## Discount \n", 300 | "\n", 301 | "Most Markov reward and decision processes are discounted. Why?\n", 302 | "* Mathematically convenient to discount rewards\n", 303 | "* Avoids infinite returns in cyclic Markov processes\n", 304 | "* Uncertainty about the future may not be fully represented\n", 305 | "* If the reward is financial, immediate rewards may earn more interest than delayed rewards\n", 306 | "* Animal/human behaviour shows preference for immediate reward\n", 307 | "* It is sometimes possible to use undiscounted Markov reward processes (i.e. $γ = 1$), e.g. if all sequences terminate." 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": { 313 | "slideshow": { 314 | "slide_type": "subslide" 315 | } 316 | }, 317 | "source": [ 318 | "## Value Function\n", 319 | "The value function $v(s)$ gives the long-term value of state s\n", 320 | "
\"Matrix\"
\n" 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "metadata": { 326 | "slideshow": { 327 | "slide_type": "slide" 328 | } 329 | }, 330 | "source": [ 331 | "#
MDP Parameters
" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": { 337 | "slideshow": { 338 | "slide_type": "subslide" 339 | } 340 | }, 341 | "source": [ 342 | "A Markov decision process (MDP) is a Markov reward process with decisions. It is an environment in which all states are Markov.\n", 343 | "\n", 344 | "
\"Matrix\"
" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": { 350 | "slideshow": { 351 | "slide_type": "subslide" 352 | } 353 | }, 354 | "source": [ 355 | "## Policy in MDP notation\n", 356 | "
\"Matrix\"
\n", 357 | "* A policy fully defines the behaviour of an agent\n", 358 | "* MDP policies depend on the current state (not the history)\n", 359 | "* i.e. Policies are **stationary** (time-independent),\n", 360 | " $A_t ∼ π(·|S_t ), \\forall t > 0$\n", 361 | " " 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": { 367 | "slideshow": { 368 | "slide_type": "subslide" 369 | } 370 | }, 371 | "source": [ 372 | "## Policy in MDP notation\n", 373 | "Given a MDP $M = \\left \\langle S, A, P, R, \\gamma \\right \\rangle$ and a policy $\\pi$\n", 374 | "\n", 375 | "$$P_{s,s'}^{\\pi} = \\sum_{a \\epsilon A} \\pi(a|s) P_{ss'}^{a}$$\n", 376 | "$$R_{s}^{\\pi} = \\sum_{a \\epsilon A} \\pi(a|s) R_{s}^{a}$$" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": { 382 | "slideshow": { 383 | "slide_type": "subslide" 384 | } 385 | }, 386 | "source": [ 387 | "## Example: Recycling Robot\n", 388 | "\"Matrix\"\n", 389 | "** Task: ** Collect Empty soda cans in office\n", 390 | " \n", 391 | "** Sensors: **\n", 392 | " \n", 393 | " 1) Detector : For detecting cans\n", 394 | " 2) Arm + Gripper : To pick up and place can in onboard bin\n", 395 | " \n", 396 | "** Actions: **\n", 397 | " \n", 398 | " 1) {Search} - Actively search for a can\n", 399 | " 2) {Wait} - Remain stationary and wait for someone to bring a can. (Will lose less battery)\n", 400 | " 3) {Recharge} - Head back home for recharging\n", 401 | " \n", 402 | "** States: **\n", 403 | " \n", 404 | " 1) high - Battery is charged considerably well\n", 405 | " 2) low - Battery is not charged\n", 406 | " \n", 407 | "** Rewards: ** \n", 408 | " \n", 409 | " 1) zero most of the time\n", 410 | " 2) become positive when the robot secures an empty can, \n", 411 | " 3) negative if the battery runs all the way down\n", 412 | " \n", 413 | "**
How can we formulate this as an MDP?
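One possible way to write the formulation down, loosely following the treatment of this example in Sutton and Barto; the transition probabilities `alpha`, `beta` and the reward values below are illustrative placeholders, not values given in these slides.

```python
# Hypothetical parameters: alpha/beta are the probabilities that the battery
# stays high/low while searching; reward numbers are placeholders.
alpha, beta = 0.8, 0.6
r_search, r_wait, r_rescue = 2.0, 1.0, -3.0

# MDP as {state: {action: [(prob, next_state, reward), ...]}}
recycling_mdp = {
    "high": {
        "search": [(alpha, "high", r_search), (1 - alpha, "low", r_search)],
        "wait":   [(1.0, "high", r_wait)],
    },
    "low": {
        "search":   [(beta, "low", r_search), (1 - beta, "high", r_rescue)],
        "wait":     [(1.0, "low", r_wait)],
        "recharge": [(1.0, "high", 0.0)],
    },
}

# Sanity check: outgoing probabilities sum to 1 for every (state, action) pair.
for s, actions in recycling_mdp.items():
    for a, outcomes in actions.items():
        assert abs(sum(p for p, _, _ in outcomes) - 1.0) < 1e-9
```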
**" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": { 419 | "slideshow": { 420 | "slide_type": "subslide" 421 | } 422 | }, 423 | "source": [ 424 | "## Value Function in MDP notation\n", 425 | "
\"Matrix\"
\n" 426 | ] 427 | }, 428 | { 429 | "cell_type": "markdown", 430 | "metadata": { 431 | "slideshow": { 432 | "slide_type": "slide" 433 | } 434 | }, 435 | "source": [ 436 | "#
Bellman Expectation Equation
" 437 | ] 438 | }, 439 | { 440 | "cell_type": "markdown", 441 | "metadata": { 442 | "slideshow": { 443 | "slide_type": "subslide" 444 | } 445 | }, 446 | "source": [ 447 | "## Bellman Expectation Equation\n", 448 | "
\"Matrix\"
" 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "metadata": { 454 | "slideshow": { 455 | "slide_type": "subslide" 456 | } 457 | }, 458 | "source": [ 459 | "## Bellman Expectation Equation for $V^\\pi$\n", 460 | "
\"Matrix\"
" 461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "metadata": { 466 | "slideshow": { 467 | "slide_type": "subslide" 468 | } 469 | }, 470 | "source": [ 471 | "## Bellman Expectation Equation for $Q^\\pi$\n", 472 | "
\"Matrix\"
" 473 | ] 474 | }, 475 | { 476 | "cell_type": "markdown", 477 | "metadata": { 478 | "slideshow": { 479 | "slide_type": "subslide" 480 | } 481 | }, 482 | "source": [ 483 | "## Bellman Expectation Equation for $v_\\pi$\n", 484 | "
\"Matrix\"
" 485 | ] 486 | }, 487 | { 488 | "cell_type": "markdown", 489 | "metadata": { 490 | "slideshow": { 491 | "slide_type": "subslide" 492 | } 493 | }, 494 | "source": [ 495 | "## Bellman Expectation Equation for $q_\\pi$\n", 496 | "
\"Matrix\"
" 497 | ] 498 | }, 499 | { 500 | "cell_type": "markdown", 501 | "metadata": { 502 | "slideshow": { 503 | "slide_type": "slide" 504 | } 505 | }, 506 | "source": [ 507 | "#
Bellman Optimality Equation
" 508 | ] 509 | }, 510 | { 511 | "cell_type": "markdown", 512 | "metadata": { 513 | "slideshow": { 514 | "slide_type": "subslide" 515 | } 516 | }, 517 | "source": [ 518 | "## Optimal Value Function\n", 519 | "
\"Matrix\"
" 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "metadata": { 525 | "slideshow": { 526 | "slide_type": "subslide" 527 | } 528 | }, 529 | "source": [ 530 | "## Optimal Policy\n", 531 | "
\"Matrix\"
" 532 | ] 533 | }, 534 | { 535 | "cell_type": "markdown", 536 | "metadata": { 537 | "slideshow": { 538 | "slide_type": "subslide" 539 | } 540 | }, 541 | "source": [ 542 | "## Finding an Optimal Policy\n", 543 | "
\"Matrix\"
" 544 | ] 545 | }, 546 | { 547 | "cell_type": "markdown", 548 | "metadata": { 549 | "slideshow": { 550 | "slide_type": "subslide" 551 | } 552 | }, 553 | "source": [ 554 | "## Bellman Optimality Equation for $v_{*}$\n", 555 | "
\"Matrix\"
" 556 | ] 557 | }, 558 | { 559 | "cell_type": "markdown", 560 | "metadata": { 561 | "slideshow": { 562 | "slide_type": "subslide" 563 | } 564 | }, 565 | "source": [ 566 | "## Bellman Optimality Equation for $Q_{*}$\n", 567 | "
\"Matrix\"
" 568 | ] 569 | }, 570 | { 571 | "cell_type": "markdown", 572 | "metadata": { 573 | "slideshow": { 574 | "slide_type": "subslide" 575 | } 576 | }, 577 | "source": [ 578 | "## Bellman Optimality Equatin for $V^{*}$\n", 579 | "
\"Matrix\"
" 580 | ] 581 | }, 582 | { 583 | "cell_type": "markdown", 584 | "metadata": { 585 | "slideshow": { 586 | "slide_type": "subslide" 587 | } 588 | }, 589 | "source": [ 590 | "## Bellman Optimality Equation for $Q^{*}$\n", 591 | "
\"Matrix\"
" 592 | ] 593 | }, 594 | { 595 | "cell_type": "markdown", 596 | "metadata": { 597 | "slideshow": { 598 | "slide_type": "slide" 599 | } 600 | }, 601 | "source": [ 602 | "## Summary\n", 603 | "* We looked into the MDP formulation of a RL problem.\n", 604 | "* We looked into the formulation of Value functions.\n", 605 | " * action-value pairs\n", 606 | " * state-action pairs\n", 607 | "* Understood the motivation and necessity of Bellman Expectation Equations and Bellman Optimlality Equations" 608 | ] 609 | } 610 | ], 611 | "metadata": { 612 | "celltoolbar": "Slideshow", 613 | "kernelspec": { 614 | "display_name": "Python [conda root]", 615 | "language": "python", 616 | "name": "conda-root-py" 617 | }, 618 | "language_info": { 619 | "codemirror_mode": { 620 | "name": "ipython", 621 | "version": 3 622 | }, 623 | "file_extension": ".py", 624 | "mimetype": "text/x-python", 625 | "name": "python", 626 | "nbconvert_exporter": "python", 627 | "pygments_lexer": "ipython3", 628 | "version": "3.5.2" 629 | }, 630 | "widgets": { 631 | "state": {}, 632 | "version": "1.1.2" 633 | } 634 | }, 635 | "nbformat": 4, 636 | "nbformat_minor": 2 637 | } 638 | -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/1.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/2.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/3.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/4.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/5.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/6.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/61.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/61.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/7.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/7.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/agent_env.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/agent_env.PNG -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/b1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/b1.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/b2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/b2.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/b3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/b3.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/b4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/b4.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/b5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/b5.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/o.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/o.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/o1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/o1.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/o2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/o2.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/op1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/op1.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/op2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/op2.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/op3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/op3.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/op4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/op4.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/pic1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/pic1.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/pic2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/pic2.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/return.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/return.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/robot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/robot.jpg -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/slides_1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/slides_1.PNG -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/slides_10.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/slides_10.PNG -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/slides_2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/slides_2.PNG -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/slides_3.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/slides_3.PNG 
-------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/slides_4.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/slides_4.PNG -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/slides_5.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/slides_5.PNG -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/slides_6.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/slides_6.PNG -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/slides_7.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/slides_7.PNG -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/slides_8.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/slides_8.PNG -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/slides_9.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/slides_9.PNG -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/unified.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/unified.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/readme.md: -------------------------------------------------------------------------------- 1 | 2 | # Dynamic programming Methods 3 | 4 | ### Reference: Chapter 3, Sutton and Barto 5 | 6 | ## Contents: 7 | 8 | 1) **Introduction** 9 | 10 | 2) **Building Blocks of MDP** 11 | * Markov Property 12 | * State Transition Matrix 13 | * Return 14 | * Discount 15 | * Value Function 16 | 17 | 3) **MDP Parameters** 18 | * Policy in MDP Notations 19 | * Value Functions in MDP notations 20 | 21 | 4) **Bellman Expectation Equations** 22 | 23 | 5) **Bellman Optimality Equations** 24 | -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/.ipynb_checkpoints/readme-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 2 6 | } 7 | -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/a.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/a.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/aaa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/aaa.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/aaaa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/aaaa.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/async_1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/async_1.PNG -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/async_2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/async_2.PNG -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/async_3.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/async_3.PNG -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/async_ex_1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/async_ex_1.PNG -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/async_ex_2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/async_ex_2.PNG -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/async_ex_3.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/async_ex_3.PNG -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/async_ex_4.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/async_ex_4.PNG -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/b.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/b.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/b1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/b1.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/b2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/b2.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/contr_1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/contr_1.PNG -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/contr_2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/contr_2.PNG -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/contr_3.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/contr_3.PNG -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/d.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/dp_ex_1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/dp_ex_1.PNG -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/e.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/e.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/e1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/e1.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/e2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/e2.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/e3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/e3.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/p1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/p1.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/p2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/p2.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/s1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/s1.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/s11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/s11.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/s2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/s2.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/s3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/s3.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/sa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/sa.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/sb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/sb.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/v1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/v1.png 
-------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/v2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/v2.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/v3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/v3.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/v4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/v4.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/v5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/v5.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/readme.md: -------------------------------------------------------------------------------- 1 | 2 | # Dynamic programming Methods 3 | 4 | ### Reference: Chapter 4, Sutton and Barto 5 | 6 | ## Contents: 7 | 8 | 1) **Recap:What is Dynamic Programming?** 9 | 10 | 2) **Planning by DP in MDP** 11 | * Iterative Policy Evaluation 12 | * Policy Improvement 13 | 14 | 3) **Example: Gridworld (Policy Evaluation and Policy Improvment)** 15 | 16 | 4) **Control** 17 | * Policy Iteration 18 | * Value Iteration 19 | 20 | 5) **Synchronous/Asynchronous Dynamic Programming Algorithms** 21 | 22 | 6) **Full-Width Backups/Sample Backups** 23 | * In-place Dynamic Programming 24 | * Prioritised Sweeping 25 | * Real-Time Dynamic Programming 26 | 27 | -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/.ipynb_checkpoints/readme-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 2 6 | } 7 | -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/1.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/2.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/3.png 
-------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/4.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/5.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/6.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/a1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/a1.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/a2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/a2.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/a3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/a3.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/a4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/a4.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/a5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/a5.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/a6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/a6.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/a7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/a7.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/c1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/c1.png 
-------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/c2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/c2.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/c3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/c3.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/c4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/c4.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/c5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/c5.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/c6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/c6.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/c7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/c7.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/imp_sam_1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/imp_sam_1.PNG -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/imp_sam_2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/imp_sam_2.PNG -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/imp_sam_3.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/imp_sam_3.PNG -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/imp_sam_4.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/imp_sam_4.PNG -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/imp_sam_5.PNG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/imp_sam_5.PNG -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/readme.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Monte-Carlo Methods\n", 8 | "\n", 9 | "### Reference: Chapter 5, Sutton and Barto\n", 10 | "\n", 11 | "## Contents:\n", 12 | "\n", 13 | "1) **Monte Carlo Prediction**\n", 14 | "\n", 15 | "2) **Monte Carlo Estimation of Action Values**\n", 16 | "\n", 17 | "3) **Monte Carlo Control**\n", 18 | "\n", 19 | "4) **Monte Carlo Control without Exploring Starts**\n", 20 | "\n", 21 | "5) **Off-policy Prediction via Importance Sampling**\n", 22 | "\n", 23 | "6) **Incremental Implementation**\n", 24 | "\n", 25 | "7) **Off-Policy Monte Carlo Control**\n", 26 | "\n", 27 | "8) **Return-Specific Importance Sampling**\n", 28 | "\n", 29 | "\n", 30 | "\n", 31 | "## Summary\n", 32 | "\n", 33 | "* Monte Carlo Approach\n", 34 | "\n", 35 | "* On-Policy Prediction\n", 36 | "\n", 37 | "* On-Policy Control (w/o Assumption: Exploring Starts)\n", 38 | "\n", 39 | "* Off-Policy Prediction\n", 40 | "\n", 41 | "* Off-Policy Control\n" 42 | ] 43 | } 44 | ], 45 | "metadata": { 46 | "kernelspec": { 47 | "display_name": "Python 2", 48 | "language": "python", 49 | "name": "python2" 50 | }, 51 | "language_info": { 52 | "codemirror_mode": { 53 | "name": "ipython", 54 | "version": 2 55 | }, 56 | "file_extension": ".py", 57 | "mimetype": "text/x-python", 58 | "name": "python", 59 | "nbconvert_exporter": "python", 60 | "pygments_lexer": "ipython2", 61 | "version": "2.7.12" 62 | } 63 | }, 64 | "nbformat": 4, 65 | "nbformat_minor": 2 66 | } 67 | -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/readme.md: -------------------------------------------------------------------------------- 1 | 2 | # Monte-Carlo Methods 3 | 4 | ### Reference: Chapter 5, Sutton and Barto 5 | 6 | ## Contents: 7 | 8 | 1) **Introduction** 9 | * Why move to MCMC from DP? 10 | 11 | 2) **Monte Carlo Prediction (On-Policy)** 12 | * Problem Definition 13 | * First-Visit Monte Carlo Policy Evaluation 14 | * Every-Visit Monte Carlo Evaluation 15 | * Non-Stationary Evaluation 16 | * Some Important Facts 17 | 18 | 3) **Monte Carlo Estimation of Action Values** 19 | * Why compute stat-action q(s,a) values, instead of v(s)? 
20 | 21 | 4) **Monte Carlo Control (On-Policy)** 22 | * Building up on Generalised Policy Iteration 23 | * Issues with the method 24 | * Assuming Infinite Number of Episodes 25 | * Problem of "maintaining exploration" 26 | * Dealing with the Issues 27 | 28 | 5) **On-Policy vs Off-Policy Methods** 29 | 30 | 6) **Off-Policy MC Predction** 31 | * Problem Definition 32 | * Importance Sampling 33 | 34 | 7) **Off-Policy MC Control** 35 | 36 | 7) **Off-Policy Monte Carlo Control** 37 | 38 | 8) **Return-Specific Importance Sampling** 39 | -------------------------------------------------------------------------------- /ch_6_td_methods/img/10_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_6_td_methods/img/10_2.png -------------------------------------------------------------------------------- /ch_6_td_methods/img/DPback.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_6_td_methods/img/DPback.JPG -------------------------------------------------------------------------------- /ch_6_td_methods/img/MCback.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_6_td_methods/img/MCback.JPG -------------------------------------------------------------------------------- /ch_6_td_methods/img/TDback.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_6_td_methods/img/TDback.JPG -------------------------------------------------------------------------------- /ch_6_td_methods/img/TDex1.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_6_td_methods/img/TDex1.JPG -------------------------------------------------------------------------------- /ch_6_td_methods/img/backup_q.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_6_td_methods/img/backup_q.JPG -------------------------------------------------------------------------------- /ch_6_td_methods/img/batch_td.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_6_td_methods/img/batch_td.PNG -------------------------------------------------------------------------------- /ch_6_td_methods/img/bootsam.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_6_td_methods/img/bootsam.JPG -------------------------------------------------------------------------------- /ch_6_td_methods/img/cliff.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_6_td_methods/img/cliff.jpg -------------------------------------------------------------------------------- 
/ch_6_td_methods/img/doubleq.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_6_td_methods/img/doubleq.JPG -------------------------------------------------------------------------------- /ch_6_td_methods/img/ex_sarsa.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_6_td_methods/img/ex_sarsa.JPG -------------------------------------------------------------------------------- /ch_6_td_methods/img/maxbias.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_6_td_methods/img/maxbias.JPG -------------------------------------------------------------------------------- /ch_6_td_methods/img/mcvstd.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_6_td_methods/img/mcvstd.PNG -------------------------------------------------------------------------------- /ch_6_td_methods/img/mcvstd_2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_6_td_methods/img/mcvstd_2.PNG -------------------------------------------------------------------------------- /ch_6_td_methods/img/qvssarsa.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_6_td_methods/img/qvssarsa.JPG -------------------------------------------------------------------------------- /ch_6_td_methods/img/tdex2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_6_td_methods/img/tdex2.JPG -------------------------------------------------------------------------------- /ch_6_td_methods/img/tdmarkov.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_6_td_methods/img/tdmarkov.jpg -------------------------------------------------------------------------------- /ch_6_td_methods/readme.md: -------------------------------------------------------------------------------- 1 | #
Temporal Difference Methods
2 | 3 | ####
Reference: Chapter 6, Sutton and Barto
4 | 5 | ## Contents: 6 | 7 | 1) **Introduction to TD methods** 8 | 9 | 2) **Properties of Temporal Difference Methods** 10 | * TD Update 11 | * TD Error 12 | * MC vs TD 13 | * Examples 14 | 15 | 3) **One-step, tabular, model-free TD methods** 16 | * SARSA 17 | * Q-Learning 18 | * Expected SARSA 19 | * Double Q-Learning 20 | -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/.ipynb_checkpoints/RL_7-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "#
Unifying MC Methods and TD Methods
\n", 12 | "##
Bootstrapping, TD($\\lambda$) and Eligibility Traces
\n", 13 | "###
Reference: Chapter 7 and Chapter 12, Sutton and Barto
" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "slideshow": { 20 | "slide_type": "slide" 21 | } 22 | }, 23 | "source": [ 24 | "## Contents\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "slideshow": { 31 | "slide_type": "fragment" 32 | } 33 | }, 34 | "source": [ 35 | "* Bootstrapping" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": { 41 | "slideshow": { 42 | "slide_type": "fragment" 43 | } 44 | }, 45 | "source": [ 46 | "* TD ($\\lambda$)" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": { 52 | "slideshow": { 53 | "slide_type": "fragment" 54 | } 55 | }, 56 | "source": [ 57 | "* Eligibility Traces" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": { 63 | "slideshow": { 64 | "slide_type": "slide" 65 | } 66 | }, 67 | "source": [ 68 | "##
Bootstrapping
" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "slideshow": { 75 | "slide_type": "subslide" 76 | } 77 | }, 78 | "source": [ 79 | "## Bootstrapping and Sampling\n", 80 | "
\"RewardHypothesis\"
" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": { 86 | "slideshow": { 87 | "slide_type": "subslide" 88 | } 89 | }, 90 | "source": [ 91 | "## Why do Bootstrapping?\n", 92 | "\n", 93 | "* Free from tyranny of the time step\n", 94 | "* Sometimes updates are required at every step of transition (one-step TD)\n", 95 | " * Take into account every possible transition/ anything that has changed\n", 96 | "* Sometimes, it makes sense to only update every few stansitions (multi-step TD)\n", 97 | " * Take into account significant/considerable changes" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": { 103 | "slideshow": { 104 | "slide_type": "subslide" 105 | } 106 | }, 107 | "source": [ 108 | "## n-Step Prediction\n", 109 | "
\"RewardHypothesis\"
" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": { 115 | "slideshow": { 116 | "slide_type": "subslide" 117 | } 118 | }, 119 | "source": [ 120 | "## n-Step Return\n", 121 | "
\"RewardHypothesis\"
" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": { 127 | "slideshow": { 128 | "slide_type": "subslide" 129 | } 130 | }, 131 | "source": [ 132 | "## n-Step Prediction\n", 133 | "\n", 134 | "
\"RewardHypothesis\"
" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": { 140 | "slideshow": { 141 | "slide_type": "subslide" 142 | } 143 | }, 144 | "source": [ 145 | "## n-Step SARSA (On-Policy Control)\n", 146 | "\n", 147 | "* simply switch states for actions (state–action pairs) and then use an $\\epsilon$-greedy policy.\n", 148 | "* The n-step returns in terms of estimated action values:\n", 149 | "
\"RewardHypothesis\"
\n", 150 | "* Update made to a particular value of action-pair is as follows:\n", 151 | "
\"RewardHypothesis\"
\n" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": { 157 | "slideshow": { 158 | "slide_type": "subslide" 159 | } 160 | }, 161 | "source": [ 162 | "## n-Step SARSA (On-policy Control)\n", 163 | "\n", 164 | "
\"RewardHypothesis\"
" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": { 170 | "slideshow": { 171 | "slide_type": "subslide" 172 | } 173 | }, 174 | "source": [ 175 | "# n-Step SARSA Example\n", 176 | "\n", 177 | "
\"RewardHypothesis\"
" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": { 183 | "slideshow": { 184 | "slide_type": "skip" 185 | } 186 | }, 187 | "source": [ 188 | "## n-Step SARSA (On-policy Control)\n", 189 | "\n", 190 | "
\"RewardHypothesis\"
" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": { 196 | "slideshow": { 197 | "slide_type": "subslide" 198 | } 199 | }, 200 | "source": [ 201 | "## n-Step Off-Policy Control (with Importance Sampling)\n", 202 | "\n", 203 | "* learning the value function for one policy, π, while following another policy, μ\n", 204 | "* Often, π is the greedy policy for the current action-value-function estimate, and μ is a more exploratory policy, perhaps ε-greedy\n", 205 | "* we must take into account the difference between the two policies, using their relative probability of taking the actions that were taken\n", 206 | "* To measure this difference, we use the importance sampling ratio. \n", 207 | "* Only difference, that instead of measuring it for the entire episode, we measure it for n-steps. " 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": { 213 | "slideshow": { 214 | "slide_type": "subslide" 215 | } 216 | }, 217 | "source": [ 218 | "* The importance sampling ratio looks liks:\n", 219 | "
\"RewardHypothesis\"
\n", 220 | "* The update Equatin looks like this:\n", 221 | "
\"RewardHypothesis\"
\n", 222 | "\n", 223 | "
\"RewardHypothesis\"
\n" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": { 229 | "slideshow": { 230 | "slide_type": "skip" 231 | } 232 | }, 233 | "source": [ 234 | "## n-Step Off-Policy Control (with Importance Sampling)\n", 235 | "
\"RewardHypothesis\"
" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": { 241 | "slideshow": { 242 | "slide_type": "subslide" 243 | } 244 | }, 245 | "source": [ 246 | "## Off-Policy Control (w/o Importance Sampling => Tree BackUp Algorithm)\n", 247 | "* This backup is an alternating mix of sample transitions—from each action to the su bsequent state—and full backups—from each state we consider all the possible actions, their probability of occuring under π, and their action values.\n", 248 | "
\"RewardHypothesis\"
\n" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": { 254 | "slideshow": { 255 | "slide_type": "subslide" 256 | } 257 | }, 258 | "source": [ 259 | "* Returns and updates are calculated as follows:\n", 260 | "
\"RewardHypothesis\"
\n", 261 | "
\"RewardHypothesis\"
" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": { 267 | "slideshow": { 268 | "slide_type": "subslide" 269 | } 270 | }, 271 | "source": [ 272 | "## Unified view of Reinforcement Learning\n", 273 | "
\"RewardHypothesis\"
" 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": { 279 | "slideshow": { 280 | "slide_type": "slide" 281 | } 282 | }, 283 | "source": [ 284 | "##
TD($\\lambda$)
" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": { 290 | "slideshow": { 291 | "slide_type": "subslide" 292 | } 293 | }, 294 | "source": [ 295 | "## Averaging n-Step Returns\n", 296 | "
\"RewardHypothesis\"
" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": { 302 | "slideshow": { 303 | "slide_type": "subslide" 304 | } 305 | }, 306 | "source": [ 307 | "## $\\lambda$ Returns\n", 308 | "
\"RewardHypothesis\"
" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": { 314 | "slideshow": { 315 | "slide_type": "subslide" 316 | } 317 | }, 318 | "source": [ 319 | "## TD(λ) Weighting Function\n", 320 | "
\"RewardHypothesis\"
" 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "metadata": { 326 | "slideshow": { 327 | "slide_type": "subslide" 328 | } 329 | }, 330 | "source": [ 331 | "## Forward-view TD(λ)\n", 332 | "
\"RewardHypothesis\"
" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": { 338 | "slideshow": { 339 | "slide_type": "subslide" 340 | } 341 | }, 342 | "source": [ 343 | "## Bckward-view TD(λ)\n", 344 | "
\"RewardHypothesis\"
" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": { 350 | "collapsed": true, 351 | "slideshow": { 352 | "slide_type": "slide" 353 | } 354 | }, 355 | "source": [ 356 | "##
Eligibility Traces
" 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": { 362 | "slideshow": { 363 | "slide_type": "subslide" 364 | } 365 | }, 366 | "source": [ 367 | "## Eligibility Traces" 368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "metadata": { 373 | "slideshow": { 374 | "slide_type": "fragment" 375 | } 376 | }, 377 | "source": [ 378 | "\n", 379 | "
\"RewardHypothesis\"
" 380 | ] 381 | }, 382 | { 383 | "cell_type": "markdown", 384 | "metadata": { 385 | "slideshow": { 386 | "slide_type": "fragment" 387 | } 388 | }, 389 | "source": [ 390 | "\n", 391 | "
\"RewardHypothesis\"
" 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "metadata": { 397 | "slideshow": { 398 | "slide_type": "subslide" 399 | } 400 | }, 401 | "source": [ 402 | "## Backward View TD(λ)\n", 403 | "
\"RewardHypothesis\"
" 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "metadata": { 409 | "slideshow": { 410 | "slide_type": "subslide" 411 | } 412 | }, 413 | "source": [ 414 | "## TD(λ) and TD(0)\n", 415 | "
\"RewardHypothesis\"
" 416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "metadata": { 421 | "slideshow": { 422 | "slide_type": "subslide" 423 | } 424 | }, 425 | "source": [ 426 | "## TD(λ) and MC\n", 427 | "
\"RewardHypothesis\"
" 428 | ] 429 | }, 430 | { 431 | "cell_type": "markdown", 432 | "metadata": { 433 | "slideshow": { 434 | "slide_type": "subslide" 435 | } 436 | }, 437 | "source": [ 438 | "## MC and TD(1)\n", 439 | "
\"RewardHypothesis\"
\n", 440 | "\n" 441 | ] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": { 446 | "slideshow": { 447 | "slide_type": "subslide" 448 | } 449 | }, 450 | "source": [ 451 | "## Telescoping in TD(1)\n", 452 | "
\"RewardHypothesis\"
" 453 | ] 454 | }, 455 | { 456 | "cell_type": "markdown", 457 | "metadata": { 458 | "slideshow": { 459 | "slide_type": "subslide" 460 | } 461 | }, 462 | "source": [ 463 | "## TD(λ) and TD(1)\n", 464 | "
\"RewardHypothesis\"
" 465 | ] 466 | }, 467 | { 468 | "cell_type": "markdown", 469 | "metadata": { 470 | "slideshow": { 471 | "slide_type": "subslide" 472 | } 473 | }, 474 | "source": [ 475 | "## Telescoping in TD(λ)\n", 476 | "
\"RewardHypothesis\"
" 477 | ] 478 | }, 479 | { 480 | "cell_type": "markdown", 481 | "metadata": { 482 | "slideshow": { 483 | "slide_type": "subslide" 484 | } 485 | }, 486 | "source": [ 487 | "## Forwards and Backwards TD(λ)\n", 488 | "
\"RewardHypothesis\"
" 489 | ] 490 | }, 491 | { 492 | "cell_type": "markdown", 493 | "metadata": { 494 | "slideshow": { 495 | "slide_type": "subslide" 496 | } 497 | }, 498 | "source": [ 499 | "## Offline Equivalence of Forward and Backward TD\n", 500 | "
\"RewardHypothesis\"
\n" 501 | ] 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "metadata": { 506 | "slideshow": { 507 | "slide_type": "subslide" 508 | } 509 | }, 510 | "source": [ 511 | "## Online Equivalence of Forward and Backward TD\n", 512 | "
\"RewardHypothesis\"
" 513 | ] 514 | }, 515 | { 516 | "cell_type": "markdown", 517 | "metadata": { 518 | "slideshow": { 519 | "slide_type": "subslide" 520 | } 521 | }, 522 | "source": [ 523 | "## Summary of Forward and Backward TD(λ)\n", 524 | "
\"RewardHypothesis\"
" 525 | ] 526 | } 527 | ], 528 | "metadata": { 529 | "anaconda-cloud": {}, 530 | "celltoolbar": "Slideshow", 531 | "kernelspec": { 532 | "display_name": "Python [conda root]", 533 | "language": "python", 534 | "name": "conda-root-py" 535 | }, 536 | "language_info": { 537 | "codemirror_mode": { 538 | "name": "ipython", 539 | "version": 3 540 | }, 541 | "file_extension": ".py", 542 | "mimetype": "text/x-python", 543 | "name": "python", 544 | "nbconvert_exporter": "python", 545 | "pygments_lexer": "ipython3", 546 | "version": "3.5.2" 547 | }, 548 | "widgets": { 549 | "state": {}, 550 | "version": "1.1.2" 551 | } 552 | }, 553 | "nbformat": 4, 554 | "nbformat_minor": 2 555 | } 556 | -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/RL_7.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "#
Unifying MC Methods and TD Methods
\n", 12 | "##
Bootstrapping, TD($\\lambda$) and Eligibility Traces
\n", 13 | "###
Reference: Chapter 7 and Chapter 12, Sutton and Barto
" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "slideshow": { 20 | "slide_type": "slide" 21 | } 22 | }, 23 | "source": [ 24 | "## Contents\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "slideshow": { 31 | "slide_type": "fragment" 32 | } 33 | }, 34 | "source": [ 35 | "* Bootstrapping" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": { 41 | "slideshow": { 42 | "slide_type": "fragment" 43 | } 44 | }, 45 | "source": [ 46 | "* TD ($\\lambda$)" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": { 52 | "slideshow": { 53 | "slide_type": "fragment" 54 | } 55 | }, 56 | "source": [ 57 | "* Eligibility Traces" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": { 63 | "slideshow": { 64 | "slide_type": "slide" 65 | } 66 | }, 67 | "source": [ 68 | "##
Bootstrapping
" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "slideshow": { 75 | "slide_type": "subslide" 76 | } 77 | }, 78 | "source": [ 79 | "## Bootstrapping and Sampling\n", 80 | "
\"RewardHypothesis\"
" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": { 86 | "slideshow": { 87 | "slide_type": "subslide" 88 | } 89 | }, 90 | "source": [ 91 | "## Why do Bootstrapping?\n", 92 | "\n", 93 | "* Free from tyranny of the time step\n", 94 | "* Sometimes updates are required at every step of transition (one-step TD)\n", 95 | " * Take into account every possible transition/ anything that has changed\n", 96 | "* Sometimes, it makes sense to only update every few stansitions (multi-step TD)\n", 97 | " * Take into account significant/considerable changes" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": { 103 | "slideshow": { 104 | "slide_type": "subslide" 105 | } 106 | }, 107 | "source": [ 108 | "## n-Step Prediction\n", 109 | "
\"RewardHypothesis\"
" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": { 115 | "slideshow": { 116 | "slide_type": "subslide" 117 | } 118 | }, 119 | "source": [ 120 | "## n-Step Return\n", 121 | "
\"RewardHypothesis\"
" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": { 127 | "slideshow": { 128 | "slide_type": "subslide" 129 | } 130 | }, 131 | "source": [ 132 | "## n-Step Prediction\n", 133 | "\n", 134 | "
\"RewardHypothesis\"
" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": { 140 | "slideshow": { 141 | "slide_type": "subslide" 142 | } 143 | }, 144 | "source": [ 145 | "## n-Step SARSA (On-Policy Control)\n", 146 | "\n", 147 | "* simply switch states for actions (state–action pairs) and then use an $\\epsilon$-greedy policy.\n", 148 | "* The n-step returns in terms of estimated action values:\n", 149 | "
\"RewardHypothesis\"
\n", 150 | "* Update made to a particular value of action-pair is as follows:\n", 151 | "
\"RewardHypothesis\"
\n" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": { 157 | "slideshow": { 158 | "slide_type": "subslide" 159 | } 160 | }, 161 | "source": [ 162 | "## n-Step SARSA (On-policy Control)\n", 163 | "\n", 164 | "
\"RewardHypothesis\"
" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": { 170 | "slideshow": { 171 | "slide_type": "subslide" 172 | } 173 | }, 174 | "source": [ 175 | "# n-Step SARSA Example\n", 176 | "\n", 177 | "
\"RewardHypothesis\"
" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": { 183 | "slideshow": { 184 | "slide_type": "skip" 185 | } 186 | }, 187 | "source": [ 188 | "## n-Step SARSA (On-policy Control)\n", 189 | "\n", 190 | "
\"RewardHypothesis\"
" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": { 196 | "slideshow": { 197 | "slide_type": "subslide" 198 | } 199 | }, 200 | "source": [ 201 | "## n-Step Off-Policy Control (with Importance Sampling)\n", 202 | "\n", 203 | "* learning the value function for one policy, π, while following another policy, μ\n", 204 | "* Often, π is the greedy policy for the current action-value-function estimate, and μ is a more exploratory policy, perhaps ε-greedy\n", 205 | "* we must take into account the difference between the two policies, using their relative probability of taking the actions that were taken\n", 206 | "* To measure this difference, we use the importance sampling ratio. \n", 207 | "* Only difference, that instead of measuring it for the entire episode, we measure it for n-steps. " 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": { 213 | "slideshow": { 214 | "slide_type": "subslide" 215 | } 216 | }, 217 | "source": [ 218 | "* The importance sampling ratio looks liks:\n", 219 | "
\"RewardHypothesis\"
\n", 220 | "* The update Equatin looks like this:\n", 221 | "
\"RewardHypothesis\"
\n", 222 | "\n", 223 | "
\"RewardHypothesis\"
\n" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": { 229 | "slideshow": { 230 | "slide_type": "skip" 231 | } 232 | }, 233 | "source": [ 234 | "## n-Step Off-Policy Control (with Importance Sampling)\n", 235 | "
\"RewardHypothesis\"
" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": { 241 | "slideshow": { 242 | "slide_type": "subslide" 243 | } 244 | }, 245 | "source": [ 246 | "## Off-Policy Control (w/o Importance Sampling => Tree BackUp Algorithm)\n", 247 | "* This backup is an alternating mix of sample transitions—from each action to the su bsequent state—and full backups—from each state we consider all the possible actions, their probability of occuring under π, and their action values.\n", 248 | "
\"RewardHypothesis\"
\n" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": { 254 | "slideshow": { 255 | "slide_type": "subslide" 256 | } 257 | }, 258 | "source": [ 259 | "* Returns and updates are calculated as follows:\n", 260 | "
\"RewardHypothesis\"
\n", 261 | "
\"RewardHypothesis\"
" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": { 267 | "slideshow": { 268 | "slide_type": "subslide" 269 | } 270 | }, 271 | "source": [ 272 | "## Unified view of Reinforcement Learning\n", 273 | "
\"RewardHypothesis\"
" 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": { 279 | "slideshow": { 280 | "slide_type": "slide" 281 | } 282 | }, 283 | "source": [ 284 | "##
TD($\\lambda$)
" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": { 290 | "slideshow": { 291 | "slide_type": "subslide" 292 | } 293 | }, 294 | "source": [ 295 | "## Averaging n-Step Returns\n", 296 | "
\"RewardHypothesis\"
" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": { 302 | "slideshow": { 303 | "slide_type": "subslide" 304 | } 305 | }, 306 | "source": [ 307 | "## $\\lambda$ Returns\n", 308 | "
\"RewardHypothesis\"
" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": { 314 | "slideshow": { 315 | "slide_type": "subslide" 316 | } 317 | }, 318 | "source": [ 319 | "## TD(λ) Weighting Function\n", 320 | "
\"RewardHypothesis\"
" 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "metadata": { 326 | "slideshow": { 327 | "slide_type": "subslide" 328 | } 329 | }, 330 | "source": [ 331 | "## Forward-view TD(λ)\n", 332 | "
\"RewardHypothesis\"
" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": { 338 | "slideshow": { 339 | "slide_type": "subslide" 340 | } 341 | }, 342 | "source": [ 343 | "## Bckward-view TD(λ)\n", 344 | "
\"RewardHypothesis\"
" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": { 350 | "collapsed": true, 351 | "slideshow": { 352 | "slide_type": "slide" 353 | } 354 | }, 355 | "source": [ 356 | "##
Eligibility Traces
" 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": { 362 | "slideshow": { 363 | "slide_type": "subslide" 364 | } 365 | }, 366 | "source": [ 367 | "## Eligibility Traces" 368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "metadata": { 373 | "slideshow": { 374 | "slide_type": "fragment" 375 | } 376 | }, 377 | "source": [ 378 | "\n", 379 | "
\"RewardHypothesis\"
" 380 | ] 381 | }, 382 | { 383 | "cell_type": "markdown", 384 | "metadata": { 385 | "slideshow": { 386 | "slide_type": "fragment" 387 | } 388 | }, 389 | "source": [ 390 | "\n", 391 | "
\"RewardHypothesis\"
" 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "metadata": { 397 | "slideshow": { 398 | "slide_type": "subslide" 399 | } 400 | }, 401 | "source": [ 402 | "## Backward View TD(λ)\n", 403 | "
\"RewardHypothesis\"
" 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "metadata": { 409 | "slideshow": { 410 | "slide_type": "subslide" 411 | } 412 | }, 413 | "source": [ 414 | "## TD(λ) and TD(0)\n", 415 | "
\"RewardHypothesis\"
" 416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "metadata": { 421 | "slideshow": { 422 | "slide_type": "subslide" 423 | } 424 | }, 425 | "source": [ 426 | "## TD(λ) and MC\n", 427 | "
\"RewardHypothesis\"
" 428 | ] 429 | }, 430 | { 431 | "cell_type": "markdown", 432 | "metadata": { 433 | "slideshow": { 434 | "slide_type": "subslide" 435 | } 436 | }, 437 | "source": [ 438 | "## MC and TD(1)\n", 439 | "
\"RewardHypothesis\"
\n", 440 | "\n" 441 | ] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": { 446 | "slideshow": { 447 | "slide_type": "subslide" 448 | } 449 | }, 450 | "source": [ 451 | "## Telescoping in TD(1)\n", 452 | "
\"RewardHypothesis\"
" 453 | ] 454 | }, 455 | { 456 | "cell_type": "markdown", 457 | "metadata": { 458 | "slideshow": { 459 | "slide_type": "subslide" 460 | } 461 | }, 462 | "source": [ 463 | "## TD(λ) and TD(1)\n", 464 | "
\"RewardHypothesis\"
" 465 | ] 466 | }, 467 | { 468 | "cell_type": "markdown", 469 | "metadata": { 470 | "slideshow": { 471 | "slide_type": "subslide" 472 | } 473 | }, 474 | "source": [ 475 | "## Telescoping in TD(λ)\n", 476 | "
\"RewardHypothesis\"
" 477 | ] 478 | }, 479 | { 480 | "cell_type": "markdown", 481 | "metadata": { 482 | "slideshow": { 483 | "slide_type": "subslide" 484 | } 485 | }, 486 | "source": [ 487 | "## Forwards and Backwards TD(λ)\n", 488 | "
\"RewardHypothesis\"
" 489 | ] 490 | }, 491 | { 492 | "cell_type": "markdown", 493 | "metadata": { 494 | "slideshow": { 495 | "slide_type": "subslide" 496 | } 497 | }, 498 | "source": [ 499 | "## Offline Equivalence of Forward and Backward TD\n", 500 | "
\"RewardHypothesis\"
\n" 501 | ] 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "metadata": { 506 | "slideshow": { 507 | "slide_type": "subslide" 508 | } 509 | }, 510 | "source": [ 511 | "## Online Equivalence of Forward and Backward TD\n", 512 | "
\"RewardHypothesis\"
" 513 | ] 514 | }, 515 | { 516 | "cell_type": "markdown", 517 | "metadata": { 518 | "slideshow": { 519 | "slide_type": "subslide" 520 | } 521 | }, 522 | "source": [ 523 | "## Summary of Forward and Backward TD(λ)\n", 524 | "
\"RewardHypothesis\"
" 525 | ] 526 | } 527 | ], 528 | "metadata": { 529 | "anaconda-cloud": {}, 530 | "celltoolbar": "Slideshow", 531 | "kernelspec": { 532 | "display_name": "Python [conda root]", 533 | "language": "python", 534 | "name": "conda-root-py" 535 | }, 536 | "language_info": { 537 | "codemirror_mode": { 538 | "name": "ipython", 539 | "version": 3 540 | }, 541 | "file_extension": ".py", 542 | "mimetype": "text/x-python", 543 | "name": "python", 544 | "nbconvert_exporter": "python", 545 | "pygments_lexer": "ipython3", 546 | "version": "3.5.2" 547 | }, 548 | "widgets": { 549 | "state": {}, 550 | "version": "1.1.2" 551 | } 552 | }, 553 | "nbformat": 4, 554 | "nbformat_minor": 2 555 | } 556 | -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/1.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/10.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/10_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/10_1.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/10_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/10_2.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/11.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/12.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/13.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/14.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/15.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/15.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/16.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/17.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/18.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/18.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/19.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/19.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/2.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/20.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/21.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/21.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/3.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/4.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/5.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/5.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/6.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/7.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/8.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/9.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/bp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/bp.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/eg1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/eg1.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/eq1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/eq1.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/eq2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/eq2.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/importancesampling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/importancesampling.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/nStepOnline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/nStepOnline.png 
-------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/prediction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/prediction.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/return.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/return.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/return1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/return1.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/sc1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/sc1.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/sc2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/sc2.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/update.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/update.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/update1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/update1.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/readme.md: -------------------------------------------------------------------------------- 1 | #
Unifying MC and TD Methods
2 | 3 | ####
Reference: Chapter 7 and Chapter 12, Sutton and Barto
4 | 5 | ## Contents: 6 | 7 | 1) **Bootstrapping** 8 | * Why do Bootstrapping? 9 | * n-Step SARSA (On-Policy Control) 10 | * n-Step Off-Policy Control (with Importance Sampling) 11 | * N-Step Off-Policy Control (w/o Importance Sampling => Tree BackUp Algorithm) 12 | 13 | 2) **TD($\lambda$)** 14 | * Forward View 15 | * Backward View 16 | 17 | 3) **Eligibility Traces** 18 | -------------------------------------------------------------------------------- /ch_8_model_based/img/dyna.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/dyna.jpeg -------------------------------------------------------------------------------- /ch_8_model_based/img/dyna_eq.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/dyna_eq.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/dyna_perf.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/dyna_perf.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/dyna_perf2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/dyna_perf2.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/dynaenvchange.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/dynaenvchange.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/dynaenvchange2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/dynaenvchange2.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/mcts0.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/mcts0.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/mcts1.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/mcts1.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/mcts2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/mcts2.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/mcts3.JPG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/mcts3.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/mcts4.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/mcts4.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/mcts5.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/mcts5.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/mctssearch1.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/mctssearch1.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/mctssearch2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/mctssearch2.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/mctssteps.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/mctssteps.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/modelbasedplanning.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/modelbasedplanning.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/psweep.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/psweep.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/psweep_ex.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/psweep_ex.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/simmontesearch.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/simmontesearch.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/sslearning.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/sslearning.JPG -------------------------------------------------------------------------------- 
/ch_8_model_based/readme.md: -------------------------------------------------------------------------------- 1 | #
Planning and Learning with Tabular Methods
2 | 3 | ####
Reference: Chapter 8, Sutton and Barto
4 | 5 | ## Contents: 6 | 7 | 1) **Introduction** 8 | 9 | 2) **Model Based RL** 10 | 11 | 3) **Dyna: Integrating Planning, Acting and Learning** 12 | 13 | 4) **Prioritizing Sweeps** 14 | 15 | 5) **Planning as a part of Action Selection (Monte Carlo Tree Search)** 16 | -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_avg1.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_avg1.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_avg2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_avg2.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_avg3.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_avg3.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_avg4.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_avg4.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_avg5.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_avg5.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_avg6.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_avg6.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_avg7.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_avg7.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_avg8.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_avg8.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_avg9.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_avg9.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_prob1.JPG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_prob1.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_prob2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_prob2.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_prob3.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_prob3.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_prob4.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_prob4.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_prob5.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_prob5.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_prob6.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_prob6.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_prob7.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_prob7.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_prob8.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_prob8.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_slide1.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_slide1.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_slides10.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_slides10.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_slides11.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_slides11.JPG 
-------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_slides12.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_slides12.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_slides13.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_slides13.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_slides14.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_slides14.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_slides2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_slides2.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_slides3.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_slides3.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_slides4.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_slides4.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_slides5.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_slides5.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_slides6.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_slides6.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_slides7.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_slides7.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_slides8.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_slides8.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_slides9.JPG: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_slides9.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/func_approx.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/func_approx.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/readme.md: -------------------------------------------------------------------------------- 1 | #
Value Function Approximation (Part I)
2 | 3 | ####
Reference: Chapter 9 to Chapter 11, Sutton and Barto
 4 | 5 | ## Contents: 6 | 7 | 1) **Introduction** 8 | * Why move to non-tabular methods? 9 | * Value Approximation 10 | 11 | 2) **Incremental Methods** 12 | * Gradient Descent 13 | * The Predictive Objective (MSVE) 14 | * SGD for MSVE 15 | * Feature Vector 16 | * Linear Function Approximator 17 | 18 | 3) **Incremental Prediction Methods** 19 | * Target as MC 20 | * Target as TD(0) 21 | * Target as TD($\lambda$) 22 | 23 | 4) **Iterative Control Approximation** 24 | * Target as MC 25 | * Target as TD(0) 26 | * Target as TD($\lambda$) 27 | 28 | 5) **Average Reward Setting** 29 | -------------------------------------------------------------------------------- /img/break_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/img/break_1.png -------------------------------------------------------------------------------- /img/motivation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/img/motivation.png -------------------------------------------------------------------------------- /img/statement_hinton_bengio_lecun.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/img/statement_hinton_bengio_lecun.png -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Deep Reinforcement Learning in Computer Vision 2 | 3 | [DRL in CV Website](https://bardofcodes.github.io/DRL_in_CV_Papers/) 4 | 5 | **[Google Slides on this introduction](https://docs.google.com/presentation/d/1Nnt6Jj77SLECbeb3m_Y6lqDEH0t8CUePJt2U-HlT6ZM/edit?usp=sharing)** 6 | 7 | In recent years, while the use of Computer Vision techniques/models has burgeoned 8 | for solving Reinforcement Learning tasks (such as games), the opposite flow, of 9 | using techniques/models from Reinforcement Learning to solve paradigms in 10 | Computer Vision, has also been seen. 11 | 12 | Additionally, from a few stalwarts of Computer Vision: 13 | 14 | 
Bold Statement
 15 | 16 | This indicates that just as researchers in Reinforcement Learning benefited from 17 | understanding and applying Computer Vision techniques, researchers in 18 | Computer Vision can benefit from not treating Reinforcement Learning as an esoteric 19 | black box and gaining a comprehensive understanding of this subject. 20 | 21 | Hence, we are presenting a short series of lectures (at our lab) with the following motivation: 22 | 23 | 
motivations
 24 | 25 | # DRL in CV Papers 26 | An additional repository, [DRL_in_CV_Papers](https://github.com/BardOfCodes/DRL_in_CV_Papers), has been made, which consists of a list of published works in computer vision that use Deep Reinforcement Learning. A few of the papers have an added blog post on them as well, highlighting important parts of the paper. 27 | 28 | # Posts 29 | Additionally, for some topics which are important but might not have made a good slide presentation, we have made blog-like posts. This section will see further additions. 30 | It is open for additional posts from all. Kindly look in the `_post` folder for more information. 31 | 32 | # Acknowledgement 33 | 34 | We rely heavily on the following for the content. This work is mostly curation 35 | of the excellent material already provided by these brilliant creators: 36 | 37 | * Reinforcement Learning: An Introduction - Book by Andrew Barto and Richard S. Sutton. 38 | [Link to latest draft](ufal.mff.cuni.cz/~straka/courses/npfl114/2016/sutton-bookdraft2016sep.pdf). 39 | * UCL course on RL - Course by David Silver. [Link to material](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html). 40 | * Code material from various amazing sources: [DannyBritz](https://github.com/dennybritz/reinforcement-learning), 41 | [ShangtongZhang](https://github.com/ShangtongZhang/reinforcement-learning-an-introduction), 42 | [AndrejKarpathy](https://github.com/karpathy/reinforcejs). 43 | 44 | This work has been compiled by Aditya Ganeshan and Trisha Mittal while working at [Video Analytics Lab (VAL), IISc](http://val.serc.iisc.ernet.in/valweb/). We thank the lab for giving us this opportunity. 45 | 46 | ##### Tutorials are still to be added for most chapters. 47 | --------------------------------------------------------------------------------