├── .gitignore ├── .gitmodules ├── LICENSE.md ├── ch_0_rl_rl_in_cv ├── RL_0.ipynb ├── RL_0.slides.html ├── img │ ├── 1.png │ ├── 1_1.png │ ├── 1_10.png │ ├── 1_11.png │ ├── 1_12.png │ ├── 1_2.png │ ├── 1_3.png │ ├── 1_4.png │ ├── 1_5.png │ ├── 1_6.png │ ├── 1_7.png │ ├── 1_8.png │ ├── 1_9.png │ ├── 2.png │ ├── 2_1.png │ ├── 2_10.png │ ├── 2_11.png │ ├── 2_13.png │ ├── 2_2.png │ ├── 2_3.png │ ├── 2_4.png │ ├── 2_5.png │ ├── 2_7.png │ ├── 2_8.png │ ├── 2_9.png │ ├── 3.png │ ├── 4.png │ ├── 5.png │ ├── 6.png │ ├── 7.png │ ├── 8.png │ ├── a.png │ ├── adnet_1.PNG │ ├── adnet_2.PNG │ ├── adnet_3.PNG │ ├── adnet_4.PNG │ ├── adnet_5.PNG │ ├── adnet_6.PNG │ ├── adnet_7.PNG │ ├── adnet_8.PNG │ ├── b.png │ ├── c.png │ ├── d.png │ ├── e.png │ ├── f.png │ ├── x.png │ ├── y.png │ └── z.png └── readme.md ├── ch_10_func_approx_2 ├── RL_FA2.ipynb ├── RL_FA2.slides.html ├── img │ ├── 1.png │ ├── 2.png │ ├── 3.png │ ├── 4.png │ ├── 5.png │ ├── 6.png │ ├── 7.png │ ├── ex.png │ ├── fa2_ex1.JPG │ ├── fa2_ex2.JPG │ ├── fa2_ex3.JPG │ ├── fa2_slides1.JPG │ ├── fa2_slides10.JPG │ ├── fa2_slides11.JPG │ ├── fa2_slides12.JPG │ ├── fa2_slides2.JPG │ ├── fa2_slides3.JPG │ ├── fa2_slides4.JPG │ ├── fa2_slides5.JPG │ ├── fa2_slides6.JPG │ ├── fa2_slides7.JPG │ ├── fa2_slides8.JPG │ └── fa2_slides9.JPG └── readme.md ├── ch_11_policy_gradient ├── RL_11.ipynb ├── RL_11.slides.html ├── img │ ├── pg_1.JPG │ ├── pg_10.JPG │ ├── pg_11.JPG │ ├── pg_12.JPG │ ├── pg_13.JPG │ ├── pg_14.JPG │ ├── pg_15.JPG │ ├── pg_16.JPG │ ├── pg_17.JPG │ ├── pg_18.JPG │ ├── pg_19.JPG │ ├── pg_2.JPG │ ├── pg_20.JPG │ ├── pg_21.JPG │ ├── pg_22.JPG │ ├── pg_23.JPG │ ├── pg_24.JPG │ ├── pg_25.JPG │ ├── pg_26.JPG │ ├── pg_27.JPG │ ├── pg_28.JPG │ ├── pg_29.JPG │ ├── pg_3.JPG │ ├── pg_30.JPG │ ├── pg_34.JPG │ ├── pg_4.JPG │ ├── pg_5.JPG │ ├── pg_6.JPG │ ├── pg_7.JPG │ ├── pg_8.JPG │ ├── pg_9.JPG │ ├── sutton_1.JPG │ ├── sutton_2.JPG │ ├── sutton_3.JPG │ ├── sutton_4.JPG │ └── sutton_5.JPG └── readme.md ├── ch_1_rl_intro ├── .ipynb_checkpoints │ ├── RL_1-checkpoint.ipynb │ └── readme-checkpoint.ipynb ├── RL_1.ipynb ├── RL_1.slides.html ├── img │ ├── 1.png │ ├── 10.png │ ├── 11.png │ ├── 12.png │ ├── 13.png │ ├── 2.png │ ├── 3.png │ ├── 6.png │ ├── 7.png │ ├── 8.png │ ├── 9.png │ ├── e1.png │ ├── e2.png │ ├── e3.png │ └── e4.png ├── readme.ipynb └── readme.md ├── ch_2_rl_in_non_associative ├── .ipynb_checkpoints │ ├── RL_2-checkpoint.ipynb │ ├── tutorial-checkpoint.ipynb │ └── tutorial_solutions-checkpoint.ipynb ├── RL_2.ipynb ├── RL_2.slides.html ├── img │ ├── UCB.JPG │ ├── com_2.jpg │ ├── grad.jpg │ ├── greedyvs.jpg │ ├── mistake.JPG │ ├── mistake_1.JPG │ ├── mistake_2.JPG │ ├── multiarmedbandit.jpg │ ├── optinit.JPG │ └── testbed.JPG ├── readme.md ├── references.md ├── tutorial.ipynb └── tutorial_solutions.ipynb ├── ch_3_rl_finite_mdp ├── .ipynb_checkpoints │ ├── RL_3 - Copy-checkpoint.ipynb │ └── RL_3-checkpoint.ipynb ├── RL_3.ipynb ├── RL_3.slides.html ├── img │ ├── 1.png │ ├── 2.png │ ├── 3.png │ ├── 4.png │ ├── 5.png │ ├── 6.png │ ├── 61.png │ ├── 7.png │ ├── agent_env.PNG │ ├── b1.png │ ├── b2.png │ ├── b3.png │ ├── b4.png │ ├── b5.png │ ├── o.png │ ├── o1.png │ ├── o2.png │ ├── op1.png │ ├── op2.png │ ├── op3.png │ ├── op4.png │ ├── pic1.png │ ├── pic2.png │ ├── return.png │ ├── robot.jpg │ ├── slides_1.PNG │ ├── slides_10.PNG │ ├── slides_2.PNG │ ├── slides_3.PNG │ ├── slides_4.PNG │ ├── slides_5.PNG │ ├── slides_6.PNG │ ├── slides_7.PNG │ ├── slides_8.PNG │ ├── slides_9.PNG │ └── unified.png └── readme.md ├── ch_4_rl_dynamic_programming ├── 
.ipynb_checkpoints │ ├── RL_4-checkpoint.ipynb │ └── readme-checkpoint.ipynb ├── RL_4.ipynb ├── RL_4.slides.html ├── img │ ├── a.png │ ├── aaa.png │ ├── aaaa.png │ ├── async_1.PNG │ ├── async_2.PNG │ ├── async_3.PNG │ ├── async_ex_1.PNG │ ├── async_ex_2.PNG │ ├── async_ex_3.PNG │ ├── async_ex_4.PNG │ ├── b.png │ ├── b1.png │ ├── b2.png │ ├── contr_1.PNG │ ├── contr_2.PNG │ ├── contr_3.PNG │ ├── d.png │ ├── dp_ex_1.PNG │ ├── e.png │ ├── e1.png │ ├── e2.png │ ├── e3.png │ ├── p1.png │ ├── p2.png │ ├── s1.png │ ├── s11.png │ ├── s2.png │ ├── s3.png │ ├── sa.png │ ├── sb.png │ ├── v1.png │ ├── v2.png │ ├── v3.png │ ├── v4.png │ └── v5.png └── readme.md ├── ch_5_rl_mc_methods ├── .ipynb_checkpoints │ ├── RL_5-checkpoint.ipynb │ └── readme-checkpoint.ipynb ├── RL_5.ipynb ├── RL_5.slides.html ├── img │ ├── 1.png │ ├── 2.png │ ├── 3.png │ ├── 4.png │ ├── 5.png │ ├── 6.png │ ├── a1.png │ ├── a2.png │ ├── a3.png │ ├── a4.png │ ├── a5.png │ ├── a6.png │ ├── a7.png │ ├── c1.png │ ├── c2.png │ ├── c3.png │ ├── c4.png │ ├── c5.png │ ├── c6.png │ ├── c7.png │ ├── imp_sam_1.PNG │ ├── imp_sam_2.PNG │ ├── imp_sam_3.PNG │ ├── imp_sam_4.PNG │ └── imp_sam_5.PNG ├── readme.ipynb └── readme.md ├── ch_6_td_methods ├── RL_6.ipynb ├── RL_6.slides.html ├── img │ ├── 10_2.png │ ├── DPback.JPG │ ├── MCback.JPG │ ├── TDback.JPG │ ├── TDex1.JPG │ ├── backup_q.JPG │ ├── batch_td.PNG │ ├── bootsam.JPG │ ├── cliff.jpg │ ├── doubleq.JPG │ ├── ex_sarsa.JPG │ ├── maxbias.JPG │ ├── mcvstd.PNG │ ├── mcvstd_2.PNG │ ├── qvssarsa.JPG │ ├── tdex2.JPG │ └── tdmarkov.jpg └── readme.md ├── ch_7_rl_eligibility_traces ├── .ipynb_checkpoints │ └── RL_7-checkpoint.ipynb ├── RL_7.ipynb ├── RL_7.slides.html ├── img │ ├── 1.png │ ├── 10.png │ ├── 10_1.png │ ├── 10_2.png │ ├── 11.png │ ├── 12.png │ ├── 13.png │ ├── 14.png │ ├── 15.png │ ├── 16.png │ ├── 17.png │ ├── 18.png │ ├── 19.png │ ├── 2.png │ ├── 20.png │ ├── 21.png │ ├── 3.png │ ├── 4.png │ ├── 5.png │ ├── 6.png │ ├── 7.png │ ├── 8.png │ ├── 9.png │ ├── bp.png │ ├── eg1.png │ ├── eq1.png │ ├── eq2.png │ ├── importancesampling.png │ ├── nStepOnline.png │ ├── prediction.png │ ├── return.png │ ├── return1.png │ ├── sc1.png │ ├── sc2.png │ ├── update.png │ └── update1.png └── readme.md ├── ch_8_model_based ├── RL_8.ipynb ├── RL_8.slides.html ├── img │ ├── dyna.jpeg │ ├── dyna_eq.JPG │ ├── dyna_perf.JPG │ ├── dyna_perf2.JPG │ ├── dynaenvchange.JPG │ ├── dynaenvchange2.JPG │ ├── mcts0.JPG │ ├── mcts1.JPG │ ├── mcts2.JPG │ ├── mcts3.JPG │ ├── mcts4.JPG │ ├── mcts5.JPG │ ├── mctssearch1.JPG │ ├── mctssearch2.JPG │ ├── mctssteps.JPG │ ├── modelbasedplanning.JPG │ ├── psweep.JPG │ ├── psweep_ex.JPG │ ├── simmontesearch.JPG │ └── sslearning.JPG └── readme.md ├── ch_9_func_approx_1 ├── RL_FA1.ipynb ├── RL_FA1.slides.html ├── img │ ├── fa_avg1.JPG │ ├── fa_avg2.JPG │ ├── fa_avg3.JPG │ ├── fa_avg4.JPG │ ├── fa_avg5.JPG │ ├── fa_avg6.JPG │ ├── fa_avg7.JPG │ ├── fa_avg8.JPG │ ├── fa_avg9.JPG │ ├── fa_prob1.JPG │ ├── fa_prob2.JPG │ ├── fa_prob3.JPG │ ├── fa_prob4.JPG │ ├── fa_prob5.JPG │ ├── fa_prob6.JPG │ ├── fa_prob7.JPG │ ├── fa_prob8.JPG │ ├── fa_slide1.JPG │ ├── fa_slides10.JPG │ ├── fa_slides11.JPG │ ├── fa_slides12.JPG │ ├── fa_slides13.JPG │ ├── fa_slides14.JPG │ ├── fa_slides2.JPG │ ├── fa_slides3.JPG │ ├── fa_slides4.JPG │ ├── fa_slides5.JPG │ ├── fa_slides6.JPG │ ├── fa_slides7.JPG │ ├── fa_slides8.JPG │ ├── fa_slides9.JPG │ └── func_approx.JPG └── readme.md ├── img ├── break_1.png ├── motivation.png └── statement_hinton_bengio_lecun.png └── readme.md /.gitignore: 
-------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | */.ipynb_checkpoints/ 5 | # C extensions 6 | *.so 7 | ## for the website 8 | _site/ 9 | .sass-cache/ 10 | .jekyll-metadata 11 | .DS_Store 12 | Gemfile.lock 13 | 14 | # Distribution / packaging 15 | .Python 16 | env/ 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *,cover 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | 59 | # Sphinx documentation 60 | docs/_build/ 61 | 62 | # PyBuilder 63 | target/ 64 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "reveal.js"] 2 | path = reveal.js 3 | url = https://github.com/hakimel/reveal.js.git 4 | branch = master 5 | [submodule "ch_2_rl_in_non_associative/multi_arm_bandits"] 6 | path = ch_2_rl_in_non_associative/multi_arm_bandits 7 | url = https://github.com/BardOfCodes/multi_arm_bandits.git 8 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Aditya Ganeshan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/1.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/1_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/1_1.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/1_10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/1_10.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/1_11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/1_11.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/1_12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/1_12.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/1_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/1_2.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/1_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/1_3.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/1_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/1_4.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/1_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/1_5.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/1_6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/1_6.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/1_7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/1_7.png 
-------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/1_8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/1_8.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/1_9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/1_9.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/2.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/2_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/2_1.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/2_10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/2_10.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/2_11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/2_11.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/2_13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/2_13.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/2_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/2_2.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/2_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/2_3.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/2_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/2_4.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/2_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/2_5.png 
-------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/2_7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/2_7.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/2_8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/2_8.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/2_9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/2_9.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/3.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/4.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/5.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/6.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/7.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/8.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/a.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/adnet_1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/adnet_1.PNG 
-------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/adnet_2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/adnet_2.PNG -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/adnet_3.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/adnet_3.PNG -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/adnet_4.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/adnet_4.PNG -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/adnet_5.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/adnet_5.PNG -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/adnet_6.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/adnet_6.PNG -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/adnet_7.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/adnet_7.PNG -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/adnet_8.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/adnet_8.PNG -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/b.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/c.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/c.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/d.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/e.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/e.png 
-------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/f.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/f.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/x.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/y.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/y.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/img/z.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_0_rl_rl_in_cv/img/z.png -------------------------------------------------------------------------------- /ch_0_rl_rl_in_cv/readme.md: -------------------------------------------------------------------------------- 1 | #
Deep Reinforcement Learning in CV (3 papers)
2 | 3 | ## Contents: 4 | 5 | 1) **We will look at the following tasks:** 6 | * Object Detection 7 | * Paper: Caicedo, Juan C., and Svetlana Lazebnik. "Active object localization with deep reinforcement learning." Proceedings of the IEEE International Conference on Computer Vision. 2015. 8 | * Action Detection 9 | * Paper: Huang, Jingjia, et al. "A Self-Adaptive Proposal Model for Temporal Action Detection based on Reinforcement Learning." arXiv preprint arXiv:1706.07251 (2017). 10 | * Visual Tracking 11 | * Paper: Yun, Sangdoo, Jongwon Choi, Youngjoon Yoo, Kimin Yun, and Jin Young Choi. "Action-Decision Networks for Visual Tracking with Deep Reinforcement Learning." Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 2017. 12 | 13 | 2) **For each task, we answer the following:** 14 | * What is the task? 15 | * Can we identify the RL components: 16 | * State Space 17 | * Action Space 18 | * Reward System 19 | * Network Architecture 20 | * Why use RL for this task? 21 | 22 | -------------------------------------------------------------------------------- /ch_10_func_approx_2/RL_FA2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "#
Value Function Approximation
\n", 12 | "##
Part II
\n", 13 | "##
Reference: Sutton and Barto, Chapters 9-11
\n" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "slideshow": { 20 | "slide_type": "slide" 21 | } 22 | }, 23 | "source": [ 24 | "##
Table of Contents
\n", 25 | "
\n", 26 | "\n", 27 | "* **Batch Reinforcement Methods**

\n", 28 | "\n", 29 | "* **Least Squares Policy Iteration(LSPI)**

\n", 30 | "\n" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": { 36 | "slideshow": { 37 | "slide_type": "slide" 38 | } 39 | }, 40 | "source": [ 41 | "#
Batch Reinforcement Methods
" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": { 47 | "slideshow": { 48 | "slide_type": "slide" 49 | } 50 | }, 51 | "source": [ 52 | "##
Batch Reinforcement Methods
\n", 53 | "
\n", 54 | "* Gradient descent is simple and appealing

\n", 55 | "* But it is not sample efficient

\n", 56 | "* Batch methods seek to find the best fitting value function

\n", 57 | "* Given the agent’s experience (“training data”)

" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": { 63 | "slideshow": { 64 | "slide_type": "subslide" 65 | } 66 | }, 67 | "source": [ 68 | "##
Least Squares Prediction
\n", 69 | "\n", 70 | "
\"Multi-armed
" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": { 76 | "slideshow": { 77 | "slide_type": "subslide" 78 | } 79 | }, 80 | "source": [ 81 | "##
Stochastic Gradient Descent with Experience Replay
\n", 82 | "\n", 83 | "
\"Multi-armed
" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": { 89 | "slideshow": { 90 | "slide_type": "subslide" 91 | } 92 | }, 93 | "source": [ 94 | "##
Experience Replay in Deep Q-Networks (DQN)
\n", 95 | "\n", 96 | "
\"Multi-armed
" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": { 102 | "slideshow": { 103 | "slide_type": "slide" 104 | } 105 | }, 106 | "source": [ 107 | "#
DQN in ATARI
" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": { 113 | "slideshow": { 114 | "slide_type": "slide" 115 | } 116 | }, 117 | "source": [ 118 | "## The model\n", 119 | "\n", 120 | "
\"Multi-armed
" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": { 126 | "slideshow": { 127 | "slide_type": "subslide" 128 | } 129 | }, 130 | "source": [ 131 | "## Performance\n", 132 | "\n", 133 | "
\"Multi-armed
" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": { 139 | "slideshow": { 140 | "slide_type": "subslide" 141 | } 142 | }, 143 | "source": [ 144 | "## Benefits of Experience Replay and Double DQN\n", 145 | "\n", 146 | "
\"Multi-armed
" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": { 152 | "slideshow": { 153 | "slide_type": "slide" 154 | } 155 | }, 156 | "source": [ 157 | "## DQN Example and Code\n", 158 | "
\"Multi-armed
\n", 159 | "\n", 160 | "#### CartPole Example\n", 161 | "The agent has to decide between two actions - moving the cart left or right - so that the pole attached to it stays upright.\n", 162 | "\n", 163 | "##### State Space\n", 164 | "State is the difference between the current screen patch and the previous one. This will allow the agent to take the velocity of the pole into account from one image." 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": { 170 | "slideshow": { 171 | "slide_type": "subslide" 172 | } 173 | }, 174 | "source": [ 175 | "##### Q-network\n", 176 | "\n", 177 | "* Our model will be a convolutional neural network that takes in the difference between the current and previous screen patches. \n", 178 | "* It has two outputs, representing Q(s,left) and Q(s,right) (where s is the input to the network). \n", 179 | "* In effect, the network is trying to predict the quality of taking each action given the current input.\n", 180 | "
\"Multi-armed
" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": { 186 | "slideshow": { 187 | "slide_type": "subslide" 188 | } 189 | }, 190 | "source": [ 191 | "##### Replay Memory\n", 192 | "* Experience replay memory is used for training the DQN. \n", 193 | "* It stores the transitions that the agent observes, allowing us to reuse this data later. \n", 194 | "* By sampling from it randomly, the transitions that build up a batch are decorrelated. \n", 195 | "* It has been shown that this greatly stabilizes and improves the DQN training procedure.\n", 196 | "
\"Multi-armed
" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": { 202 | "slideshow": { 203 | "slide_type": "subslide" 204 | } 205 | }, 206 | "source": [ 207 | "##### Input Extraction\n", 208 | "\n", 209 | "How do we get the crop of the cart?\n", 210 | "\n", 211 | "
\"Multi-armed
" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": { 217 | "slideshow": { 218 | "slide_type": "subslide" 219 | } 220 | }, 221 | "source": [ 222 | "##### Selecting an Action\n", 223 | "\n", 224 | "This is done based on $\\epsilon$ greedy policy.\n", 225 | "
\"Multi-armed
" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": { 231 | "slideshow": { 232 | "slide_type": "subslide" 233 | } 234 | }, 235 | "source": [ 236 | "##### Training\n", 237 | "
\"Multi-armed
\n", 238 | "
\"Multi-armed
" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": { 244 | "slideshow": { 245 | "slide_type": "subslide" 246 | } 247 | }, 248 | "source": [ 249 | "
\"Multi-armed
" 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": { 255 | "slideshow": { 256 | "slide_type": "slide" 257 | } 258 | }, 259 | "source": [ 260 | "#
Linear Least Squares Prediction
" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": { 266 | "slideshow": { 267 | "slide_type": "slide" 268 | } 269 | }, 270 | "source": [ 271 | "##
Linear Least Squares Prediction
\n", 272 | "

\n", 273 | "* Experience replay finds least squares solution

\n", 274 | "* But it may take many iterations

\n", 275 | "* Using linear value function approximation $\\hat{v}(s, w) = x(s)^Tw$

\n", 276 | "* We can solve the least squares solution directly" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": { 282 | "slideshow": { 283 | "slide_type": "subslide" 284 | } 285 | }, 286 | "source": [ 287 | "##
Linear Least Squares Prediction
\n", 288 | "\n", 289 | "
\"Multi-armed
" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": { 295 | "slideshow": { 296 | "slide_type": "subslide" 297 | } 298 | }, 299 | "source": [ 300 | "##
Linear Least Squares Prediction Algorithms
\n", 301 | "\n", 302 | "
\"Multi-armed
" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": null, 308 | "metadata": { 309 | "collapsed": true 310 | }, 311 | "outputs": [], 312 | "source": [] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": { 317 | "slideshow": { 318 | "slide_type": "subslide" 319 | } 320 | }, 321 | "source": [ 322 | "##
Linear Least Squares Prediction Algorithms
\n", 323 | "\n", 324 | "
\"Multi-armed
" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": { 330 | "slideshow": { 331 | "slide_type": "subslide" 332 | } 333 | }, 334 | "source": [ 335 | "##
Least Squares Policy Iteration (LSPI)
\n", 336 | "\n", 337 | "
\"Multi-armed
" 338 | ] 339 | }, 340 | { 341 | "cell_type": "markdown", 342 | "metadata": { 343 | "slideshow": { 344 | "slide_type": "subslide" 345 | } 346 | }, 347 | "source": [ 348 | "##
Least Squares Action-Value Function Approximation
\n", 349 | "\n", 350 | "
\"Multi-armed
" 351 | ] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "metadata": { 356 | "slideshow": { 357 | "slide_type": "subslide" 358 | } 359 | }, 360 | "source": [ 361 | "##
Least Squares Control
\n", 362 | "\n", 363 | "
\"Multi-armed
" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": { 369 | "slideshow": { 370 | "slide_type": "subslide" 371 | } 372 | }, 373 | "source": [ 374 | "##
Least Squares Q-Learning
\n", 375 | "\n", 376 | "
\"Multi-armed
" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": { 382 | "slideshow": { 383 | "slide_type": "subslide" 384 | } 385 | }, 386 | "source": [ 387 | "\n", 388 | "##
Least Squares Policy Iteration (LSPI) Algorithm
\n", 389 | "\n", 390 | "
\"Multi-armed
" 391 | ] 392 | } 393 | ], 394 | "metadata": { 395 | "anaconda-cloud": {}, 396 | "celltoolbar": "Slideshow", 397 | "kernelspec": { 398 | "display_name": "Python [conda root]", 399 | "language": "python", 400 | "name": "conda-root-py" 401 | }, 402 | "language_info": { 403 | "codemirror_mode": { 404 | "name": "ipython", 405 | "version": 3 406 | }, 407 | "file_extension": ".py", 408 | "mimetype": "text/x-python", 409 | "name": "python", 410 | "nbconvert_exporter": "python", 411 | "pygments_lexer": "ipython3", 412 | "version": "3.5.2" 413 | }, 414 | "widgets": { 415 | "state": {}, 416 | "version": "1.1.2" 417 | } 418 | }, 419 | "nbformat": 4, 420 | "nbformat_minor": 2 421 | } 422 | -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/1.png -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/2.png -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/3.png -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/4.png -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/5.png -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/6.png -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/7.png -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/ex.png -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/fa2_ex1.JPG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/fa2_ex1.JPG -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/fa2_ex2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/fa2_ex2.JPG -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/fa2_ex3.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/fa2_ex3.JPG -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/fa2_slides1.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/fa2_slides1.JPG -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/fa2_slides10.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/fa2_slides10.JPG -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/fa2_slides11.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/fa2_slides11.JPG -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/fa2_slides12.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/fa2_slides12.JPG -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/fa2_slides2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/fa2_slides2.JPG -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/fa2_slides3.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/fa2_slides3.JPG -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/fa2_slides4.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/fa2_slides4.JPG -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/fa2_slides5.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/fa2_slides5.JPG 
-------------------------------------------------------------------------------- /ch_10_func_approx_2/img/fa2_slides6.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/fa2_slides6.JPG -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/fa2_slides7.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/fa2_slides7.JPG -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/fa2_slides8.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/fa2_slides8.JPG -------------------------------------------------------------------------------- /ch_10_func_approx_2/img/fa2_slides9.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_10_func_approx_2/img/fa2_slides9.JPG -------------------------------------------------------------------------------- /ch_10_func_approx_2/readme.md: -------------------------------------------------------------------------------- 1 | #
Value Function Approximation (Part II)
2 | 3 | ####
Reference: Chapters 9 to 11, Sutton and Barto
4 | 5 | ## Contents: 6 | 7 | 1) **Batch Reinforcement Methods** 8 | * Least Squares Prediction 9 | * SGD with Experience Replay 10 | * Experience Replay in Deep Q-Networks (DQN) 11 | * DQN in Atari Games 12 | 13 | 2) **Linear Least Squares Prediction** 14 | -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_1.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_1.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_10.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_10.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_11.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_11.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_12.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_12.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_13.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_13.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_14.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_14.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_15.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_15.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_16.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_16.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_17.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_17.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_18.JPG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_18.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_19.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_19.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_2.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_20.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_20.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_21.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_21.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_22.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_22.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_23.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_23.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_24.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_24.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_25.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_25.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_26.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_26.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_27.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_27.JPG -------------------------------------------------------------------------------- 
/ch_11_policy_gradient/img/pg_28.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_28.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_29.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_29.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_3.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_3.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_30.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_30.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_34.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_34.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_4.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_4.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_5.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_5.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_6.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_6.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_7.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_7.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_8.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_8.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/pg_9.JPG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/pg_9.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/sutton_1.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/sutton_1.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/sutton_2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/sutton_2.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/sutton_3.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/sutton_3.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/sutton_4.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/sutton_4.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/img/sutton_5.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_11_policy_gradient/img/sutton_5.JPG -------------------------------------------------------------------------------- /ch_11_policy_gradient/readme.md: -------------------------------------------------------------------------------- 1 | #
Policy Gradient Methods
2 | 3 | ## Contents: 4 | 5 | 1) **Introduction** 6 | * Policy Gradient vs Value Approximators 7 | * Advantages/Disadvantages 8 | 9 | 2) **REINFORCE: Simplest Policy Gradient Method** 10 | 11 | 3) **Actor-Critic Methods** 12 | 13 | 4) **Enhancements to Actor-Critic Method** 14 | -------------------------------------------------------------------------------- /ch_1_rl_intro/.ipynb_checkpoints/RL_1-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "#
Introduction to Reinforcement Learning
\n", 12 | "\n", 13 | "###
Reference: Chapter 1, Sutton and Barto
\n" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "slideshow": { 20 | "slide_type": "slide" 21 | } 22 | }, 23 | "source": [ 24 | "#
Contents
\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "slideshow": { 31 | "slide_type": "fragment" 32 | } 33 | }, 34 | "source": [ 35 | "* RL: Formal Definition" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": { 41 | "slideshow": { 42 | "slide_type": "fragment" 43 | } 44 | }, 45 | "source": [ 46 | "* RL vs Supervised Learning vs Unsupervised learning" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": { 52 | "slideshow": { 53 | "slide_type": "fragment" 54 | } 55 | }, 56 | "source": [ 57 | "* Important RL Perspectives\n", 58 | " * Goal (Reward Hypothesis)\n", 59 | " * Sequential Decision Making Problem\n", 60 | " * Interaction between Agent and Environment\n", 61 | " " 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": { 67 | "slideshow": { 68 | "slide_type": "fragment" 69 | } 70 | }, 71 | "source": [ 72 | "* Components of RL Agent" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": { 78 | "slideshow": { 79 | "slide_type": "fragment" 80 | } 81 | }, 82 | "source": [ 83 | "* RL Problems: Learning and Planning" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": { 89 | "slideshow": { 90 | "slide_type": "fragment" 91 | } 92 | }, 93 | "source": [ 94 | "* Prediction and Control" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": { 100 | "slideshow": { 101 | "slide_type": "slide" 102 | } 103 | }, 104 | "source": [ 105 | "#
RL: Formal Definition
" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": { 111 | "slideshow": { 112 | "slide_type": "subslide" 113 | } 114 | }, 115 | "source": [ 116 | " \"Reinforcement learning is the problem of getting an agent to act in the world so as to maximize its rewards. For example, consider teaching a dog a new trick: you cannot tell it what to do, but you can reward/punish it if it does the right/wrong thing. It has to figure out what it did that made it get the reward/punishment, which is known as the credit assignment problem.\"" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": { 122 | "slideshow": { 123 | "slide_type": "subslide" 124 | } 125 | }, 126 | "source": [ 127 | "
\"Example1\"/
" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": { 133 | "slideshow": { 134 | "slide_type": "subslide" 135 | } 136 | }, 137 | "source": [ 138 | "
\"Example2\"/
" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": { 144 | "slideshow": { 145 | "slide_type": "slide" 146 | } 147 | }, 148 | "source": [ 149 | "#
RL vs Supervised Learning vs Unsupervised Learning
" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": { 155 | "slideshow": { 156 | "slide_type": "subslide" 157 | } 158 | }, 159 | "source": [ 160 | "## Supervised Learning\n", 161 | "\n", 162 | "1) A human builds a classifier based on input and output data\n", 163 | "\n", 164 | "2) That classifier is trained with a training set of data\n", 165 | "\n", 166 | "3) That classifier is tested with a test set of data\n", 167 | "\n", 168 | "4) Deployment if the output is satisfactory\n", 169 | "\n", 170 | "To be used when, \"I know how to classify this data, I just need you(the classifier) to sort it.\"\n", 171 | "\n", 172 | "Point of method: To class labels or to produce real numbers" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": { 178 | "slideshow": { 179 | "slide_type": "subslide" 180 | } 181 | }, 182 | "source": [ 183 | "## Unsupervised Learning\n", 184 | "\n", 185 | "\n", 186 | "1) A human builds an algorithm based on input data\n", 187 | "\n", 188 | "2) That algorithm is tested with a test set of data (in which the algorithm creates the classifier)\n", 189 | "\n", 190 | "3) Deployment if the classifier is satisfactory\n", 191 | "\n", 192 | "To be used when, \"I have no idea how to classify this data, can you(the algorithm) create a classifier for me?\"\n", 193 | "Point of method: To class labels or to predict" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": { 199 | "slideshow": { 200 | "slide_type": "subslide" 201 | } 202 | }, 203 | "source": [ 204 | "## Reinforcement Learning\n", 205 | "\n", 206 | "\n", 207 | "1) A human builds an algorithm based on input data\n", 208 | "\n", 209 | "2) That algorithm presents a state dependent on the input data in which a user rewards or punishes the algorithm via the action the algorithm took, this continues over time\n", 210 | "\n", 211 | "3) That algorithm learns from the reward/punishment and updates itself, this continues\n", 212 | "\n", 213 | "4) It's always in production, it needs to learn real data to be able to present actions from states\n", 214 | "\n", 215 | "To be used when, \"I have no idea how to classify this data, can you classify this data and I'll give you a reward if it's correct or I'll punish you if it's not.\"\n" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": { 221 | "slideshow": { 222 | "slide_type": "subslide" 223 | } 224 | }, 225 | "source": [ 226 | "### RL vs Supervised Learning\n", 227 | "* Training Examples\n", 228 | " * Supervised Learning: Training Examples of the formNo training examples from a knowledgeable external supervisor (situation together with a label).\n", 229 | " * RL: No such training examples.\n", 230 | "* Objective Functions\n", 231 | " * Supervised Learning: Aim is to extrapolate, or generalize so that it acts correctly in situations not present in the training set. \n", 232 | " * In RL, it is often impractical to obtain examples of desired behavior that are both correct and representative of all the situations and an agent must be able to learn from its own experience." 
233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": { 238 | "slideshow": { 239 | "slide_type": "subslide" 240 | } 241 | }, 242 | "source": [ 243 | "### RL vs Unsupervised Learning\n", 244 | "* Unsupervised Learning is about finding structure hidden in collections of unlabeled data.\n", 245 | "* Uncovering structure in an agent’s experience can certainly be useful in reinforcement learning, but by itself does not address the reinforcement learning agent’s problem of maximizing a reward signal." 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": { 251 | "slideshow": { 252 | "slide_type": "subslide" 253 | } 254 | }, 255 | "source": [ 256 | "### Examples of RL\n", 257 | "* Fly stunt manoeuvres in a helicopter\n", 258 | "* Defeat the world champion at Backgammon\n", 259 | "* Manage an investment portfolio\n", 260 | "* Control a power station\n", 261 | "* Make a humanoid robot walk\n", 262 | "* Play many different Atari games better than humans" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": { 268 | "slideshow": { 269 | "slide_type": "slide" 270 | } 271 | }, 272 | "source": [ 273 | "#
Important RL Perspectives
" 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": { 279 | "slideshow": { 280 | "slide_type": "slide" 281 | } 282 | }, 283 | "source": [ 284 | "## Goal of RL (Reward Hypothesis)\n", 285 | "
\"RewardHypothesis\"
\n" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": { 291 | "slideshow": { 292 | "slide_type": "subslide" 293 | } 294 | }, 295 | "source": [ 296 | "## Reward Examples\n", 297 | "* stunt manoeuvres in a helicopter\n", 298 | " * +ve reward for following desired trajectory\n", 299 | " * −ve reward for crashing\n", 300 | "* Defeat the world champion at Backgammon\n", 301 | " * +/−ve reward for winning/losing a game\n", 302 | "* Manage an investment portfolio\n", 303 | " * +ve reward for each dollar in bank\n", 304 | "* Control a power station\n", 305 | " * +ve reward for producing power\n", 306 | " * −ve reward for exceeding safety thresholds\n", 307 | "* Make a humanoid robot walk\n", 308 | " * +ve reward for forward motion\n", 309 | " * −ve reward for falling over\n", 310 | "* Play many different Atari games better than humans\n", 311 | " * +/−ve reward for increasing/decreasing score" 312 | ] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": { 317 | "slideshow": { 318 | "slide_type": "slide" 319 | } 320 | }, 321 | "source": [ 322 | "## Sequential Decision Making Problem\n", 323 | "* Goal: select actions to maximise total future reward\n", 324 | "* Actions may have long term consequences\n", 325 | "* Reward may be delayed\n", 326 | "* It may be better to sacrifice immediate reward to gain more long-term reward\n", 327 | "* Examples:\n", 328 | " * A financial investment (may take months to mature)\n", 329 | " * Refuelling a helicopter (might prevent a crash in several hours)\n", 330 | " * Blocking opponent moves (might help winning chances many moves from now)\n" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": { 336 | "slideshow": { 337 | "slide_type": "slide" 338 | } 339 | }, 340 | "source": [ 341 | "## Interaction between Agent and Environment\n", 342 | "
\"RewardHypothesis\"
" 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "metadata": { 348 | "slideshow": { 349 | "slide_type": "subslide" 350 | } 351 | }, 352 | "source": [ 353 | "## Interaction between Agent and Environment\n", 354 | "
\"RewardHypothesis\"
" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": { 360 | "slideshow": { 361 | "slide_type": "subslide" 362 | } 363 | }, 364 | "source": [ 365 | "## History and State\n", 366 | "
\"HistoryandState\"
" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": { 372 | "slideshow": { 373 | "slide_type": "subslide" 374 | } 375 | }, 376 | "source": [ 377 | "## Environment State\n", 378 | "
\"HistoryandState\"
" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": { 384 | "slideshow": { 385 | "slide_type": "subslide" 386 | } 387 | }, 388 | "source": [ 389 | "## Agent State\n", 390 | "
\"HistoryandState\"
" 391 | ] 392 | }, 393 | { 394 | "cell_type": "markdown", 395 | "metadata": { 396 | "slideshow": { 397 | "slide_type": "subslide" 398 | } 399 | }, 400 | "source": [ 401 | "## Information State\n", 402 | "
\"HistoryandState\"
" 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": { 408 | "slideshow": { 409 | "slide_type": "subslide" 410 | } 411 | }, 412 | "source": [ 413 | "## Fully Observable Environment\n", 414 | "
\"HistoryandState\"
" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "metadata": { 420 | "slideshow": { 421 | "slide_type": "subslide" 422 | } 423 | }, 424 | "source": [ 425 | "## Partially Observable Environment\n", 426 | "
\"HistoryandState\"
" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "metadata": { 432 | "slideshow": { 433 | "slide_type": "slide" 434 | } 435 | }, 436 | "source": [ 437 | "## Major Components of a RL Agent\n", 438 | "An RL agent may include one or more of these components:\n", 439 | "* **Policy**: agent’s behaviour function\n", 440 | "* **Value function**: how good is each state and/or action\n", 441 | "* **Model**: agent’s representation of the environment" 442 | ] 443 | }, 444 | { 445 | "cell_type": "markdown", 446 | "metadata": { 447 | "slideshow": { 448 | "slide_type": "subslide" 449 | } 450 | }, 451 | "source": [ 452 | "## Policy\n", 453 | "* A policy is the agent’s behaviour\n", 454 | "* It is a map from state to action, e.g.\n", 455 | "* Deterministic policy: $a = π(s)$\n", 456 | "* Stochastic policy: $π(a|s) = P[A_t = a|S_t = s]$" 457 | ] 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "metadata": { 462 | "slideshow": { 463 | "slide_type": "subslide" 464 | } 465 | }, 466 | "source": [ 467 | "## Value function\n", 468 | "* Value function is a prediction of future reward\n", 469 | "* Used to evaluate the goodness/badness of states\n", 470 | "* And therefore to select between actions, e.g.\n", 471 | "$$v_π(s) = E_π[R _{t+1} + γ*R_{t+2} + γ^{2}*R_{t+3} + ... | S_t = s]$$" 472 | ] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": { 477 | "slideshow": { 478 | "slide_type": "subslide" 479 | } 480 | }, 481 | "source": [ 482 | "## Model\n", 483 | "* A model predicts what the environment will do next\n", 484 | "* **Transitions**: P predicts the next state (i.e. dynamics)\n", 485 | "* **Rewards**: R predicts the next (immediate) reward, e.g.\n", 486 | "$$ P_{ss'} = P[S_{t+1} = s' | S_t = s, A_t = a]$$\n", 487 | "$$ R^{a}_{s} = E[R_{t+1} | S_t = s, A_t = a]$$" 488 | ] 489 | }, 490 | { 491 | "cell_type": "markdown", 492 | "metadata": { 493 | "slideshow": { 494 | "slide_type": "subslide" 495 | } 496 | }, 497 | "source": [ 498 | "## Maze Example\n", 499 | "
\"HistoryandState\"
\n" 500 | ] 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "metadata": { 505 | "slideshow": { 506 | "slide_type": "subslide" 507 | } 508 | }, 509 | "source": [ 510 | "# Maze Example: Policy\n", 511 | "
\"HistoryandState\"
\n", 512 | "\n", 513 | "* Arrows represent policy $\\pi(s)$ for each state s. " 514 | ] 515 | }, 516 | { 517 | "cell_type": "markdown", 518 | "metadata": { 519 | "slideshow": { 520 | "slide_type": "subslide" 521 | } 522 | }, 523 | "source": [ 524 | "# Maze Example: Value Function\n", 525 | "
\"HistoryandState\"
" 526 | ] 527 | }, 528 | { 529 | "cell_type": "markdown", 530 | "metadata": { 531 | "slideshow": { 532 | "slide_type": "subslide" 533 | } 534 | }, 535 | "source": [ 536 | "# Maze Example: Model\n", 537 | "
\"HistoryandState\"
" 538 | ] 539 | }, 540 | { 541 | "cell_type": "markdown", 542 | "metadata": { 543 | "slideshow": { 544 | "slide_type": "slide" 545 | } 546 | }, 547 | "source": [ 548 | "## Learning and Planning\n", 549 | "Two fundamental problems in sequential decision making:\n", 550 | "* Reinforcement Learning:\n", 551 | " * The environment is initially unknown\n", 552 | " * The agent interacts with the environment\n", 553 | " * The agent improves its policy\n", 554 | "* Planning:\n", 555 | " * A model of the environment is known\n", 556 | " * The agent performs computations with its model (without any external interaction)\n", 557 | " * The agent improves its policy a.k.a. deliberation, reasoning, introspection, pondering, thought, search" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": null, 563 | "metadata": { 564 | "collapsed": true, 565 | "slideshow": { 566 | "slide_type": "slide" 567 | } 568 | }, 569 | "outputs": [], 570 | "source": [ 571 | "## Exploration and Exploitation\n", 572 | "\n", 573 | "* Reinforcement learning is like trial-and-error learning\n", 574 | "\n", 575 | "* The agent should discover a good policy,\n", 576 | " * From its experiences of the environment,\n", 577 | " * Without losing too much reward along the way\n", 578 | "\n", 579 | "* **Exploration** finds more information about the environment\n", 580 | "\n", 581 | "* **Exploitation** exploits known information to maximise reward\n", 582 | "\n", 583 | "* It is usually important to explore as well as exploit (In Detail => Chapter 2)" 584 | ] 585 | }, 586 | { 587 | "cell_type": "markdown", 588 | "metadata": { 589 | "slideshow": { 590 | "slide_type": "subslide" 591 | } 592 | }, 593 | "source": [ 594 | "## Examples of Explortion and Exploitation\n", 595 | "\n", 596 | "* Restaurant Selection\n", 597 | " * Exploitation Go to your favourite restaurant\n", 598 | " * Exploration Try a new restaurant\n", 599 | "* Online Banner Advertisements\n", 600 | " * Exploitation Show the most successful advert\n", 601 | " * Exploration Show a different advert\n", 602 | "* Oil Drilling\n", 603 | " * Exploitation Drill at the best known location\n", 604 | " * Exploration Drill at a new location\n", 605 | "* Game Playing\n", 606 | " * Exploitation Play the move you believe is best\n", 607 | " * Exploration Play an experimental move" 608 | ] 609 | }, 610 | { 611 | "cell_type": "markdown", 612 | "metadata": { 613 | "slideshow": { 614 | "slide_type": "slide" 615 | } 616 | }, 617 | "source": [ 618 | "## Summary\n", 619 | "\n", 620 | "* We got introduced to the basic terminologies of RL.\n", 621 | "\n", 622 | "\n", 623 | "* We got an intuition behind how an RL agent can solve problems.\n", 624 | "\n" 625 | ] 626 | } 627 | ], 628 | "metadata": { 629 | "celltoolbar": "Slideshow", 630 | "kernelspec": { 631 | "display_name": "Python 2", 632 | "language": "python", 633 | "name": "python2" 634 | }, 635 | "language_info": { 636 | "codemirror_mode": { 637 | "name": "ipython", 638 | "version": 2 639 | }, 640 | "file_extension": ".py", 641 | "mimetype": "text/x-python", 642 | "name": "python", 643 | "nbconvert_exporter": "python", 644 | "pygments_lexer": "ipython2", 645 | "version": "2.7.12" 646 | } 647 | }, 648 | "nbformat": 4, 649 | "nbformat_minor": 2 650 | } 651 | -------------------------------------------------------------------------------- /ch_1_rl_intro/.ipynb_checkpoints/readme-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": 
"markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction to Reinforcement Learning" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Reference: Chapter 1, Sutton and Barto" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Contents:" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "1) **RL: Formal Definition**\n", 29 | "* Recent Examples\n", 30 | "\n", 31 | "2) **RL vs Supervised Learning vs Unsupervised learning**\n", 32 | "\n", 33 | "3) **Important RL Perspectives**\n", 34 | "* Goal (Reward Hypothesis)\n", 35 | "* Sequential Decision Making Problem\n", 36 | "* Interaction between Agent and Environment\n", 37 | "\n", 38 | "4) **Components of RL Agent**\n", 39 | "* Policy\n", 40 | "* Value Function\n", 41 | "* Model\n", 42 | "\n", 43 | "5) **RL Problems: Learning and Planning**\n", 44 | "\n", 45 | "6) **Exploration vs Exploitation**\n", 46 | "\n", 47 | "7) **Prediction and Control**\n" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "## Summary" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "* We got introduced to the basic terminologies of RL.\n", 62 | "\n", 63 | "\n", 64 | "* We saw how Reinforcement learning is different from other forms of learning.\n", 65 | "\n" 66 | ] 67 | } 68 | ], 69 | "metadata": { 70 | "kernelspec": { 71 | "display_name": "Python 2", 72 | "language": "python", 73 | "name": "python2" 74 | }, 75 | "language_info": { 76 | "codemirror_mode": { 77 | "name": "ipython", 78 | "version": 2 79 | }, 80 | "file_extension": ".py", 81 | "mimetype": "text/x-python", 82 | "name": "python", 83 | "nbconvert_exporter": "python", 84 | "pygments_lexer": "ipython2", 85 | "version": "2.7.12" 86 | } 87 | }, 88 | "nbformat": 4, 89 | "nbformat_minor": 2 90 | } 91 | -------------------------------------------------------------------------------- /ch_1_rl_intro/RL_1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "#
Introduction to Reinforcement Learning
\n", 12 | "\n", 13 | "###
Reference: Chapter 1, Sutton and Barto
\n" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "slideshow": { 20 | "slide_type": "slide" 21 | } 22 | }, 23 | "source": [ 24 | "#
Contents
\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "slideshow": { 31 | "slide_type": "fragment" 32 | } 33 | }, 34 | "source": [ 35 | "* RL: Formal Definition" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": { 41 | "slideshow": { 42 | "slide_type": "fragment" 43 | } 44 | }, 45 | "source": [ 46 | "* RL vs Supervised Learning vs Unsupervised learning" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": { 52 | "slideshow": { 53 | "slide_type": "fragment" 54 | } 55 | }, 56 | "source": [ 57 | "* Important RL Perspectives\n", 58 | " * Goal (Reward Hypothesis)\n", 59 | " * Sequential Decision Making Problem\n", 60 | " * Interaction between Agent and Environment\n", 61 | " " 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": { 67 | "slideshow": { 68 | "slide_type": "fragment" 69 | } 70 | }, 71 | "source": [ 72 | "* Components of RL Agent" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": { 78 | "slideshow": { 79 | "slide_type": "fragment" 80 | } 81 | }, 82 | "source": [ 83 | "* RL Problems: Learning and Planning" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": { 89 | "slideshow": { 90 | "slide_type": "fragment" 91 | } 92 | }, 93 | "source": [ 94 | "* Prediction and Control" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": { 100 | "slideshow": { 101 | "slide_type": "slide" 102 | } 103 | }, 104 | "source": [ 105 | "#
RL: Formal Definition
" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": { 111 | "slideshow": { 112 | "slide_type": "subslide" 113 | } 114 | }, 115 | "source": [ 116 | " \"Reinforcement learning is the problem of getting an agent to act in the world so as to maximize its rewards. For example, consider teaching a dog a new trick: you cannot tell it what to do, but you can reward/punish it if it does the right/wrong thing. It has to figure out what it did that made it get the reward/punishment, which is known as the credit assignment problem.\"" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": { 122 | "slideshow": { 123 | "slide_type": "subslide" 124 | } 125 | }, 126 | "source": [ 127 | "
\"Example1\"/
" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": { 133 | "slideshow": { 134 | "slide_type": "subslide" 135 | } 136 | }, 137 | "source": [ 138 | "
\"Example2\"/
" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": { 144 | "slideshow": { 145 | "slide_type": "slide" 146 | } 147 | }, 148 | "source": [ 149 | "#
RL vs Supervised Learning vs Unsupervised Learning
" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": { 155 | "slideshow": { 156 | "slide_type": "subslide" 157 | } 158 | }, 159 | "source": [ 160 | "## Supervised Learning\n", 161 | "\n", 162 | "1) A human builds a classifier based on input and output data\n", 163 | "\n", 164 | "2) That classifier is trained with a training set of data\n", 165 | "\n", 166 | "3) That classifier is tested with a test set of data\n", 167 | "\n", 168 | "4) Deployment if the output is satisfactory\n", 169 | "\n", 170 | "To be used when, \"I know how to classify this data, I just need you(the classifier) to sort it.\"\n", 171 | "\n", 172 | "Point of method: To class labels or to produce real numbers" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": { 178 | "slideshow": { 179 | "slide_type": "subslide" 180 | } 181 | }, 182 | "source": [ 183 | "## Unsupervised Learning\n", 184 | "\n", 185 | "\n", 186 | "1) A human builds an algorithm based on input data\n", 187 | "\n", 188 | "2) That algorithm is tested with a test set of data (in which the algorithm creates the classifier)\n", 189 | "\n", 190 | "3) Deployment if the classifier is satisfactory\n", 191 | "\n", 192 | "To be used when, \"I have no idea how to classify this data, can you(the algorithm) create a classifier for me?\"\n", 193 | "Point of method: To class labels or to predict" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": { 199 | "slideshow": { 200 | "slide_type": "subslide" 201 | } 202 | }, 203 | "source": [ 204 | "## Reinforcement Learning\n", 205 | "\n", 206 | "\n", 207 | "1) A human builds an algorithm based on input data\n", 208 | "\n", 209 | "2) That algorithm presents a state dependent on the input data in which a user rewards or punishes the algorithm via the action the algorithm took, this continues over time\n", 210 | "\n", 211 | "3) That algorithm learns from the reward/punishment and updates itself, this continues\n", 212 | "\n", 213 | "4) It's always in production, it needs to learn real data to be able to present actions from states\n", 214 | "\n", 215 | "To be used when, \"I have no idea how to classify this data, can you classify this data and I'll give you a reward if it's correct or I'll punish you if it's not.\"\n" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": { 221 | "slideshow": { 222 | "slide_type": "subslide" 223 | } 224 | }, 225 | "source": [ 226 | "### RL vs Supervised Learning\n", 227 | "* Training Examples\n", 228 | " * Supervised Learning: Training Examples of the formNo training examples from a knowledgeable external supervisor (situation together with a label).\n", 229 | " * RL: No such training examples.\n", 230 | "* Objective Functions\n", 231 | " * Supervised Learning: Aim is to extrapolate, or generalize so that it acts correctly in situations not present in the training set. \n", 232 | " * In RL, it is often impractical to obtain examples of desired behavior that are both correct and representative of all the situations and an agent must be able to learn from its own experience." 
233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": { 238 | "slideshow": { 239 | "slide_type": "subslide" 240 | } 241 | }, 242 | "source": [ 243 | "### RL vs Unsupervised Learning\n", 244 | "* Unsupervised Learning is about finding structure hidden in collections of unlabeled data.\n", 245 | "* Uncovering structure in an agent’s experience can certainly be useful in reinforcement learning, but by itself does not address the reinforcement learning agent’s problem of maximizing a reward signal." 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": { 251 | "slideshow": { 252 | "slide_type": "subslide" 253 | } 254 | }, 255 | "source": [ 256 | "### Examples of RL\n", 257 | "* Fly stunt manoeuvres in a helicopter\n", 258 | "* Defeat the world champion at Backgammon\n", 259 | "* Manage an investment portfolio\n", 260 | "* Control a power station\n", 261 | "* Make a humanoid robot walk\n", 262 | "* Play many different Atari games better than humans" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": { 268 | "slideshow": { 269 | "slide_type": "slide" 270 | } 271 | }, 272 | "source": [ 273 | "#
Important RL Perspectives
" 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": { 279 | "slideshow": { 280 | "slide_type": "slide" 281 | } 282 | }, 283 | "source": [ 284 | "## Goal of RL (Reward Hypothesis)\n", 285 | "
\"RewardHypothesis\"
\n" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": { 291 | "slideshow": { 292 | "slide_type": "subslide" 293 | } 294 | }, 295 | "source": [ 296 | "## Reward Examples\n", 297 | "* stunt manoeuvres in a helicopter\n", 298 | " * +ve reward for following desired trajectory\n", 299 | " * −ve reward for crashing\n", 300 | "* Defeat the world champion at Backgammon\n", 301 | " * +/−ve reward for winning/losing a game\n", 302 | "* Manage an investment portfolio\n", 303 | " * +ve reward for each dollar in bank\n", 304 | "* Control a power station\n", 305 | " * +ve reward for producing power\n", 306 | " * −ve reward for exceeding safety thresholds\n", 307 | "* Make a humanoid robot walk\n", 308 | " * +ve reward for forward motion\n", 309 | " * −ve reward for falling over\n", 310 | "* Play many different Atari games better than humans\n", 311 | " * +/−ve reward for increasing/decreasing score" 312 | ] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": { 317 | "slideshow": { 318 | "slide_type": "slide" 319 | } 320 | }, 321 | "source": [ 322 | "## Sequential Decision Making Problem\n", 323 | "* Goal: select actions to maximise total future reward\n", 324 | "* Actions may have long term consequences\n", 325 | "* Reward may be delayed\n", 326 | "* It may be better to sacrifice immediate reward to gain more long-term reward\n", 327 | "* Examples:\n", 328 | " * A financial investment (may take months to mature)\n", 329 | " * Refuelling a helicopter (might prevent a crash in several hours)\n", 330 | " * Blocking opponent moves (might help winning chances many moves from now)\n" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": { 336 | "slideshow": { 337 | "slide_type": "slide" 338 | } 339 | }, 340 | "source": [ 341 | "## Interaction between Agent and Environment\n", 342 | "
\"RewardHypothesis\"
" 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "metadata": { 348 | "slideshow": { 349 | "slide_type": "subslide" 350 | } 351 | }, 352 | "source": [ 353 | "## Interaction between Agent and Environment\n", 354 | "
\"RewardHypothesis\"
" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": { 360 | "slideshow": { 361 | "slide_type": "subslide" 362 | } 363 | }, 364 | "source": [ 365 | "## History and State\n", 366 | "
\"HistoryandState\"
" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": { 372 | "slideshow": { 373 | "slide_type": "subslide" 374 | } 375 | }, 376 | "source": [ 377 | "## Environment State\n", 378 | "
\"HistoryandState\"
" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": { 384 | "slideshow": { 385 | "slide_type": "subslide" 386 | } 387 | }, 388 | "source": [ 389 | "## Agent State\n", 390 | "
\"HistoryandState\"
" 391 | ] 392 | }, 393 | { 394 | "cell_type": "markdown", 395 | "metadata": { 396 | "slideshow": { 397 | "slide_type": "subslide" 398 | } 399 | }, 400 | "source": [ 401 | "## Information State\n", 402 | "
\"HistoryandState\"
" 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": { 408 | "slideshow": { 409 | "slide_type": "subslide" 410 | } 411 | }, 412 | "source": [ 413 | "## Fully Observable Environment\n", 414 | "
\"HistoryandState\"
" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "metadata": { 420 | "slideshow": { 421 | "slide_type": "subslide" 422 | } 423 | }, 424 | "source": [ 425 | "## Partially Observable Environment\n", 426 | "
\"HistoryandState\"
" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "metadata": { 432 | "slideshow": { 433 | "slide_type": "slide" 434 | } 435 | }, 436 | "source": [ 437 | "## Major Components of a RL Agent\n", 438 | "An RL agent may include one or more of these components:\n", 439 | "* **Policy**: agent’s behaviour function\n", 440 | "* **Value function**: how good is each state and/or action\n", 441 | "* **Model**: agent’s representation of the environment" 442 | ] 443 | }, 444 | { 445 | "cell_type": "markdown", 446 | "metadata": { 447 | "slideshow": { 448 | "slide_type": "subslide" 449 | } 450 | }, 451 | "source": [ 452 | "## Policy\n", 453 | "* A policy is the agent’s behaviour\n", 454 | "* It is a map from state to action, e.g.\n", 455 | "* Deterministic policy: $a = π(s)$\n", 456 | "* Stochastic policy: $π(a|s) = P[A_t = a|S_t = s]$" 457 | ] 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "metadata": { 462 | "slideshow": { 463 | "slide_type": "subslide" 464 | } 465 | }, 466 | "source": [ 467 | "## Value function\n", 468 | "* Value function is a prediction of future reward\n", 469 | "* Used to evaluate the goodness/badness of states\n", 470 | "* And therefore to select between actions, e.g.\n", 471 | "$$v_π(s) = E_π[R _{t+1} + γ*R_{t+2} + γ^{2}*R_{t+3} + ... | S_t = s]$$" 472 | ] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": { 477 | "slideshow": { 478 | "slide_type": "subslide" 479 | } 480 | }, 481 | "source": [ 482 | "## Model\n", 483 | "* A model predicts what the environment will do next\n", 484 | "* **Transitions**: P predicts the next state (i.e. dynamics)\n", 485 | "* **Rewards**: R predicts the next (immediate) reward, e.g.\n", 486 | "$$ P_{ss'} = P[S_{t+1} = s' | S_t = s, A_t = a]$$\n", 487 | "$$ R^{a}_{s} = E[R_{t+1} | S_t = s, A_t = a]$$" 488 | ] 489 | }, 490 | { 491 | "cell_type": "markdown", 492 | "metadata": { 493 | "slideshow": { 494 | "slide_type": "subslide" 495 | } 496 | }, 497 | "source": [ 498 | "## Maze Example\n", 499 | "
\"HistoryandState\"
\n" 500 | ] 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "metadata": { 505 | "slideshow": { 506 | "slide_type": "subslide" 507 | } 508 | }, 509 | "source": [ 510 | "# Maze Example: Policy\n", 511 | "
\"HistoryandState\"
\n", 512 | "\n", 513 | "* Arrows represent policy $\\pi(s)$ for each state s. " 514 | ] 515 | }, 516 | { 517 | "cell_type": "markdown", 518 | "metadata": { 519 | "slideshow": { 520 | "slide_type": "subslide" 521 | } 522 | }, 523 | "source": [ 524 | "# Maze Example: Value Function\n", 525 | "
\"HistoryandState\"
" 526 | ] 527 | }, 528 | { 529 | "cell_type": "markdown", 530 | "metadata": { 531 | "slideshow": { 532 | "slide_type": "subslide" 533 | } 534 | }, 535 | "source": [ 536 | "# Maze Example: Model\n", 537 | "
\"HistoryandState\"
" 538 | ] 539 | }, 540 | { 541 | "cell_type": "markdown", 542 | "metadata": { 543 | "slideshow": { 544 | "slide_type": "slide" 545 | } 546 | }, 547 | "source": [ 548 | "## Learning and Planning\n", 549 | "Two fundamental problems in sequential decision making:\n", 550 | "* Reinforcement Learning:\n", 551 | " * The environment is initially unknown\n", 552 | " * The agent interacts with the environment\n", 553 | " * The agent improves its policy\n", 554 | "* Planning:\n", 555 | " * A model of the environment is known\n", 556 | " * The agent performs computations with its model (without any external interaction)\n", 557 | " * The agent improves its policy a.k.a. deliberation, reasoning, introspection, pondering, thought, search" 558 | ] 559 | }, 560 | { 561 | "cell_type": "markdown", 562 | "metadata": { 563 | "slideshow": { 564 | "slide_type": "slide" 565 | } 566 | }, 567 | "source": [ 568 | "## Exploration and Exploitation\n", 569 | "\n", 570 | "* Reinforcement learning is like trial-and-error learning\n", 571 | "\n", 572 | "\n", 573 | "* The agent should discover a good policy,\n", 574 | " * From its experiences of the environment,\n", 575 | " * Without losing too much reward along the way\n", 576 | "\n", 577 | "\n", 578 | "* **Exploration** finds more information about the environment\n", 579 | "\n", 580 | "\n", 581 | "* **Exploitation** exploits known information to maximise reward\n", 582 | "\n", 583 | "\n", 584 | "* It is usually important to explore as well as exploit (In Detail => Chapter 2)." 585 | ] 586 | }, 587 | { 588 | "cell_type": "markdown", 589 | "metadata": { 590 | "slideshow": { 591 | "slide_type": "subslide" 592 | } 593 | }, 594 | "source": [ 595 | "## Examples of Explortion and Exploitation\n", 596 | "\n", 597 | "* Restaurant Selection\n", 598 | " * Exploitation Go to your favourite restaurant\n", 599 | " * Exploration Try a new restaurant\n", 600 | "* Online Banner Advertisements\n", 601 | " * Exploitation Show the most successful advert\n", 602 | " * Exploration Show a different advert\n", 603 | "* Oil Drilling\n", 604 | " * Exploitation Drill at the best known location\n", 605 | " * Exploration Drill at a new location\n", 606 | "* Game Playing\n", 607 | " * Exploitation Play the move you believe is best\n", 608 | " * Exploration Play an experimental move" 609 | ] 610 | }, 611 | { 612 | "cell_type": "markdown", 613 | "metadata": { 614 | "slideshow": { 615 | "slide_type": "slide" 616 | } 617 | }, 618 | "source": [ 619 | "## Summary\n", 620 | "\n", 621 | "* We got introduced to the basic terminologies of RL.\n", 622 | "\n", 623 | "\n", 624 | "* We got an intuition behind how an RL agent can solve problems.\n", 625 | "\n" 626 | ] 627 | } 628 | ], 629 | "metadata": { 630 | "celltoolbar": "Slideshow", 631 | "kernelspec": { 632 | "display_name": "Python 2", 633 | "language": "python", 634 | "name": "python2" 635 | }, 636 | "language_info": { 637 | "codemirror_mode": { 638 | "name": "ipython", 639 | "version": 2 640 | }, 641 | "file_extension": ".py", 642 | "mimetype": "text/x-python", 643 | "name": "python", 644 | "nbconvert_exporter": "python", 645 | "pygments_lexer": "ipython2", 646 | "version": "2.7.12" 647 | } 648 | }, 649 | "nbformat": 4, 650 | "nbformat_minor": 2 651 | } 652 | -------------------------------------------------------------------------------- /ch_1_rl_intro/img/1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_1_rl_intro/img/1.png -------------------------------------------------------------------------------- /ch_1_rl_intro/img/10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_1_rl_intro/img/10.png -------------------------------------------------------------------------------- /ch_1_rl_intro/img/11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_1_rl_intro/img/11.png -------------------------------------------------------------------------------- /ch_1_rl_intro/img/12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_1_rl_intro/img/12.png -------------------------------------------------------------------------------- /ch_1_rl_intro/img/13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_1_rl_intro/img/13.png -------------------------------------------------------------------------------- /ch_1_rl_intro/img/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_1_rl_intro/img/2.png -------------------------------------------------------------------------------- /ch_1_rl_intro/img/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_1_rl_intro/img/3.png -------------------------------------------------------------------------------- /ch_1_rl_intro/img/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_1_rl_intro/img/6.png -------------------------------------------------------------------------------- /ch_1_rl_intro/img/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_1_rl_intro/img/7.png -------------------------------------------------------------------------------- /ch_1_rl_intro/img/8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_1_rl_intro/img/8.png -------------------------------------------------------------------------------- /ch_1_rl_intro/img/9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_1_rl_intro/img/9.png -------------------------------------------------------------------------------- /ch_1_rl_intro/img/e1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_1_rl_intro/img/e1.png 
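Editorial aside, placed between this chapter's image assets: the agent-environment loop that the ch_1 figures depict (the agent observes a state, selects an action, and the environment returns a reward and the next state) fits in a few lines. The toy environment, its states, and its rewards below are hypothetical and not part of the repository.

```python
# Illustrative only: one episode of the agent-environment interaction loop.
# A toy chain environment with states 0..3; reaching state 3 ends the
# episode with reward +1. The random agent stands in for a learned policy.
import random

def env_step(state, action):
    next_state = min(state + 1, 3) if action == "right" else max(state - 1, 0)
    reward = 1.0 if next_state == 3 else 0.0
    done = next_state == 3
    return next_state, reward, done

def agent_policy(state):
    return random.choice(["left", "right"])   # uniformly random behaviour

state, episode_return, done = 0, 0.0, False
while not done:
    action = agent_policy(state)
    state, reward, done = env_step(state, action)
    episode_return += reward
print("return of this episode:", episode_return)
```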
-------------------------------------------------------------------------------- /ch_1_rl_intro/img/e2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_1_rl_intro/img/e2.png -------------------------------------------------------------------------------- /ch_1_rl_intro/img/e3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_1_rl_intro/img/e3.png -------------------------------------------------------------------------------- /ch_1_rl_intro/img/e4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_1_rl_intro/img/e4.png -------------------------------------------------------------------------------- /ch_1_rl_intro/readme.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction to Reinforcement Learning" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Reference: Chapter 1, Sutton and Barto" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Contents:" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "1) **RL: Formal Definition**\n", 29 | "* Recent Examples\n", 30 | "\n", 31 | "2) **RL vs Supervised Learning vs Unsupervised learning**\n", 32 | "\n", 33 | "3) **Important RL Perspectives**\n", 34 | "* Goal (Reward Hypothesis)\n", 35 | "* Sequential Decision Making Problem\n", 36 | "* Interaction between Agent and Environment\n", 37 | "\n", 38 | "4) **Components of RL Agent**\n", 39 | "* Policy\n", 40 | "* Value Function\n", 41 | "* Model\n", 42 | "\n", 43 | "5) **RL Problems: Learning and Planning**\n", 44 | "\n", 45 | "6) **Exploration vs Exploitation**\n", 46 | "\n", 47 | "7) **Prediction and Control**\n" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "## Summary" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "* We got introduced to the basic terminologies of RL.\n", 62 | "\n", 63 | "\n", 64 | "* We saw how Reinforcement learning is different from other forms of learning.\n", 65 | "\n" 66 | ] 67 | } 68 | ], 69 | "metadata": { 70 | "kernelspec": { 71 | "display_name": "Python 2", 72 | "language": "python", 73 | "name": "python2" 74 | }, 75 | "language_info": { 76 | "codemirror_mode": { 77 | "name": "ipython", 78 | "version": 2 79 | }, 80 | "file_extension": ".py", 81 | "mimetype": "text/x-python", 82 | "name": "python", 83 | "nbconvert_exporter": "python", 84 | "pygments_lexer": "ipython2", 85 | "version": "2.7.12" 86 | } 87 | }, 88 | "nbformat": 4, 89 | "nbformat_minor": 2 90 | } 91 | -------------------------------------------------------------------------------- /ch_1_rl_intro/readme.md: -------------------------------------------------------------------------------- 1 | 2 | # Introduction to Reinforcement Learning 3 | 4 | ### Reference: Chapter 1, Sutton and Barto 5 | 6 | ## Contents: 7 | 8 | 1) **RL: Formal Definition** 9 | * Recent Examples 10 | 11 | 2) **RL vs Supervised Learning vs Unsupervised learning** 12 
| 13 | 3) **Important RL Perspectives** 14 | * Goal (Reward Hypothesis) 15 | * Sequential Decision Making Problem 16 | * Interaction between Agent and Environment 17 | 18 | 4) **Components of RL Agent** 19 | * Policy 20 | * Value Function 21 | * Model 22 | 23 | 5) **RL Problems: Learning and Planning** 24 | 25 | 6) **Exploration vs Exploitation** 26 | 27 | 7) **Prediction and Control** 28 | 29 | -------------------------------------------------------------------------------- /ch_2_rl_in_non_associative/img/UCB.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_2_rl_in_non_associative/img/UCB.JPG -------------------------------------------------------------------------------- /ch_2_rl_in_non_associative/img/com_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_2_rl_in_non_associative/img/com_2.jpg -------------------------------------------------------------------------------- /ch_2_rl_in_non_associative/img/grad.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_2_rl_in_non_associative/img/grad.jpg -------------------------------------------------------------------------------- /ch_2_rl_in_non_associative/img/greedyvs.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_2_rl_in_non_associative/img/greedyvs.jpg -------------------------------------------------------------------------------- /ch_2_rl_in_non_associative/img/mistake.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_2_rl_in_non_associative/img/mistake.JPG -------------------------------------------------------------------------------- /ch_2_rl_in_non_associative/img/mistake_1.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_2_rl_in_non_associative/img/mistake_1.JPG -------------------------------------------------------------------------------- /ch_2_rl_in_non_associative/img/mistake_2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_2_rl_in_non_associative/img/mistake_2.JPG -------------------------------------------------------------------------------- /ch_2_rl_in_non_associative/img/multiarmedbandit.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_2_rl_in_non_associative/img/multiarmedbandit.jpg -------------------------------------------------------------------------------- /ch_2_rl_in_non_associative/img/optinit.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_2_rl_in_non_associative/img/optinit.JPG 
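Editorial aside: the ch_2 figures listed here (greedy vs ε-greedy, optimistic initial values, the testbed) and the chapter readme that follows cover action-value methods for the k-armed bandit. Below is a minimal sketch of ε-greedy action selection with incremental sample-average estimates; the arm means, ε, and step count are made up, and the code is illustrative rather than taken from the repository.

```python
# Illustrative only: epsilon-greedy action selection with sample-average
# value estimates Q(a), updated incrementally as Q(a) += (R - Q(a)) / N(a).
import random

def run_bandit(true_means, epsilon=0.1, steps=1000):
    k = len(true_means)
    q = [0.0] * k                     # estimated action values Q(a)
    n = [0] * k                       # pull counts N(a)
    total_reward = 0.0
    for _ in range(steps):
        if random.random() < epsilon:
            a = random.randrange(k)                    # explore
        else:
            a = max(range(k), key=lambda i: q[i])      # exploit (greedy)
        reward = random.gauss(true_means[a], 1.0)      # noisy reward
        n[a] += 1
        q[a] += (reward - q[a]) / n[a]                 # incremental average
        total_reward += reward
    return q, total_reward / steps

estimates, avg_reward = run_bandit([0.2, 0.8, 0.5])
print(estimates, avg_reward)   # Q should roughly track the true means
```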
-------------------------------------------------------------------------------- /ch_2_rl_in_non_associative/img/testbed.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_2_rl_in_non_associative/img/testbed.JPG -------------------------------------------------------------------------------- /ch_2_rl_in_non_associative/readme.md: -------------------------------------------------------------------------------- 1 | #
Reinforcement Learning in the Non-Associative Setting
2 | 3 | ####
Reference: Chapter 2, Sutton and Barto
4 | 5 | ## Contents: 6 | 7 | 1) **Introduction** 8 | * Non-Associative Setting? 9 | * Examples? 10 | 11 | 2) **Multi-arm Bandit Problems** 12 | * k-armed bandit problem 13 | * expected reward 14 | * exploration vs exploitation 15 | 16 | 3) **Action Value Methods** 17 | * Sample Average method 18 | * greedy and $\epsilon$ greedy 19 | * The test-bed 20 | * Non-stationary problems 21 | 22 | 4) **Improving Exploration in Simple Bandit Problem** 23 | * Optimistic Initial Values 24 | * Upper-Confidence Bound Action Selection 25 | * Gradient Bandit Algorithm 26 | -------------------------------------------------------------------------------- /ch_2_rl_in_non_associative/references.md: -------------------------------------------------------------------------------- 1 | # Bibliography 2 | 3 | 1) Non-associative Learning: [wiki](https://www.wikiwand.com/en/Learning),[better than wiki](https://www.britannica.com/topic/animal-learning/Types-of-learning#ref320590). 4 | 5 | 2) Multi-arm Bandit Problem: [wiki](https://www.wikiwand.com/en/Multi-armed_bandit),[better than wiki](http://blog.thedataincubator.com/2016/07/multi-armed-bandits-2/). 6 | 7 | 3) Action Value methods:[Paper Comparing various methods(also evolutionary solutions)](https://link.springer.com/article/10.1007/s12351-008-0007-5). 8 | 9 | 4) Exploration vs exploitation:[David Silver's Slides(Includes formuation of regret)](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/XX.pdf) 10 | 11 | 5) More Bandit Algorithms:[Book/wesite by Csaba Szepesvari](http://banditalgs.com/) 12 | 13 | 6) Real life examples of MAB: [practo](https://www.youtube.com/watch?v=B1l_juzrw7Q),[Google Analytics A/B testing](https://support.google.com/analytics/answer/2844870?hl=en) 14 | 15 | 16 | ### Globally valuable content: 17 | 18 | 1) [Princeton Slides on Exploration & Exploitation in Reinforcement Learning](https://www.cs.princeton.edu/courses/archive/fall16/cos402/lectures/402-lec22.pdf). 19 | 20 | 2) [Michael Herrmann's Slides on Multi-arm Bandits(University of Edinburgh)](http://www.inf.ed.ac.uk/teaching/courses/rl/slides15/rl02.pdf). 21 | 22 | 3) [David Silver's Slides on Exploration and Exploitation](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/XX.pdf). 23 | 24 | 4) [The non-stochastic multi-armed bandit problem-Peter Auer](https://cseweb.ucsd.edu/~yfreund/papers/bandits.pdf). 25 | 26 | 5) [Chapter-2 of Reinforcement Learning, An Introduction](ufal.mff.cuni.cz/~straka/courses/npfl114/2016/sutton-bookdraft2016sep.pdf). -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/.ipynb_checkpoints/RL_3 - Copy-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "#
Finite Markov Decision Processes
\n", 12 | "\n", 13 | "###
Reference: Chapter 3, Sutton and Barto
" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "slideshow": { 20 | "slide_type": "slide" 21 | } 22 | }, 23 | "source": [ 24 | "#
Contents
\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "slideshow": { 31 | "slide_type": "fragment" 32 | } 33 | }, 34 | "source": [ 35 | "* Why MDPs?" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": { 41 | "slideshow": { 42 | "slide_type": "fragment" 43 | } 44 | }, 45 | "source": [ 46 | "* Markov Property" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": { 52 | "slideshow": { 53 | "slide_type": "fragment" 54 | } 55 | }, 56 | "source": [ 57 | "* Building Blocks of MDP\n", 58 | " * Episodic vs Continuous Tasks\n", 59 | " * State Transition Matrix\n", 60 | " * Return\n", 61 | " * Discount\n", 62 | " * Value Function\n", 63 | " " 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": { 69 | "slideshow": { 70 | "slide_type": "fragment" 71 | } 72 | }, 73 | "source": [ 74 | "* MDP Parameters\n", 75 | " * Policy in MDP notations\n", 76 | " * Value Functions in MDP notations" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": { 82 | "slideshow": { 83 | "slide_type": "fragment" 84 | } 85 | }, 86 | "source": [ 87 | "* Bellman Expectation Equations" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": { 93 | "slideshow": { 94 | "slide_type": "fragment" 95 | } 96 | }, 97 | "source": [ 98 | "* Bellman Optimal Equations\n" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": { 109 | "slideshow": { 110 | "slide_type": "slide" 111 | } 112 | }, 113 | "source": [ 114 | "##
The Agent-Environment Interface
\n", 115 | "\n", 116 | "\n", 117 | "
\"MarkovProperty
" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": { 123 | "slideshow": { 124 | "slide_type": "subslide" 125 | } 126 | }, 127 | "source": [ 128 | "* Markov decision processes formally **describe an environment** for reinforcement learning\n", 129 | "* Where the environment is **fully observable**\n", 130 | "* i.e. The **current state** completely characterises the process\n", 131 | "* Almost all RL problems can be formalised as MDPs, e.g.\n", 132 | " * Optimal control primarily deals with continuous MDPs\n", 133 | " * Partially observable problems can be converted into MDPs\n", 134 | " * Bandits are MDPs with one state" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": { 140 | "slideshow": { 141 | "slide_type": "slide" 142 | } 143 | }, 144 | "source": [ 145 | "#
Markov Property
" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": { 151 | "slideshow": { 152 | "slide_type": "subslide" 153 | } 154 | }, 155 | "source": [ 156 | "“The future is independent of the past given the present”\n", 157 | "
\"MarkovProperty
\n", 158 | "\n", 159 | "* The state captures all relevant information from the history\n", 160 | "* Once the state is known, the history may be thrown away\n", 161 | "* i.e. The state is a sufficient statistic of the future" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": { 167 | "slideshow": { 168 | "slide_type": "slide" 169 | } 170 | }, 171 | "source": [ 172 | "#
Building Blocks of MDP
" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": { 178 | "slideshow": { 179 | "slide_type": "subslide" 180 | } 181 | }, 182 | "source": [ 183 | "## Episodic vs Continuing Tasks\n", 184 | "\n", 185 | "### Episodic Tasks\n", 186 | "* Each episode ends in a special state called the terminal state, \n", 187 | "* Followed by a reset to a standard starting state or to a sample from a standard distribution of starting states. \n", 188 | "\n", 189 | "### Continuing Tasks\n", 190 | "\n", 191 | "* The agent–environment interaction does not break naturally into identifiable episodes.\n", 192 | "* It goes on continually without limit. " 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": { 198 | "slideshow": { 199 | "slide_type": "subslide" 200 | } 201 | }, 202 | "source": [ 203 | "## Unified Notation for Episodic and Continuous Tasks\n", 204 | "\n", 205 | "\n", 206 | "\n" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": { 212 | "slideshow": { 213 | "slide_type": "fragment" 214 | } 215 | }, 216 | "source": [ 217 | "#### Return for Episodic Tasks\n", 218 | "sum over a finite number of terms" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": { 224 | "slideshow": { 225 | "slide_type": "fragment" 226 | } 227 | }, 228 | "source": [ 229 | "#### Return for Continuous Tasks \n", 230 | "sum over an infinite number of terms" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": { 236 | "slideshow": { 237 | "slide_type": "fragment" 238 | } 239 | }, 240 | "source": [ 241 | "We need one convention to obtain a single notation that covers both episodic and continuing tasks.\n", 242 | "\n", 243 | "How to do that?" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": { 249 | "slideshow": { 250 | "slide_type": "subslide" 251 | } 252 | }, 253 | "source": [ 254 | "These can be unified by considering episode termination to be the entering\n", 255 | "of a **special absorbing state** that **transitions only to itself** and that **generates only\n", 256 | "rewards of zero**. For example, consider the state transition diagram -\n", 257 | "
\"Matrix\"
\n", 258 | "Hence, return can be written as-\n", 259 | "
\"Matrix\"
" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": { 265 | "slideshow": { 266 | "slide_type": "subslide" 267 | } 268 | }, 269 | "source": [ 270 | "## State Transition Matrix\n", 271 | "
\"Matrix\"
" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": { 277 | "slideshow": { 278 | "slide_type": "subslide" 279 | } 280 | }, 281 | "source": [ 282 | "## Return\n", 283 | "
\"Matrix\"
\n", 284 | "* The discount $γ ∈ [0, 1]$ is the present value of future rewards\n", 285 | "* The value of receiving reward R after k + 1 time-steps is $γ^k R$.\n", 286 | "* This values immediate reward above delayed reward.\n", 287 | " * $γ$ close to 0 leads to ”myopic” evaluation\n", 288 | " * $γ$ close to 1 leads to ”far-sighted” evaluation" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": { 294 | "slideshow": { 295 | "slide_type": "subslide" 296 | } 297 | }, 298 | "source": [ 299 | "## Discount \n", 300 | "\n", 301 | "Most Markov reward and decision processes are discounted. Why?\n", 302 | "* Mathematically convenient to discount rewards\n", 303 | "* Avoids infinite returns in cyclic Markov processes\n", 304 | "* Uncertainty about the future may not be fully represented\n", 305 | "* If the reward is financial, immediate rewards may earn more interest than delayed rewards\n", 306 | "* Animal/human behaviour shows preference for immediate reward\n", 307 | "* It is sometimes possible to use undiscounted Markov reward processes (i.e. $γ = 1$), e.g. if all sequences terminate." 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": { 313 | "slideshow": { 314 | "slide_type": "subslide" 315 | } 316 | }, 317 | "source": [ 318 | "## Value Function\n", 319 | "The value function $v(s)$ gives the long-term value of state s\n", 320 | "
\"Matrix\"
\n" 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "metadata": { 326 | "slideshow": { 327 | "slide_type": "slide" 328 | } 329 | }, 330 | "source": [ 331 | "#
MDP Parameters
" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": { 337 | "slideshow": { 338 | "slide_type": "subslide" 339 | } 340 | }, 341 | "source": [ 342 | "A Markov decision process (MDP) is a Markov reward process with decisions. It is an environment in which all states are Markov.\n", 343 | "\n", 344 | "
\"Matrix\"
" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": { 350 | "slideshow": { 351 | "slide_type": "subslide" 352 | } 353 | }, 354 | "source": [ 355 | "## Policy in MDP notation\n", 356 | "
\"Matrix\"
\n", 357 | "* A policy fully defines the behaviour of an agent\n", 358 | "* MDP policies depend on the current state (not the history)\n", 359 | "* i.e. Policies are **stationary** (time-independent),\n", 360 | " $A_t ∼ π(·|S_t ), \\forall t > 0$\n", 361 | " " 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": { 367 | "slideshow": { 368 | "slide_type": "subslide" 369 | } 370 | }, 371 | "source": [ 372 | "## Policy in MDP notation\n", 373 | "Given a MDP $M = \\left \\langle S, A, P, R, \\gamma \\right \\rangle$ and a policy $\\pi$\n", 374 | "\n", 375 | "$$P_{s,s'}^{\\pi} = \\sum_{a \\epsilon A} \\pi(a|s) P_{ss'}^{a}$$\n", 376 | "$$R_{s}^{\\pi} = \\sum_{a \\epsilon A} \\pi(a|s) R_{s}^{a}$$" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": { 382 | "slideshow": { 383 | "slide_type": "subslide" 384 | } 385 | }, 386 | "source": [ 387 | "## Example: Recycling Robot\n", 388 | "\"Matrix\"\n", 389 | "** Task: ** Collect Empty soda cans in office\n", 390 | " \n", 391 | "** Sensors: **\n", 392 | " \n", 393 | " 1) Detector : For detecting cans\n", 394 | " 2) Arm + Gripper : To pick up and place can in onboard bin\n", 395 | " \n", 396 | "** Actions: **\n", 397 | " \n", 398 | " 1) {Search} - Actively search for a can\n", 399 | " 2) {Wait} - Remain stationary and wait for someone to bring a can. (Will lose less battery)\n", 400 | " 3) {Recharge} - Head back home for recharging\n", 401 | " \n", 402 | "** States: **\n", 403 | " \n", 404 | " 1) high - Battery is charged considerably well\n", 405 | " 2) low - Battery is not charged\n", 406 | " \n", 407 | "** Rewards: ** \n", 408 | " \n", 409 | " 1) zero most of the time\n", 410 | " 2) become positive when the robot secures an empty can, \n", 411 | " 3) negative if the battery runs all the way down\n", 412 | " \n", 413 | "**
How can we formulate this as an MDP?
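One possible way to write the formulation down, loosely following the treatment of this example in Sutton and Barto; the transition probabilities `alpha`, `beta` and the reward values below are illustrative placeholders, not values given in these slides.

```python
# Hypothetical parameters: alpha/beta are the probabilities that the battery
# stays high/low while searching; reward numbers are placeholders.
alpha, beta = 0.8, 0.6
r_search, r_wait, r_rescue = 2.0, 1.0, -3.0

# MDP as {state: {action: [(prob, next_state, reward), ...]}}
recycling_mdp = {
    "high": {
        "search": [(alpha, "high", r_search), (1 - alpha, "low", r_search)],
        "wait":   [(1.0, "high", r_wait)],
    },
    "low": {
        "search":   [(beta, "low", r_search), (1 - beta, "high", r_rescue)],
        "wait":     [(1.0, "low", r_wait)],
        "recharge": [(1.0, "high", 0.0)],
    },
}

# Sanity check: outgoing probabilities sum to 1 for every (state, action) pair.
for s, actions in recycling_mdp.items():
    for a, outcomes in actions.items():
        assert abs(sum(p for p, _, _ in outcomes) - 1.0) < 1e-9
```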
**" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": { 419 | "slideshow": { 420 | "slide_type": "subslide" 421 | } 422 | }, 423 | "source": [ 424 | "## Value Function in MDP notation\n", 425 | "
\"Matrix\"
\n" 426 | ] 427 | }, 428 | { 429 | "cell_type": "markdown", 430 | "metadata": { 431 | "slideshow": { 432 | "slide_type": "slide" 433 | } 434 | }, 435 | "source": [ 436 | "#
Bellman Expectation Equation
" 437 | ] 438 | }, 439 | { 440 | "cell_type": "markdown", 441 | "metadata": { 442 | "slideshow": { 443 | "slide_type": "subslide" 444 | } 445 | }, 446 | "source": [ 447 | "## Bellman Expectation Equation\n", 448 | "
\"Matrix\"
" 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "metadata": { 454 | "slideshow": { 455 | "slide_type": "subslide" 456 | } 457 | }, 458 | "source": [ 459 | "## Bellman Expectation Equation for $V^\\pi$\n", 460 | "
\"Matrix\"
" 461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "metadata": { 466 | "slideshow": { 467 | "slide_type": "subslide" 468 | } 469 | }, 470 | "source": [ 471 | "## Bellman Expectation Equation for $Q^\\pi$\n", 472 | "
\"Matrix\"
" 473 | ] 474 | }, 475 | { 476 | "cell_type": "markdown", 477 | "metadata": { 478 | "slideshow": { 479 | "slide_type": "subslide" 480 | } 481 | }, 482 | "source": [ 483 | "## Bellman Expectation Equation for $v_\\pi$\n", 484 | "
\"Matrix\"
" 485 | ] 486 | }, 487 | { 488 | "cell_type": "markdown", 489 | "metadata": { 490 | "slideshow": { 491 | "slide_type": "subslide" 492 | } 493 | }, 494 | "source": [ 495 | "## Bellman Expectation Equation for $q_\\pi$\n", 496 | "
\"Matrix\"
" 497 | ] 498 | }, 499 | { 500 | "cell_type": "markdown", 501 | "metadata": { 502 | "slideshow": { 503 | "slide_type": "slide" 504 | } 505 | }, 506 | "source": [ 507 | "#
Bellman Optimality Equation
" 508 | ] 509 | }, 510 | { 511 | "cell_type": "markdown", 512 | "metadata": { 513 | "slideshow": { 514 | "slide_type": "subslide" 515 | } 516 | }, 517 | "source": [ 518 | "## Optimal Value Function\n", 519 | "
\"Matrix\"
" 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "metadata": { 525 | "slideshow": { 526 | "slide_type": "subslide" 527 | } 528 | }, 529 | "source": [ 530 | "## Optimal Policy\n", 531 | "
\"Matrix\"
" 532 | ] 533 | }, 534 | { 535 | "cell_type": "markdown", 536 | "metadata": { 537 | "slideshow": { 538 | "slide_type": "subslide" 539 | } 540 | }, 541 | "source": [ 542 | "## Finding an Optimal Policy\n", 543 | "
\"Matrix\"
" 544 | ] 545 | }, 546 | { 547 | "cell_type": "markdown", 548 | "metadata": { 549 | "slideshow": { 550 | "slide_type": "subslide" 551 | } 552 | }, 553 | "source": [ 554 | "## Bellman Optimality Equation for $v_{*}$\n", 555 | "
\"Matrix\"
" 556 | ] 557 | }, 558 | { 559 | "cell_type": "markdown", 560 | "metadata": { 561 | "slideshow": { 562 | "slide_type": "subslide" 563 | } 564 | }, 565 | "source": [ 566 | "## Bellman Optimality Equation for $Q_{*}$\n", 567 | "
\"Matrix\"
" 568 | ] 569 | }, 570 | { 571 | "cell_type": "markdown", 572 | "metadata": { 573 | "slideshow": { 574 | "slide_type": "subslide" 575 | } 576 | }, 577 | "source": [ 578 | "## Bellman Optimality Equatin for $V^{*}$\n", 579 | "
\"Matrix\"
" 580 | ] 581 | }, 582 | { 583 | "cell_type": "markdown", 584 | "metadata": { 585 | "slideshow": { 586 | "slide_type": "subslide" 587 | } 588 | }, 589 | "source": [ 590 | "## Bellman Optimality Equation for $Q^{*}$\n", 591 | "
\"Matrix\"
" 592 | ] 593 | }, 594 | { 595 | "cell_type": "markdown", 596 | "metadata": { 597 | "slideshow": { 598 | "slide_type": "slide" 599 | } 600 | }, 601 | "source": [ 602 | "## Summary\n", 603 | "* We looked into the MDP formulation of a RL problem.\n", 604 | "* We looked into the formulation of Value functions.\n", 605 | " * action-value pairs\n", 606 | " * state-action pairs\n", 607 | "* Understood the motivation and necessity of Bellman Expectation Equations and Bellman Optimlality Equations" 608 | ] 609 | } 610 | ], 611 | "metadata": { 612 | "celltoolbar": "Slideshow", 613 | "kernelspec": { 614 | "display_name": "Python [conda root]", 615 | "language": "python", 616 | "name": "conda-root-py" 617 | }, 618 | "language_info": { 619 | "codemirror_mode": { 620 | "name": "ipython", 621 | "version": 3 622 | }, 623 | "file_extension": ".py", 624 | "mimetype": "text/x-python", 625 | "name": "python", 626 | "nbconvert_exporter": "python", 627 | "pygments_lexer": "ipython3", 628 | "version": "3.5.2" 629 | }, 630 | "widgets": { 631 | "state": {}, 632 | "version": "1.1.2" 633 | } 634 | }, 635 | "nbformat": 4, 636 | "nbformat_minor": 2 637 | } 638 | -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/1.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/2.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/3.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/4.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/5.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/6.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/61.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/61.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/7.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/7.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/agent_env.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/agent_env.PNG -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/b1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/b1.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/b2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/b2.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/b3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/b3.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/b4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/b4.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/b5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/b5.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/o.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/o.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/o1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/o1.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/o2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/o2.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/op1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/op1.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/op2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/op2.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/op3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/op3.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/op4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/op4.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/pic1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/pic1.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/pic2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/pic2.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/return.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/return.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/robot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/robot.jpg -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/slides_1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/slides_1.PNG -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/slides_10.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/slides_10.PNG -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/slides_2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/slides_2.PNG -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/slides_3.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/slides_3.PNG 
-------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/slides_4.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/slides_4.PNG -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/slides_5.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/slides_5.PNG -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/slides_6.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/slides_6.PNG -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/slides_7.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/slides_7.PNG -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/slides_8.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/slides_8.PNG -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/slides_9.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/slides_9.PNG -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/img/unified.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_3_rl_finite_mdp/img/unified.png -------------------------------------------------------------------------------- /ch_3_rl_finite_mdp/readme.md: -------------------------------------------------------------------------------- 1 | 2 | # Dynamic programming Methods 3 | 4 | ### Reference: Chapter 3, Sutton and Barto 5 | 6 | ## Contents: 7 | 8 | 1) **Introduction** 9 | 10 | 2) **Building Blocks of MDP** 11 | * Markov Property 12 | * State Transition Matrix 13 | * Return 14 | * Discount 15 | * Value Function 16 | 17 | 3) **MDP Parameters** 18 | * Policy in MDP Notations 19 | * Value Functions in MDP notations 20 | 21 | 4) **Bellman Expectation Equations** 22 | 23 | 5) **Bellman Optimality Equations** 24 | -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/.ipynb_checkpoints/readme-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 2 6 | } 7 | -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/a.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/a.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/aaa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/aaa.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/aaaa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/aaaa.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/async_1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/async_1.PNG -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/async_2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/async_2.PNG -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/async_3.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/async_3.PNG -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/async_ex_1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/async_ex_1.PNG -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/async_ex_2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/async_ex_2.PNG -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/async_ex_3.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/async_ex_3.PNG -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/async_ex_4.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/async_ex_4.PNG -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/b.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/b.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/b1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/b1.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/b2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/b2.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/contr_1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/contr_1.PNG -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/contr_2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/contr_2.PNG -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/contr_3.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/contr_3.PNG -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/d.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/dp_ex_1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/dp_ex_1.PNG -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/e.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/e.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/e1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/e1.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/e2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/e2.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/e3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/e3.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/p1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/p1.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/p2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/p2.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/s1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/s1.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/s11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/s11.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/s2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/s2.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/s3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/s3.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/sa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/sa.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/sb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/sb.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/v1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/v1.png 
-------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/v2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/v2.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/v3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/v3.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/v4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/v4.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/img/v5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_4_rl_dynamic_programming/img/v5.png -------------------------------------------------------------------------------- /ch_4_rl_dynamic_programming/readme.md: -------------------------------------------------------------------------------- 1 | 2 | # Dynamic programming Methods 3 | 4 | ### Reference: Chapter 4, Sutton and Barto 5 | 6 | ## Contents: 7 | 8 | 1) **Recap:What is Dynamic Programming?** 9 | 10 | 2) **Planning by DP in MDP** 11 | * Iterative Policy Evaluation 12 | * Policy Improvement 13 | 14 | 3) **Example: Gridworld (Policy Evaluation and Policy Improvment)** 15 | 16 | 4) **Control** 17 | * Policy Iteration 18 | * Value Iteration 19 | 20 | 5) **Synchronous/Asynchronous Dynamic Programming Algorithms** 21 | 22 | 6) **Full-Width Backups/Sample Backups** 23 | * In-place Dynamic Programming 24 | * Prioritised Sweeping 25 | * Real-Time Dynamic Programming 26 | 27 | -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/.ipynb_checkpoints/readme-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 2 6 | } 7 | -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/1.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/2.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/3.png 
-------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/4.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/5.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/6.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/a1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/a1.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/a2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/a2.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/a3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/a3.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/a4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/a4.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/a5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/a5.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/a6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/a6.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/a7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/a7.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/c1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/c1.png 
-------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/c2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/c2.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/c3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/c3.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/c4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/c4.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/c5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/c5.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/c6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/c6.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/c7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/c7.png -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/imp_sam_1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/imp_sam_1.PNG -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/imp_sam_2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/imp_sam_2.PNG -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/imp_sam_3.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/imp_sam_3.PNG -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/imp_sam_4.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/imp_sam_4.PNG -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/img/imp_sam_5.PNG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_5_rl_mc_methods/img/imp_sam_5.PNG -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/readme.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Monte-Carlo Methods\n", 8 | "\n", 9 | "### Reference: Chapter 5, Sutton and Barto\n", 10 | "\n", 11 | "## Contents:\n", 12 | "\n", 13 | "1) **Monte Carlo Prediction**\n", 14 | "\n", 15 | "2) **Monte Carlo Estimation of Action Values**\n", 16 | "\n", 17 | "3) **Monte Carlo Control**\n", 18 | "\n", 19 | "4) **Monte Carlo Control without Exploring Starts**\n", 20 | "\n", 21 | "5) **Off-policy Prediction via Importance Sampling**\n", 22 | "\n", 23 | "6) **Incremental Implementation**\n", 24 | "\n", 25 | "7) **Off-Policy Monte Carlo Control**\n", 26 | "\n", 27 | "8) **Return-Specific Importance Sampling**\n", 28 | "\n", 29 | "\n", 30 | "\n", 31 | "## Summary\n", 32 | "\n", 33 | "* Monte Carlo Approach\n", 34 | "\n", 35 | "* On-Policy Prediction\n", 36 | "\n", 37 | "* On-Policy Control (w/o Assumption: Exploring Starts)\n", 38 | "\n", 39 | "* Off-Policy Prediction\n", 40 | "\n", 41 | "* Off-Policy Control\n" 42 | ] 43 | } 44 | ], 45 | "metadata": { 46 | "kernelspec": { 47 | "display_name": "Python 2", 48 | "language": "python", 49 | "name": "python2" 50 | }, 51 | "language_info": { 52 | "codemirror_mode": { 53 | "name": "ipython", 54 | "version": 2 55 | }, 56 | "file_extension": ".py", 57 | "mimetype": "text/x-python", 58 | "name": "python", 59 | "nbconvert_exporter": "python", 60 | "pygments_lexer": "ipython2", 61 | "version": "2.7.12" 62 | } 63 | }, 64 | "nbformat": 4, 65 | "nbformat_minor": 2 66 | } 67 | -------------------------------------------------------------------------------- /ch_5_rl_mc_methods/readme.md: -------------------------------------------------------------------------------- 1 | 2 | # Monte-Carlo Methods 3 | 4 | ### Reference: Chapter 5, Sutton and Barto 5 | 6 | ## Contents: 7 | 8 | 1) **Introduction** 9 | * Why move to MCMC from DP? 10 | 11 | 2) **Monte Carlo Prediction (On-Policy)** 12 | * Problem Definition 13 | * First-Visit Monte Carlo Policy Evaluation 14 | * Every-Visit Monte Carlo Evaluation 15 | * Non-Stationary Evaluation 16 | * Some Important Facts 17 | 18 | 3) **Monte Carlo Estimation of Action Values** 19 | * Why compute stat-action q(s,a) values, instead of v(s)? 
20 | 21 | 4) **Monte Carlo Control (On-Policy)** 22 | * Building up on Generalised Policy Iteration 23 | * Issues with the method 24 | * Assuming Infinite Number of Episodes 25 | * Problem of "maintaining exploration" 26 | * Dealing with the Issues 27 | 28 | 5) **On-Policy vs Off-Policy Methods** 29 | 30 | 6) **Off-Policy MC Predction** 31 | * Problem Definition 32 | * Importance Sampling 33 | 34 | 7) **Off-Policy MC Control** 35 | 36 | 7) **Off-Policy Monte Carlo Control** 37 | 38 | 8) **Return-Specific Importance Sampling** 39 | -------------------------------------------------------------------------------- /ch_6_td_methods/img/10_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_6_td_methods/img/10_2.png -------------------------------------------------------------------------------- /ch_6_td_methods/img/DPback.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_6_td_methods/img/DPback.JPG -------------------------------------------------------------------------------- /ch_6_td_methods/img/MCback.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_6_td_methods/img/MCback.JPG -------------------------------------------------------------------------------- /ch_6_td_methods/img/TDback.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_6_td_methods/img/TDback.JPG -------------------------------------------------------------------------------- /ch_6_td_methods/img/TDex1.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_6_td_methods/img/TDex1.JPG -------------------------------------------------------------------------------- /ch_6_td_methods/img/backup_q.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_6_td_methods/img/backup_q.JPG -------------------------------------------------------------------------------- /ch_6_td_methods/img/batch_td.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_6_td_methods/img/batch_td.PNG -------------------------------------------------------------------------------- /ch_6_td_methods/img/bootsam.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_6_td_methods/img/bootsam.JPG -------------------------------------------------------------------------------- /ch_6_td_methods/img/cliff.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_6_td_methods/img/cliff.jpg -------------------------------------------------------------------------------- 
/ch_6_td_methods/img/doubleq.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_6_td_methods/img/doubleq.JPG -------------------------------------------------------------------------------- /ch_6_td_methods/img/ex_sarsa.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_6_td_methods/img/ex_sarsa.JPG -------------------------------------------------------------------------------- /ch_6_td_methods/img/maxbias.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_6_td_methods/img/maxbias.JPG -------------------------------------------------------------------------------- /ch_6_td_methods/img/mcvstd.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_6_td_methods/img/mcvstd.PNG -------------------------------------------------------------------------------- /ch_6_td_methods/img/mcvstd_2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_6_td_methods/img/mcvstd_2.PNG -------------------------------------------------------------------------------- /ch_6_td_methods/img/qvssarsa.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_6_td_methods/img/qvssarsa.JPG -------------------------------------------------------------------------------- /ch_6_td_methods/img/tdex2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_6_td_methods/img/tdex2.JPG -------------------------------------------------------------------------------- /ch_6_td_methods/img/tdmarkov.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_6_td_methods/img/tdmarkov.jpg -------------------------------------------------------------------------------- /ch_6_td_methods/readme.md: -------------------------------------------------------------------------------- 1 | #
Temporal Difference Methods
2 | 3 | ####
Reference: Chapter 6, Sutton and Barto
4 | 5 | ## Contents: 6 | 7 | 1) **Introduction to TD methods** 8 | 9 | 2) **Properties of Temporal Difference Methods** 10 | * TD Update 11 | * TD Error 12 | * MC vs TD 13 | * Examples 14 | 15 | 3) **One-step, tabular, model-free TD methods** 16 | * SARSA 17 | * Q-Learning 18 | * Expected SARSA 19 | * Double Q-Learning 20 | -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/.ipynb_checkpoints/RL_7-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "#
Unifying MC Methods and TD Methods
\n", 12 | "##
Bootstrapping, TD($\\lambda$) and Eligibility Traces
\n", 13 | "###
Reference: Chapter 7 and Chapter 12, Sutton and Barto
" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "slideshow": { 20 | "slide_type": "slide" 21 | } 22 | }, 23 | "source": [ 24 | "## Contents\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "slideshow": { 31 | "slide_type": "fragment" 32 | } 33 | }, 34 | "source": [ 35 | "* Bootstrapping" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": { 41 | "slideshow": { 42 | "slide_type": "fragment" 43 | } 44 | }, 45 | "source": [ 46 | "* TD ($\\lambda$)" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": { 52 | "slideshow": { 53 | "slide_type": "fragment" 54 | } 55 | }, 56 | "source": [ 57 | "* Eligibility Traces" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": { 63 | "slideshow": { 64 | "slide_type": "slide" 65 | } 66 | }, 67 | "source": [ 68 | "##
Bootstrapping
" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "slideshow": { 75 | "slide_type": "subslide" 76 | } 77 | }, 78 | "source": [ 79 | "## Bootstrapping and Sampling\n", 80 | "
\"RewardHypothesis\"
" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": { 86 | "slideshow": { 87 | "slide_type": "subslide" 88 | } 89 | }, 90 | "source": [ 91 | "## Why do Bootstrapping?\n", 92 | "\n", 93 | "* Free from tyranny of the time step\n", 94 | "* Sometimes updates are required at every step of transition (one-step TD)\n", 95 | " * Take into account every possible transition/ anything that has changed\n", 96 | "* Sometimes, it makes sense to only update every few stansitions (multi-step TD)\n", 97 | " * Take into account significant/considerable changes" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": { 103 | "slideshow": { 104 | "slide_type": "subslide" 105 | } 106 | }, 107 | "source": [ 108 | "## n-Step Prediction\n", 109 | "
\"RewardHypothesis\"
" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": { 115 | "slideshow": { 116 | "slide_type": "subslide" 117 | } 118 | }, 119 | "source": [ 120 | "## n-Step Return\n", 121 | "
\"RewardHypothesis\"
" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": { 127 | "slideshow": { 128 | "slide_type": "subslide" 129 | } 130 | }, 131 | "source": [ 132 | "## n-Step Prediction\n", 133 | "\n", 134 | "
\"RewardHypothesis\"
" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": { 140 | "slideshow": { 141 | "slide_type": "subslide" 142 | } 143 | }, 144 | "source": [ 145 | "## n-Step SARSA (On-Policy Control)\n", 146 | "\n", 147 | "* simply switch states for actions (state–action pairs) and then use an $\\epsilon$-greedy policy.\n", 148 | "* The n-step returns in terms of estimated action values:\n", 149 | "
\"RewardHypothesis\"
\n", 150 | "* Update made to a particular value of action-pair is as follows:\n", 151 | "
\"RewardHypothesis\"
\n" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": { 157 | "slideshow": { 158 | "slide_type": "subslide" 159 | } 160 | }, 161 | "source": [ 162 | "## n-Step SARSA (On-policy Control)\n", 163 | "\n", 164 | "
\"RewardHypothesis\"
" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": { 170 | "slideshow": { 171 | "slide_type": "subslide" 172 | } 173 | }, 174 | "source": [ 175 | "# n-Step SARSA Example\n", 176 | "\n", 177 | "
\"RewardHypothesis\"
" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": { 183 | "slideshow": { 184 | "slide_type": "skip" 185 | } 186 | }, 187 | "source": [ 188 | "## n-Step SARSA (On-policy Control)\n", 189 | "\n", 190 | "
\"RewardHypothesis\"
" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": { 196 | "slideshow": { 197 | "slide_type": "subslide" 198 | } 199 | }, 200 | "source": [ 201 | "## n-Step Off-Policy Control (with Importance Sampling)\n", 202 | "\n", 203 | "* learning the value function for one policy, π, while following another policy, μ\n", 204 | "* Often, π is the greedy policy for the current action-value-function estimate, and μ is a more exploratory policy, perhaps ε-greedy\n", 205 | "* we must take into account the difference between the two policies, using their relative probability of taking the actions that were taken\n", 206 | "* To measure this difference, we use the importance sampling ratio. \n", 207 | "* Only difference, that instead of measuring it for the entire episode, we measure it for n-steps. " 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": { 213 | "slideshow": { 214 | "slide_type": "subslide" 215 | } 216 | }, 217 | "source": [ 218 | "* The importance sampling ratio looks liks:\n", 219 | "
\"RewardHypothesis\"
\n", 220 | "* The update Equatin looks like this:\n", 221 | "
\"RewardHypothesis\"
\n", 222 | "\n", 223 | "
\"RewardHypothesis\"
\n" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": { 229 | "slideshow": { 230 | "slide_type": "skip" 231 | } 232 | }, 233 | "source": [ 234 | "## n-Step Off-Policy Control (with Importance Sampling)\n", 235 | "
\"RewardHypothesis\"
" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": { 241 | "slideshow": { 242 | "slide_type": "subslide" 243 | } 244 | }, 245 | "source": [ 246 | "## Off-Policy Control (w/o Importance Sampling => Tree BackUp Algorithm)\n", 247 | "* This backup is an alternating mix of sample transitions—from each action to the su bsequent state—and full backups—from each state we consider all the possible actions, their probability of occuring under π, and their action values.\n", 248 | "
\"RewardHypothesis\"
\n" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": { 254 | "slideshow": { 255 | "slide_type": "subslide" 256 | } 257 | }, 258 | "source": [ 259 | "* Returns and updates are calculated as follows:\n", 260 | "
\"RewardHypothesis\"
\n", 261 | "
\"RewardHypothesis\"
" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": { 267 | "slideshow": { 268 | "slide_type": "subslide" 269 | } 270 | }, 271 | "source": [ 272 | "## Unified view of Reinforcement Learning\n", 273 | "
\"RewardHypothesis\"
" 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": { 279 | "slideshow": { 280 | "slide_type": "slide" 281 | } 282 | }, 283 | "source": [ 284 | "##
TD($\\lambda$)
" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": { 290 | "slideshow": { 291 | "slide_type": "subslide" 292 | } 293 | }, 294 | "source": [ 295 | "## Averaging n-Step Returns\n", 296 | "
\"RewardHypothesis\"
" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": { 302 | "slideshow": { 303 | "slide_type": "subslide" 304 | } 305 | }, 306 | "source": [ 307 | "## $\\lambda$ Returns\n", 308 | "
\"RewardHypothesis\"
" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": { 314 | "slideshow": { 315 | "slide_type": "subslide" 316 | } 317 | }, 318 | "source": [ 319 | "## TD(λ) Weighting Function\n", 320 | "
\"RewardHypothesis\"
" 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "metadata": { 326 | "slideshow": { 327 | "slide_type": "subslide" 328 | } 329 | }, 330 | "source": [ 331 | "## Forward-view TD(λ)\n", 332 | "
\"RewardHypothesis\"
" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": { 338 | "slideshow": { 339 | "slide_type": "subslide" 340 | } 341 | }, 342 | "source": [ 343 | "## Bckward-view TD(λ)\n", 344 | "
\"RewardHypothesis\"
" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": { 350 | "collapsed": true, 351 | "slideshow": { 352 | "slide_type": "slide" 353 | } 354 | }, 355 | "source": [ 356 | "##
Eligibility Traces
" 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": { 362 | "slideshow": { 363 | "slide_type": "subslide" 364 | } 365 | }, 366 | "source": [ 367 | "## Eligibility Traces" 368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "metadata": { 373 | "slideshow": { 374 | "slide_type": "fragment" 375 | } 376 | }, 377 | "source": [ 378 | "\n", 379 | "
\"RewardHypothesis\"
" 380 | ] 381 | }, 382 | { 383 | "cell_type": "markdown", 384 | "metadata": { 385 | "slideshow": { 386 | "slide_type": "fragment" 387 | } 388 | }, 389 | "source": [ 390 | "\n", 391 | "
\"RewardHypothesis\"
" 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "metadata": { 397 | "slideshow": { 398 | "slide_type": "subslide" 399 | } 400 | }, 401 | "source": [ 402 | "## Backward View TD(λ)\n", 403 | "
\"RewardHypothesis\"
" 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "metadata": { 409 | "slideshow": { 410 | "slide_type": "subslide" 411 | } 412 | }, 413 | "source": [ 414 | "## TD(λ) and TD(0)\n", 415 | "
\"RewardHypothesis\"
" 416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "metadata": { 421 | "slideshow": { 422 | "slide_type": "subslide" 423 | } 424 | }, 425 | "source": [ 426 | "## TD(λ) and MC\n", 427 | "
\"RewardHypothesis\"
" 428 | ] 429 | }, 430 | { 431 | "cell_type": "markdown", 432 | "metadata": { 433 | "slideshow": { 434 | "slide_type": "subslide" 435 | } 436 | }, 437 | "source": [ 438 | "## MC and TD(1)\n", 439 | "
\"RewardHypothesis\"
\n", 440 | "\n" 441 | ] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": { 446 | "slideshow": { 447 | "slide_type": "subslide" 448 | } 449 | }, 450 | "source": [ 451 | "## Telescoping in TD(1)\n", 452 | "
\"RewardHypothesis\"
" 453 | ] 454 | }, 455 | { 456 | "cell_type": "markdown", 457 | "metadata": { 458 | "slideshow": { 459 | "slide_type": "subslide" 460 | } 461 | }, 462 | "source": [ 463 | "## TD(λ) and TD(1)\n", 464 | "
\"RewardHypothesis\"
" 465 | ] 466 | }, 467 | { 468 | "cell_type": "markdown", 469 | "metadata": { 470 | "slideshow": { 471 | "slide_type": "subslide" 472 | } 473 | }, 474 | "source": [ 475 | "## Telescoping in TD(λ)\n", 476 | "
\"RewardHypothesis\"
" 477 | ] 478 | }, 479 | { 480 | "cell_type": "markdown", 481 | "metadata": { 482 | "slideshow": { 483 | "slide_type": "subslide" 484 | } 485 | }, 486 | "source": [ 487 | "## Forwards and Backwards TD(λ)\n", 488 | "
\"RewardHypothesis\"
" 489 | ] 490 | }, 491 | { 492 | "cell_type": "markdown", 493 | "metadata": { 494 | "slideshow": { 495 | "slide_type": "subslide" 496 | } 497 | }, 498 | "source": [ 499 | "## Offline Equivalence of Forward and Backward TD\n", 500 | "
\"RewardHypothesis\"
\n" 501 | ] 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "metadata": { 506 | "slideshow": { 507 | "slide_type": "subslide" 508 | } 509 | }, 510 | "source": [ 511 | "## Online Equivalence of Forward and Backward TD\n", 512 | "
\"RewardHypothesis\"
" 513 | ] 514 | }, 515 | { 516 | "cell_type": "markdown", 517 | "metadata": { 518 | "slideshow": { 519 | "slide_type": "subslide" 520 | } 521 | }, 522 | "source": [ 523 | "## Summary of Forward and Backward TD(λ)\n", 524 | "
\"RewardHypothesis\"
" 525 | ] 526 | } 527 | ], 528 | "metadata": { 529 | "anaconda-cloud": {}, 530 | "celltoolbar": "Slideshow", 531 | "kernelspec": { 532 | "display_name": "Python [conda root]", 533 | "language": "python", 534 | "name": "conda-root-py" 535 | }, 536 | "language_info": { 537 | "codemirror_mode": { 538 | "name": "ipython", 539 | "version": 3 540 | }, 541 | "file_extension": ".py", 542 | "mimetype": "text/x-python", 543 | "name": "python", 544 | "nbconvert_exporter": "python", 545 | "pygments_lexer": "ipython3", 546 | "version": "3.5.2" 547 | }, 548 | "widgets": { 549 | "state": {}, 550 | "version": "1.1.2" 551 | } 552 | }, 553 | "nbformat": 4, 554 | "nbformat_minor": 2 555 | } 556 | -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/RL_7.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "#
Unifying MC Methods and TD Methods
\n", 12 | "##
Bootstrapping, TD($\\lambda$) and Eligibility Traces
\n", 13 | "###
Reference: Chapter 7 and Chapter 12, Sutton and Barto
" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "slideshow": { 20 | "slide_type": "slide" 21 | } 22 | }, 23 | "source": [ 24 | "## Contents\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "slideshow": { 31 | "slide_type": "fragment" 32 | } 33 | }, 34 | "source": [ 35 | "* Bootstrapping" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": { 41 | "slideshow": { 42 | "slide_type": "fragment" 43 | } 44 | }, 45 | "source": [ 46 | "* TD ($\\lambda$)" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": { 52 | "slideshow": { 53 | "slide_type": "fragment" 54 | } 55 | }, 56 | "source": [ 57 | "* Eligibility Traces" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": { 63 | "slideshow": { 64 | "slide_type": "slide" 65 | } 66 | }, 67 | "source": [ 68 | "##
Bootstrapping
" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "slideshow": { 75 | "slide_type": "subslide" 76 | } 77 | }, 78 | "source": [ 79 | "## Bootstrapping and Sampling\n", 80 | "
\"RewardHypothesis\"
" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": { 86 | "slideshow": { 87 | "slide_type": "subslide" 88 | } 89 | }, 90 | "source": [ 91 | "## Why do Bootstrapping?\n", 92 | "\n", 93 | "* Free from tyranny of the time step\n", 94 | "* Sometimes updates are required at every step of transition (one-step TD)\n", 95 | " * Take into account every possible transition/ anything that has changed\n", 96 | "* Sometimes, it makes sense to only update every few stansitions (multi-step TD)\n", 97 | " * Take into account significant/considerable changes" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": { 103 | "slideshow": { 104 | "slide_type": "subslide" 105 | } 106 | }, 107 | "source": [ 108 | "## n-Step Prediction\n", 109 | "
\"RewardHypothesis\"
" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": { 115 | "slideshow": { 116 | "slide_type": "subslide" 117 | } 118 | }, 119 | "source": [ 120 | "## n-Step Return\n", 121 | "
\"RewardHypothesis\"
" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": { 127 | "slideshow": { 128 | "slide_type": "subslide" 129 | } 130 | }, 131 | "source": [ 132 | "## n-Step Prediction\n", 133 | "\n", 134 | "
\"RewardHypothesis\"
" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": { 140 | "slideshow": { 141 | "slide_type": "subslide" 142 | } 143 | }, 144 | "source": [ 145 | "## n-Step SARSA (On-Policy Control)\n", 146 | "\n", 147 | "* simply switch states for actions (state–action pairs) and then use an $\\epsilon$-greedy policy.\n", 148 | "* The n-step returns in terms of estimated action values:\n", 149 | "
\"RewardHypothesis\"
\n", 150 | "* Update made to a particular value of action-pair is as follows:\n", 151 | "
\"RewardHypothesis\"
\n" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": { 157 | "slideshow": { 158 | "slide_type": "subslide" 159 | } 160 | }, 161 | "source": [ 162 | "## n-Step SARSA (On-policy Control)\n", 163 | "\n", 164 | "
\"RewardHypothesis\"
" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": { 170 | "slideshow": { 171 | "slide_type": "subslide" 172 | } 173 | }, 174 | "source": [ 175 | "# n-Step SARSA Example\n", 176 | "\n", 177 | "
\"RewardHypothesis\"
" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": { 183 | "slideshow": { 184 | "slide_type": "skip" 185 | } 186 | }, 187 | "source": [ 188 | "## n-Step SARSA (On-policy Control)\n", 189 | "\n", 190 | "
\"RewardHypothesis\"
" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": { 196 | "slideshow": { 197 | "slide_type": "subslide" 198 | } 199 | }, 200 | "source": [ 201 | "## n-Step Off-Policy Control (with Importance Sampling)\n", 202 | "\n", 203 | "* learning the value function for one policy, π, while following another policy, μ\n", 204 | "* Often, π is the greedy policy for the current action-value-function estimate, and μ is a more exploratory policy, perhaps ε-greedy\n", 205 | "* we must take into account the difference between the two policies, using their relative probability of taking the actions that were taken\n", 206 | "* To measure this difference, we use the importance sampling ratio. \n", 207 | "* Only difference, that instead of measuring it for the entire episode, we measure it for n-steps. " 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": { 213 | "slideshow": { 214 | "slide_type": "subslide" 215 | } 216 | }, 217 | "source": [ 218 | "* The importance sampling ratio looks liks:\n", 219 | "
\"RewardHypothesis\"
\n", 220 | "* The update Equatin looks like this:\n", 221 | "
\"RewardHypothesis\"
\n", 222 | "\n", 223 | "
\"RewardHypothesis\"
\n" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": { 229 | "slideshow": { 230 | "slide_type": "skip" 231 | } 232 | }, 233 | "source": [ 234 | "## n-Step Off-Policy Control (with Importance Sampling)\n", 235 | "
\"RewardHypothesis\"
" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": { 241 | "slideshow": { 242 | "slide_type": "subslide" 243 | } 244 | }, 245 | "source": [ 246 | "## Off-Policy Control (w/o Importance Sampling => Tree BackUp Algorithm)\n", 247 | "* This backup is an alternating mix of sample transitions—from each action to the su bsequent state—and full backups—from each state we consider all the possible actions, their probability of occuring under π, and their action values.\n", 248 | "
\"RewardHypothesis\"
\n" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": { 254 | "slideshow": { 255 | "slide_type": "subslide" 256 | } 257 | }, 258 | "source": [ 259 | "* Returns and updates are calculated as follows:\n", 260 | "
\"RewardHypothesis\"
\n", 261 | "
\"RewardHypothesis\"
" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": { 267 | "slideshow": { 268 | "slide_type": "subslide" 269 | } 270 | }, 271 | "source": [ 272 | "## Unified view of Reinforcement Learning\n", 273 | "
\"RewardHypothesis\"
" 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": { 279 | "slideshow": { 280 | "slide_type": "slide" 281 | } 282 | }, 283 | "source": [ 284 | "##
TD($\\lambda$)
" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": { 290 | "slideshow": { 291 | "slide_type": "subslide" 292 | } 293 | }, 294 | "source": [ 295 | "## Averaging n-Step Returns\n", 296 | "
\"RewardHypothesis\"
" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": { 302 | "slideshow": { 303 | "slide_type": "subslide" 304 | } 305 | }, 306 | "source": [ 307 | "## $\\lambda$ Returns\n", 308 | "
\"RewardHypothesis\"
" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": { 314 | "slideshow": { 315 | "slide_type": "subslide" 316 | } 317 | }, 318 | "source": [ 319 | "## TD(λ) Weighting Function\n", 320 | "
\"RewardHypothesis\"
" 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "metadata": { 326 | "slideshow": { 327 | "slide_type": "subslide" 328 | } 329 | }, 330 | "source": [ 331 | "## Forward-view TD(λ)\n", 332 | "
\"RewardHypothesis\"
" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": { 338 | "slideshow": { 339 | "slide_type": "subslide" 340 | } 341 | }, 342 | "source": [ 343 | "## Bckward-view TD(λ)\n", 344 | "
\"RewardHypothesis\"
" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": { 350 | "collapsed": true, 351 | "slideshow": { 352 | "slide_type": "slide" 353 | } 354 | }, 355 | "source": [ 356 | "##
Eligibility Traces
" 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": { 362 | "slideshow": { 363 | "slide_type": "subslide" 364 | } 365 | }, 366 | "source": [ 367 | "## Eligibility Traces" 368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "metadata": { 373 | "slideshow": { 374 | "slide_type": "fragment" 375 | } 376 | }, 377 | "source": [ 378 | "\n", 379 | "
\"RewardHypothesis\"
" 380 | ] 381 | }, 382 | { 383 | "cell_type": "markdown", 384 | "metadata": { 385 | "slideshow": { 386 | "slide_type": "fragment" 387 | } 388 | }, 389 | "source": [ 390 | "\n", 391 | "
\"RewardHypothesis\"
" 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "metadata": { 397 | "slideshow": { 398 | "slide_type": "subslide" 399 | } 400 | }, 401 | "source": [ 402 | "## Backward View TD(λ)\n", 403 | "
\"RewardHypothesis\"
" 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "metadata": { 409 | "slideshow": { 410 | "slide_type": "subslide" 411 | } 412 | }, 413 | "source": [ 414 | "## TD(λ) and TD(0)\n", 415 | "
\"RewardHypothesis\"
" 416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "metadata": { 421 | "slideshow": { 422 | "slide_type": "subslide" 423 | } 424 | }, 425 | "source": [ 426 | "## TD(λ) and MC\n", 427 | "
\"RewardHypothesis\"
" 428 | ] 429 | }, 430 | { 431 | "cell_type": "markdown", 432 | "metadata": { 433 | "slideshow": { 434 | "slide_type": "subslide" 435 | } 436 | }, 437 | "source": [ 438 | "## MC and TD(1)\n", 439 | "
\"RewardHypothesis\"
\n", 440 | "\n" 441 | ] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": { 446 | "slideshow": { 447 | "slide_type": "subslide" 448 | } 449 | }, 450 | "source": [ 451 | "## Telescoping in TD(1)\n", 452 | "
\"RewardHypothesis\"
" 453 | ] 454 | }, 455 | { 456 | "cell_type": "markdown", 457 | "metadata": { 458 | "slideshow": { 459 | "slide_type": "subslide" 460 | } 461 | }, 462 | "source": [ 463 | "## TD(λ) and TD(1)\n", 464 | "
\"RewardHypothesis\"
" 465 | ] 466 | }, 467 | { 468 | "cell_type": "markdown", 469 | "metadata": { 470 | "slideshow": { 471 | "slide_type": "subslide" 472 | } 473 | }, 474 | "source": [ 475 | "## Telescoping in TD(λ)\n", 476 | "
\"RewardHypothesis\"
" 477 | ] 478 | }, 479 | { 480 | "cell_type": "markdown", 481 | "metadata": { 482 | "slideshow": { 483 | "slide_type": "subslide" 484 | } 485 | }, 486 | "source": [ 487 | "## Forwards and Backwards TD(λ)\n", 488 | "
\"RewardHypothesis\"
" 489 | ] 490 | }, 491 | { 492 | "cell_type": "markdown", 493 | "metadata": { 494 | "slideshow": { 495 | "slide_type": "subslide" 496 | } 497 | }, 498 | "source": [ 499 | "## Offline Equivalence of Forward and Backward TD\n", 500 | "
\"RewardHypothesis\"
\n" 501 | ] 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "metadata": { 506 | "slideshow": { 507 | "slide_type": "subslide" 508 | } 509 | }, 510 | "source": [ 511 | "## Online Equivalence of Forward and Backward TD\n", 512 | "
\"RewardHypothesis\"
" 513 | ] 514 | }, 515 | { 516 | "cell_type": "markdown", 517 | "metadata": { 518 | "slideshow": { 519 | "slide_type": "subslide" 520 | } 521 | }, 522 | "source": [ 523 | "## Summary of Forward and Backward TD(λ)\n", 524 | "
\"RewardHypothesis\"
" 525 | ] 526 | } 527 | ], 528 | "metadata": { 529 | "anaconda-cloud": {}, 530 | "celltoolbar": "Slideshow", 531 | "kernelspec": { 532 | "display_name": "Python [conda root]", 533 | "language": "python", 534 | "name": "conda-root-py" 535 | }, 536 | "language_info": { 537 | "codemirror_mode": { 538 | "name": "ipython", 539 | "version": 3 540 | }, 541 | "file_extension": ".py", 542 | "mimetype": "text/x-python", 543 | "name": "python", 544 | "nbconvert_exporter": "python", 545 | "pygments_lexer": "ipython3", 546 | "version": "3.5.2" 547 | }, 548 | "widgets": { 549 | "state": {}, 550 | "version": "1.1.2" 551 | } 552 | }, 553 | "nbformat": 4, 554 | "nbformat_minor": 2 555 | } 556 | -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/1.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/10.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/10_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/10_1.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/10_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/10_2.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/11.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/12.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/13.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/14.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/15.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/15.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/16.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/17.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/18.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/18.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/19.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/19.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/2.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/20.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/21.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/21.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/3.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/4.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/5.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/5.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/6.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/7.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/8.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/9.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/bp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/bp.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/eg1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/eg1.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/eq1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/eq1.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/eq2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/eq2.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/importancesampling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/importancesampling.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/nStepOnline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/nStepOnline.png 
-------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/prediction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/prediction.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/return.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/return.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/return1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/return1.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/sc1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/sc1.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/sc2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/sc2.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/update.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/update.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/img/update1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_7_rl_eligibility_traces/img/update1.png -------------------------------------------------------------------------------- /ch_7_rl_eligibility_traces/readme.md: -------------------------------------------------------------------------------- 1 | #
Unifying MC and TD Methods
2 | 3 | ####
Reference: Chapter 7 and Chapter 12, Sutton and Barto
4 | 5 | ## Contents: 6 | 7 | 1) **Bootstrapping** 8 | * Why do Bootstrapping? 9 | * n-Step SARSA (On-Policy Control) 10 | * n-Step Off-Policy Control (with Importance Sampling) 11 | * N-Step Off-Policy Control (w/o Importance Sampling => Tree BackUp Algorithm) 12 | 13 | 2) **TD($\lambda$)** 14 | * Forward View 15 | * Backward View 16 | 17 | 3) **Eligibility Traces** 18 | -------------------------------------------------------------------------------- /ch_8_model_based/img/dyna.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/dyna.jpeg -------------------------------------------------------------------------------- /ch_8_model_based/img/dyna_eq.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/dyna_eq.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/dyna_perf.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/dyna_perf.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/dyna_perf2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/dyna_perf2.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/dynaenvchange.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/dynaenvchange.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/dynaenvchange2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/dynaenvchange2.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/mcts0.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/mcts0.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/mcts1.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/mcts1.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/mcts2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/mcts2.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/mcts3.JPG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/mcts3.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/mcts4.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/mcts4.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/mcts5.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/mcts5.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/mctssearch1.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/mctssearch1.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/mctssearch2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/mctssearch2.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/mctssteps.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/mctssteps.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/modelbasedplanning.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/modelbasedplanning.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/psweep.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/psweep.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/psweep_ex.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/psweep_ex.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/simmontesearch.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/simmontesearch.JPG -------------------------------------------------------------------------------- /ch_8_model_based/img/sslearning.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_8_model_based/img/sslearning.JPG -------------------------------------------------------------------------------- 
/ch_8_model_based/readme.md: -------------------------------------------------------------------------------- 1 | #
Planning and Learning with Tabular Methods
2 | 3 | ####
Reference: Chapter 8, Sutton and Barto
4 | 5 | ## Contents: 6 | 7 | 1) **Introduction** 8 | 9 | 2) **Model Based RL** 10 | 11 | 3) **Dyna: Integrating Planning, Acting and Learning** 12 | 13 | 4) **Prioritizing Sweeps** 14 | 15 | 5) **Planning as a part of Action Selection (Monte Carlo Tree Search)** 16 | -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_avg1.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_avg1.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_avg2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_avg2.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_avg3.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_avg3.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_avg4.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_avg4.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_avg5.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_avg5.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_avg6.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_avg6.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_avg7.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_avg7.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_avg8.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_avg8.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_avg9.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_avg9.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_prob1.JPG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_prob1.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_prob2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_prob2.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_prob3.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_prob3.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_prob4.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_prob4.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_prob5.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_prob5.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_prob6.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_prob6.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_prob7.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_prob7.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_prob8.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_prob8.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_slide1.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_slide1.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_slides10.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_slides10.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_slides11.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_slides11.JPG 
-------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_slides12.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_slides12.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_slides13.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_slides13.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_slides14.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_slides14.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_slides2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_slides2.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_slides3.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_slides3.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_slides4.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_slides4.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_slides5.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_slides5.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_slides6.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_slides6.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_slides7.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_slides7.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_slides8.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_slides8.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/fa_slides9.JPG: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/fa_slides9.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/img/func_approx.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/ch_9_func_approx_1/img/func_approx.JPG -------------------------------------------------------------------------------- /ch_9_func_approx_1/readme.md: -------------------------------------------------------------------------------- 1 | #
Value Function Approximation (Part I)
2 | 3 | ####
Reference: Chapter 9 to Chapter 11, Sutton and Barto
 4 | 5 | ## Contents: 6 | 7 | 1) **Introduction** 8 | * Why move to non-tabular methods? 9 | * Value Approximation 10 | 11 | 2) **Incremental Methods** 12 | * Gradient Descent 13 | * The Predictive Objective (MSVE) 14 | * SGD for MSVE 15 | * Feature Vector 16 | * Linear Function Approximator 17 | 18 | 3) **Incremental Prediction Methods** 19 | * Target as MC 20 | * Target as TD(0) 21 | * Target as TD($\lambda$) 22 | 23 | 4) **Iterative Control Approximation** 24 | * Target as MC 25 | * Target as TD(0) 26 | * Target as TD($\lambda$) 27 | 28 | 5) **Average Reward Setting** 29 | -------------------------------------------------------------------------------- /img/break_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/img/break_1.png -------------------------------------------------------------------------------- /img/motivation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/img/motivation.png -------------------------------------------------------------------------------- /img/statement_hinton_bengio_lecun.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BardOfCodes/DRL_in_CV/dc437b14f89b6b6484d563373349db13cb997f13/img/statement_hinton_bengio_lecun.png -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Deep Reinforcement Learning in Computer Vision 2 | 3 | [DRL in CV Website](https://bardofcodes.github.io/DRL_in_CV_Papers/) 4 | 5 | **[Google Slides on this introduction](https://docs.google.com/presentation/d/1Nnt6Jj77SLECbeb3m_Y6lqDEH0t8CUePJt2U-HlT6ZM/edit?usp=sharing)** 6 | 7 | In recent years, while the use of Computer Vision techniques/models has burgeoned 8 | for solving Reinforcement Learning tasks (such as games), the opposite flow, of 9 | using techniques/models from Reinforcement Learning to solve paradigms in 10 | Computer Vision, has also been seen. 11 | 12 | Additionally, from a few stalwarts of Computer Vision: 13 | 14 | 
Bold Statement
 15 | 16 | This indicates that just as researchers in Reinforcement Learning benefited from 17 | understanding and applying Computer Vision techniques, researchers in 18 | Computer Vision can benefit from not treating Reinforcement Learning as an esoteric 19 | black box and gaining a comprehensive understanding of this subject. 20 | 21 | Hence, we are presenting a short series of lectures (at our lab) with the following motivation: 22 | 23 | 
motivations
 24 | 25 | # DRL in CV Papers 26 | An additional repository, [DRL_in_CV_Papers](https://github.com/BardOfCodes/DRL_in_CV_Papers), has been made, which consists of a list of published works in computer vision that use Deep Reinforcement Learning. A few of the papers have an added blog post on them as well, highlighting important parts of the paper. 27 | 28 | # Posts 29 | Additionally, for some topics which are important but might not have made a good slide presentation, we have made blog-like posts. This section will see further additions. 30 | It is open for additional posts from all. Kindly look in the `_post` folder for more information. 31 | 32 | # Acknowledgement 33 | 34 | We rely heavily on the following for the content. This work is mostly curation 35 | of the excellent material already provided by these brilliant creators: 36 | 37 | * Reinforcement Learning: An Introduction - Book by Andrew Barto and Richard S. Sutton. 38 | [Link to latest draft](ufal.mff.cuni.cz/~straka/courses/npfl114/2016/sutton-bookdraft2016sep.pdf). 39 | * UCL course on RL - Course by David Silver. [Link to material](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html). 40 | * Code material from various amazing sources: [DannyBritz](https://github.com/dennybritz/reinforcement-learning), 41 | [ShangtongZhang](https://github.com/ShangtongZhang/reinforcement-learning-an-introduction), 42 | [AndrejKarpathy](https://github.com/karpathy/reinforcejs). 43 | 44 | This work has been compiled by Aditya Ganeshan and Trisha Mittal while working at [Video Analytics Lab (VAL), IISc](http://val.serc.iisc.ernet.in/valweb/). We thank the lab for giving us this opportunity. 45 | 46 | ##### Tutorials are still to be added for most chapters. 47 | --------------------------------------------------------------------------------