├── .gitignore ├── Amazon GPU howto.md ├── Dockerfile ├── LICENSE.md ├── README.md ├── docker ├── Dockerfile ├── README.md └── run_jupyter.sh ├── week1_intro ├── README.md ├── crossentropy_method.ipynb ├── pong.py ├── primer_python_for_ml │ ├── recap_ml.ipynb │ └── train.csv ├── project_starter_evolution_strategies.ipynb └── seminar_gym_interface.ipynb ├── week2_value_based ├── README.md ├── mdp.py ├── seminar1_VI.ipynb └── seminar2_MCTS.ipynb ├── week3_model_free ├── README.md ├── homework │ ├── expected_value_sarsa.py │ ├── expected_value_sarsa_epsilon_annealing.py │ ├── homework.ipynb │ ├── q_learning_epsilon_annealing.py │ ├── qlearning.py │ └── sarsa.py ├── seminar_alternative │ ├── qlearning.py │ └── seminar.ipynb └── seminar_main │ ├── analysis.py │ ├── crawler.py │ ├── environment.py │ ├── featureExtractors.py │ ├── game.py │ ├── ghostAgents.py │ ├── graphicsCrawlerDisplay.py │ ├── graphicsDisplay.py │ ├── graphicsGridworldDisplay.py │ ├── graphicsUtils.py │ ├── gridworld.py │ ├── how2run │ ├── keyboardAgents.py │ ├── layout.py │ ├── layouts │ ├── capsuleClassic.lay │ ├── contestClassic.lay │ ├── mediumClassic.lay │ ├── mediumGrid.lay │ ├── minimaxClassic.lay │ ├── openClassic.lay │ ├── originalClassic.lay │ ├── smallClassic.lay │ ├── smallGrid.lay │ ├── testClassic.lay │ ├── trappedClassic.lay │ └── trickyClassic.lay │ ├── learningAgents.py │ ├── mdp.py │ ├── pacman.py │ ├── pacmanAgents.py │ ├── qlearningAgents.py │ ├── run_crawler.sh │ ├── run_grid.sh │ ├── run_pacman.sh │ ├── textDisplay.py │ ├── textGridworldDisplay.py │ └── util.py ├── week4_[recap]_deep_learning ├── README.md ├── fix_my_nn.ipynb ├── mnist.py ├── notmnist.py ├── practice_lasagne.ipynb ├── practice_tensorflow.ipynb └── seminar_pytorch.ipynb ├── week4_approx_rl ├── README.md ├── framebuffer.py ├── homework_lasagne.ipynb ├── homework_pytorch.ipynb ├── homework_tf.ipynb ├── replay_buffer.py ├── seminar_lasagne.ipynb ├── seminar_pytorch.ipynb └── seminar_tf.ipynb ├── week5_explore ├── README.md ├── action_rewards.npy ├── all_states.npy ├── bayes.py ├── bnn.png ├── river_swim.png └── week5.ipynb ├── week6_policy_based ├── README.md ├── atari_util.py ├── homework_lasagne.ipynb ├── homework_tensorflow.ipynb ├── reinforce_lasagne.ipynb ├── reinforce_pytorch.ipynb └── reinforce_tensorflow.ipynb ├── week7_[recap]_rnn ├── README.md ├── mtg_card_names.txt ├── names ├── rnn.png ├── seminar_lasagne.ipynb ├── seminar_lasagne_ingraph.ipynb ├── seminar_pytorch.ipynb └── seminar_tf.ipynb ├── week7_pomdp ├── README.md ├── atari_util.py ├── env_pool.py ├── homework_common_part2.ipynb ├── img1.jpg ├── img2.jpg ├── img3.jpg ├── practice_pytorch.ipynb ├── practice_tensorflow.ipynb ├── practice_theano.ipynb └── theano_optional_recurrence_tutorial.ipynb ├── week8_scst ├── README.md ├── basic_model_tf.py ├── basic_model_theano.py ├── basic_model_torch.py ├── bonus.ipynb ├── he-pron-wiktionary.txt ├── main_dataset.txt ├── practice_tf.ipynb ├── practice_theano.ipynb ├── practice_torch.ipynb ├── scheme.svg └── voc.py ├── week9_policy_II ├── README.md ├── seminar_TRPO_pytorch.ipynb ├── seminar_TRPO_tensorflow.ipynb └── seminar_TRPO_theano.ipynb ├── xvfb ├── yet_another_week ├── README.md └── _resource │ ├── README.md │ ├── a3c_scheme.odp │ ├── conv_salary_architecture.odp │ ├── conv_salary_architecture.png │ ├── do_something_scst.png │ ├── dqn_arch.odp │ ├── dqn_arch.png │ ├── env_pool.png │ ├── exp_replay.odp │ ├── exp_replay.png │ ├── nerd.png │ ├── nnet_arch.odp │ ├── nnet_arch.png │ ├── pomdp_arch.odp │ ├── pomdp_arch.pdf │ ├── 
pomdp_arch.png │ ├── pomdp_img1.jpg │ ├── pomdp_img2.jpg │ ├── pomdp_img3.jpg │ ├── qlearning_scheme.odp │ ├── qlearning_scheme.pgm │ ├── qlearning_scheme.png │ ├── rollout.png │ ├── scheme.png │ ├── target_net.odp │ ├── target_net.png │ └── training.png └── youtube_dl_lectures.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # node and NPM 2 | npm-debug.log 3 | node_modules 4 | ..bfg-report 5 | 6 | # swap files 7 | *~ 8 | *.swp 9 | 10 | 11 | 12 | env.sh 13 | # Byte-compiled / optimized / DLL files 14 | __pycache__/ 15 | *.py[cod] 16 | 17 | # C extensions 18 | *.so 19 | 20 | # Distribution / packaging 21 | .Python 22 | env/ 23 | bin/ 24 | build/ 25 | develop-eggs/ 26 | dist/ 27 | eggs/ 28 | lib/ 29 | lib64/ 30 | parts/ 31 | sdist/ 32 | var/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg/ 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | 49 | # Translations 50 | *.mo 51 | 52 | # Mr Developer 53 | .mr.developer.cfg 54 | .project 55 | .pydevproject 56 | .idea 57 | .ipynb_checkpoints 58 | 59 | # Rope 60 | .ropeproject 61 | 62 | # Django stuff: 63 | *.log 64 | *.pot 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | docs/tmp* 69 | 70 | # OS X garbage 71 | .DS_Store 72 | 73 | # Debian things 74 | debian/reproducible-experiment-platform 75 | debian/files 76 | *.substvars 77 | *.debhelper.log 78 | -------------------------------------------------------------------------------- /Amazon GPU howto.md: -------------------------------------------------------------------------------- 1 | # How to set up GPU on EC2 instance 2 | 3 | ## Create EC2 instance 4 | 5 | Use `p2.xlarge` instance type and `ami-e00a8180` AMI image. 
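If you prefer the command line to the web console, the same instance can be launched with the AWS CLI. This is only a sketch: the key pair name and security group id below are placeholders for resources you have to create yourself first.

```bash
# launch a p2.xlarge from the AMI mentioned above
aws ec2 run-instances \
    --image-id ami-e00a8180 \
    --instance-type p2.xlarge \
    --key-name <your-key-pair> \
    --security-group-ids <your-security-group-id>
```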
[Details](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html) 6 | 7 | Open ports `22` (ssh) and `80` (http) on your freshly created instance, 8 | you create a [security group](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-network-security.html) 9 | and attach it your instance to get ports open 10 | 11 | ## Launch notebook 12 | 13 | Instance you have created contains all you need: fresh versions of theano, lasagne, CUDA driver and cuDNN, 14 | just lunch ipython and get hands dirty: 15 | 16 | ```bash 17 | $ sudo su 18 | $ export THEANO_FLAGS='cuda.root=/usr/local/cuda,device=gpu,floatX=float32' 19 | $ export PATH=/usr/local/cuda-8.0/bin${PATH:+:${PATH}} 20 | $ jupyter notebook 21 | ``` 22 | 23 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM andrewosh/binder-base 2 | MAINTAINER Alexander Panin 3 | USER root 4 | 5 | RUN echo "deb http://archive.ubuntu.com/ubuntu trusty-backports main restricted universe multiverse" >> /etc/apt/sources.list 6 | RUN apt-get -qq update 7 | 8 | RUN apt-get install -y gcc-4.9 g++-4.9 libstdc++6 wget unzip 9 | RUN apt-get install -y libopenblas-dev liblapack-dev libsdl2-dev libboost-all-dev graphviz 10 | RUN apt-get install -y cmake zlib1g-dev libjpeg-dev 11 | RUN apt-get install -y xvfb libav-tools xorg-dev python-opengl python3-opengl 12 | RUN apt-get -y install swig3.0 13 | RUN ln -s /usr/bin/swig3.0 /usr/bin/swig 14 | 15 | 16 | USER main 17 | RUN pip install --upgrade pip==9.0.3 18 | RUN pip install --upgrade --ignore-installed setuptools #fix https://github.com/tensorflow/tensorflow/issues/622 19 | RUN pip install --upgrade sklearn tqdm nltk editdistance joblib graphviz 20 | 21 | # install all gym stuff except mujoco - it fails at "import importlib.util" (no module named util) 22 | RUN pip install --upgrade gym 23 | RUN pip install --upgrade gym[atari] 24 | RUN pip install --upgrade gym[box2d] 25 | 26 | RUN pip install --upgrade http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp27-cp27mu-linux_x86_64.whl 27 | RUN pip install --upgrade torchvision 28 | RUN pip install --upgrade keras 29 | RUN pip install --upgrade https://github.com/Theano/Theano/archive/master.zip 30 | RUN pip install --upgrade https://github.com/Lasagne/Lasagne/archive/master.zip 31 | RUN pip install --upgrade https://github.com/yandexdataschool/AgentNet/archive/master.zip 32 | RUN pip install gym_pull 33 | RUN pip install ppaquette-gym-doom 34 | 35 | 36 | 37 | 38 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade pip==9.0.3 39 | 40 | # fix https://github.com/tensorflow/tensorflow/issues/622 41 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade --ignore-installed setuptools 42 | 43 | # python3: fix `GLIBCXX_3.4.20' not found - conda's libgcc blocked system's gcc-4.9 and libstdc++6 44 | RUN bash -c "conda update -y conda && source activate python3 && conda uninstall -y libgcc && source deactivate" 45 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade matplotlib numpy scipy pandas graphviz 46 | 47 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade sklearn tqdm nltk editdistance joblib 48 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade --ignore-installed setuptools #fix https://github.com/tensorflow/tensorflow/issues/622 49 | 50 | # install all gym stuff except mujoco - it fails at "mjmodel.h: no such file or directory" 51 | RUN 
/home/main/anaconda/envs/python3/bin/pip install --upgrade gym 52 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade gym[atari] 53 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade gym[box2d] 54 | 55 | 56 | 57 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp35-cp35m-linux_x86_64.whl 58 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade torchvision 59 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade keras 60 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade https://github.com/Theano/Theano/archive/master.zip 61 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade https://github.com/Lasagne/Lasagne/archive/master.zip 62 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade https://github.com/yandexdataschool/AgentNet/archive/master.zip 63 | 64 | #install TF after everything else not to break python3's pyglet with python2's tensorflow 65 | RUN pip install --upgrade tensorflow==1.4.0 66 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade tensorflow==1.4.0 67 | #TODO py3 doom once it's no longer broken 68 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Practical_RL 2 | A course on reinforcement learning in the wild. 3 | Taught on-campus at [HSE](https://cs.hse.ru) and [YSDA](https://yandexdataschool.com/) and maintained to be friendly to online students (both english and russian). 4 | 5 | 6 | #### Manifesto: 7 | * __Optimize for the curious.__ For all the materials that aren’t covered in detail there are links to more information and related materials (D.Silver/Sutton/blogs/whatever). Assignments will have bonus sections if you want to dig deeper. 8 | * __Practicality first.__ Everything essential to solving reinforcement learning problems is worth mentioning. We won't shun away from covering tricks and heuristics. 
For every major idea there should be a lab that makes you to “feel” it on a practical problem. 9 | * __Git-course.__ Know a way to make the course better? Noticed a typo in a formula? Found a useful link? Made the code more readable? Made a version for alternative framework? You're awesome! [Pull-request](https://help.github.com/articles/about-pull-requests/) it! 10 | 11 | # Course info 12 | * Lecture slides are [here](https://yadi.sk/d/loPpY45J3EAYfU). 13 | * Telegram chat room for YSDA & HSE students is [here](https://t.me/rlspring18) 14 | * Grading rules for YSDA & HSE students is [here](https://github.com/yandexdataschool/Practical_RL/wiki/Homeworks-and-grading) 15 | * Online student __[survival guide](https://github.com/yandexdataschool/Practical_RL/wiki/Online-student's-survival-guide)__ 16 | * Installing the libraries - [guide and issues thread](https://github.com/yandexdataschool/Practical_RL/issues/1) 17 | * Magical button that launches you into course environment: 18 | * [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/yandexdataschool/Practical_RL/master) - comes will all libraries pre-installed. May be down time to time. 19 | * If it's down, try [__google colab__](https://colab.research.google.com/) or [__azure notebooks__](http://notebooks.azure.com/). Those last longer, but they will require you to run installer commands (see ./Dockerfile). 20 | * Anonymous [feedback form](https://docs.google.com/forms/d/e/1FAIpQLSdurWw97Sm9xCyYwC8g3iB5EibITnoPJW2IkOVQYE_kcXPh6Q/viewform) for everything that didn't go through e-mail. 21 | * [About the course](https://github.com/yandexdataschool/Practical_RL/wiki/Practical-RL) 22 | 23 | # Additional materials 24 | * A large list of RL materials - [awesome rl](https://github.com/aikorea/awesome-rl) 25 | * [RL reading group](https://github.com/yandexdataschool/Practical_RL/wiki/RL-reading-group) 26 | 27 | 28 | # Syllabus 29 | 30 | The syllabus is approximate: the lectures may occur in a slightly different order and some topics may end up taking two weeks. 31 | 32 | * [__week1__](https://github.com/yandexdataschool/Practical_RL/tree/master/week1_intro) RL as blackbox optimization 33 | * Lecture: RL problems around us. Decision processes. Stochastic optimization, Crossentropy method. Parameter space search vs action space search. 34 | * Seminar: Welcome into openai gym. Tabular CEM for Taxi-v0, deep CEM for box2d environments. 35 | * Homework description - see week1/README.md. 36 | * ** YSDA Deadline: 2018.02.26 23.59** 37 | * ** HSE Deadline: 2018.01.28 23:59** 38 | 39 | * [__week2__](https://github.com/yandexdataschool/Practical_RL/tree/master/week2_value_based) Value-based methods 40 | * Lecture: Discounted reward MDP. Value-based approach. Value iteration. Policy iteration. Discounted reward fails. 41 | * Seminar: Value iteration. 42 | * Homework description - see week2/README.md. 43 | * ** HSE Deadline: 2018.02.11 23:59** 44 | * ** YSDA Deadline: part1 2018.03.05 23.59, part2 2018.03.12 23.59** 45 | 46 | 47 | * [__week3__](https://github.com/yandexdataschool/Practical_RL/tree/master/week3_model_free) Model-free reinforcement learning 48 | * Lecture: Q-learning. SARSA. Off-policy Vs on-policy algorithms. N-step algorithms. TD(Lambda). 49 | * Seminar: Qlearning Vs SARSA Vs Expected Value SARSA 50 | * Homework description - see week3/README.md. 
51 | * **HSE Deadline: 2018.02.15 23:59** 52 | * ** YSDA Deadline: 2018.03.12 23.59** 53 | 54 | * [__week4_recap__](https://github.com/yandexdataschool/Practical_RL/tree/master/week4_%5Brecap%5D_deep_learning) - deep learning recap 55 | * Lecture: Deep learning 101 56 | * Seminar: Simple image classification with convnets 57 | 58 | * [__week4__](https://github.com/yandexdataschool/Practical_RL/tree/master/week4_approx_rl) Approximate reinforcement learning 59 | * Lecture: Infinite/continuous state space. Value function approximation. Convergence conditions. Multiple agents trick; experience replay, target networks, double/dueling/bootstrap DQN, etc. 60 | * Seminar: Approximate Q-learning with experience replay. (CartPole, Atari) 61 | * **HSE Deadline: 2018.03.04 23:30** 62 | * ** YSDA Deadline: 2018.03.20 23.30** 63 | 64 | * [__week5__](https://github.com/yandexdataschool/Practical_RL/tree/master/week5_explore) Exploration in reinforcement learning 65 | * Lecture: Contextual bandits. Thompson Sampling, UCB, bayesian UCB. Exploration in model-based RL, MCTS. "Deep" heuristics for exploration. 66 | * Seminar: bayesian exploration for contextual bandits. UCB for MCTS. 67 | 68 | * ** YSDA Deadline: 2018.03.30 23.30** 69 | 70 | * [__week6__](https://github.com/yandexdataschool/Practical_RL/tree/master/week6_policy_based) Policy gradient methods I 71 | * Lecture: Motivation for policy-based, policy gradient, logderivative trick, REINFORCE/crossentropy method, variance reduction(baseline), advantage actor-critic (incl. GAE) 72 | * Seminar: REINFORCE, advantage actor-critic 73 | 74 | * [__week7_recap__](https://github.com/yandexdataschool/Practical_RL/tree/master/week7_%5Brecap%5D_rnn) Recurrent neural networks recap 75 | * Lecture: Problems with sequential data. Recurrent neural netowks. Backprop through time. Vanishing & exploding gradients. LSTM, GRU. Gradient clipping 76 | * Seminar: character-level RNN language model 77 | 78 | * [__week7__](https://github.com/yandexdataschool/Practical_RL/tree/master/week7_pomdp) Partially observable MDPs 79 | * Lecture: POMDP intro. POMDP learning (agents with memory). POMDP planning (POMCP, etc) 80 | * Seminar: Deep kung-fu & doom with recurrent A3C and DRQN 81 | 82 | * [__week8__](https://github.com/yandexdataschool/Practical_RL/tree/master/week8_scst) Applications II 83 | * Lecture: Reinforcement Learning as a general way to optimize non-differentiable loss. G2P, machine translation, conversation models, image captioning, discrete GANs. Self-critical sequence training. 84 | * Seminar: Simple neural machine translation with self-critical sequence training 85 | 86 | * [__week9__](https://github.com/yandexdataschool/Practical_RL/tree/master/week9_policy_II) Policy gradient methods II 87 | * Lecture: Trust region policy optimization. NPO/PPO. Deterministic policy gradient. DDPG. Bonus: DPG for discrete action spaces. 88 | * Seminar: Approximate TRPO for simple robotic tasks. 
89 | 90 | * [Some after-course bonus materials](https://github.com/yandexdataschool/Practical_RL/tree/master/yet_another_week) 91 | 92 | 93 | # Course staff 94 | Course materials and teaching by: _[unordered]_ 95 | - [Pavel Shvechikov](https://github.com/bestxolodec) - lectures, seminars, hw checkups, reading group 96 | - [Oleg Vasilev](https://github.com/Omrigan) - seminars, hw checkups, technical support 97 | - [Alexander Fritsler](https://github.com/Fritz449) - lectures, seminars, hw checkups 98 | - [Nikita Putintsev](https://github.com/qwasser) - seminars, hw checkups, organizing our hot mess 99 | - [Fedor Ratnikov](https://github.com/justheuristic/) - lectures, seminars, hw checkups 100 | - [Alexey Umnov](https://github.com/alexeyum) - seminars, hw checkups 101 | 102 | # Contributions 103 | * Using pictures from [Berkeley AI course](http://ai.berkeley.edu/home.html) 104 | * Massively refering to [CS294](http://rll.berkeley.edu/deeprlcourse/) 105 | * Sevaral tensorflow assignments by [Scitator](https://github.com/Scitator) 106 | * A lot of fixes from [arogozhnikov](https://github.com/arogozhnikov) 107 | * Other awesome people: see github contributors 108 | 109 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | LABEL maintainer "Alexander Panin , Dmitry Mittov " 3 | 4 | 5 | RUN echo "deb http://archive.ubuntu.com/ubuntu trusty-backports main restricted universe multiverse" >> /etc/apt/sources.list && \ 6 | apt-get -qq update && \ 7 | apt-get install -y cmake \ 8 | wget \ 9 | unzip \ 10 | git \ 11 | zlib1g-dev \ 12 | libjpeg-dev \ 13 | xvfb \ 14 | libav-tools \ 15 | xorg-dev \ 16 | python-opengl \ 17 | swig3.0 \ 18 | python-dev \ 19 | python3-dev \ 20 | python-pip \ 21 | python3-pip \ 22 | libopenblas-dev \ 23 | liblapack-dev \ 24 | libsdl2-dev \ 25 | libboost-all-dev \ 26 | graphviz \ 27 | gcc \ 28 | g++ && \ 29 | ln -s /usr/bin/swig3.0 /usr/bin/swig 30 | 31 | RUN pip install --upgrade pip==9.0.3 && \ 32 | pip install --upgrade numpy scipy && \ 33 | pip install --upgrade sklearn \ 34 | jupyter \ 35 | tqdm \ 36 | graphviz \ 37 | gym gym[box2d] gym[atari] \ 38 | matplotlib \ 39 | seaborn && \ 40 | pip install --upgrade https://github.com/Theano/Theano/archive/master.zip \ 41 | https://github.com/Lasagne/Lasagne/archive/master.zip \ 42 | https://github.com/yandexdataschool/AgentNet/archive/master.zip \ 43 | tensorflow \ 44 | http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp27-cp27mu-linux_x86_64.whl \ 45 | torchvision \ 46 | keras 47 | 48 | RUN pip install --upgrade gym_pull ppaquette-gym-doom 49 | 50 | 51 | RUN pip3 install --upgrade pip==9.0.3 && \ 52 | pip3 install --upgrade numpy scipy && \ 53 | pip3 install --upgrade sklearn \ 54 | jupyter \ 55 | tqdm \ 56 | graphviz \ 57 | gym gym[box2d] gym[atari] \ 58 | matplotlib \ 59 | seaborn && \ 60 | pip3 install --upgrade https://github.com/Theano/Theano/archive/master.zip \ 61 | https://github.com/Lasagne/Lasagne/archive/master.zip \ 62 | https://github.com/yandexdataschool/AgentNet/archive/master.zip \ 63 | http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp35-cp35m-linux_x86_64.whl \ 64 | torchvision \ 65 | tensorflow \ 66 | keras && \ 67 | python3 -m ipykernel.kernelspec 68 | 69 | 70 | EXPOSE 8888 71 | VOLUME /notebooks 72 | WORKDIR /notebooks 73 | 74 | COPY run_jupyter.sh / 75 | CMD ["/run_jupyter.sh"] 76 | 
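# Typical usage, as a sketch -- the image tag and the host notebooks path are
# placeholders; see docker/README.md below for the full instructions:
#   docker build -t practical_rl .
#   docker run --rm -it -v /path/to/your/notebooks:/notebooks -p 8888:8888 practical_rl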
-------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | To simplify installation process, you can deploy a container (~virtual machine) with all dependencies pre-installed. 2 | 3 | _tl;dr [dockerhub url](https://hub.docker.com/r/justheuristic/practical_rl/)_ 4 | 5 | ## Install Docker 6 | 7 | We recommend you to use either native docker (recommended for linux) or kitematic(recommended for windows). 8 | * Installing [kitematic](https://kitematic.com/), a simple interface to docker (all platforms) 9 | * Pure docker: Guide for [windows](https://docs.docker.com/docker-for-windows/), [linux](https://docs.docker.com/engine/installation/), or [macOS](https://docs.docker.com/docker-for-mac/). 10 | 11 | Below are the instructions for both approaches. 12 | 13 | ## Kitematic 14 | Find justheuristic/practical_rl in the search menu. Download and launch the container. 15 | 16 | Click on "web preview" screen in the top-right __or__ go to settings, ports and find at which port your jupyter is located, usually 32***. 17 | 18 | ## Native 19 | `docker run -it -v :/notebooks -p :8888 justheuristic/practical_rl sh ../run_jupyter.sh` 20 | 21 | `docker run -it -v /Users/mittov/Documents/shad/semester4/:/notebooks -p 8888:8888 justheuristic/practical_rl sh ../run_jupyter.sh` 22 | 23 | ## Manual 24 | Build container 25 | 26 | `$ docker build -t rl .` 27 | 28 | 29 | Run it 30 | 31 | `$ docker run --rm -it -v :/notebooks -p :8888 dl` 32 | 33 | examples: 34 | 35 | `$ docker run --rm -it -v /Users/mittov/Documents/shad/semester4/:/notebooks -p 8888:8888 dl` 36 | 37 | Copy the token from console and run 38 | http://localhost:8888/?token= 39 | -------------------------------------------------------------------------------- /docker/run_jupyter.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | jupyter notebook --no-browser --allow-root --ip 0.0.0.0 3 | 4 | -------------------------------------------------------------------------------- /week1_intro/README.md: -------------------------------------------------------------------------------- 1 | ## Materials: 2 | * [__Lecture slides__](https://yadi.sk/i/sbc0ZCKx3RRGbW) 3 | * __Russian:__ 4 | * Intro to RL - [video](https://yadi.sk/i/bMo0qa-x3DoqkS) 5 | * Blackbox optiization - [video](https://yadi.sk/i/5yf_4oGI3EDJhJ) 6 | * Seminar - [video](https://yadi.sk/i/dPsWYMK13EDJj7) _only covering crossentropy method_ 7 | 8 | * __English:__ 9 | * [__main__] Video-intro by David Silver (english) - [video](https://www.youtube.com/watch?v=2pWv7GOvuf0) 10 | * [__main__] Lecture by J Schulman with crossentropy method explained (english) - [url](https://www.youtube.com/watch?v=aUrX-rP_ss4&list=PLCTc_C7itk-GaAMxmlChrkPnGKtjz8hv1) 11 | * Optional lecture by David Silver (english) - [video](https://www.youtube.com/watch?v=lfHX2hHRMVQ) 12 | 13 | 14 | ## More materials: 15 | * __[recommended]__ - awesome openai post about evolution strategies - [blog post](https://blog.openai.com/evolution-strategies/), [article](https://arxiv.org/abs/1703.03864) 16 | * Deep learning course (if you want to learn in parallel) - https://github.com/yandexdataschool/HSE_deeplearning 17 | * Video on genetic algorithms (english) - [video](https://www.youtube.com/watch?v=ejxfTy4lI6I) 18 | * Another guide to genetic algorithm (english) - [video](https://www.youtube.com/watch?v=zwYV11a__HQ) 19 | * About Differential evolution 
(english) - [pdf](http://jvanderw.une.edu.au/DE_1.pdf) 20 | * Video on Ant Colony Algorithm (english) - [video](https://www.youtube.com/watch?v=D58nLNLkb0I) 21 | * Longer video on Ant Colony Algorithm (english) - [video](https://www.youtube.com/watch?v=xpyKmjJuqhk) 22 | 23 | 24 | ## Homework description 25 | * Open `gym_interface.ipynb` and follow instructions from there 26 | * If you haven't installed everything yet, try [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/yandexdataschool/Practical_RL/master) 27 | * After you're done there, proceed to `crossentropy_method.ipynb` 28 | * You can find homework and bonus assignment descriptions at the end of that notebook. 29 | * Note: so far it's enough to say `pip install gym` on top of any data-science-stuffed python, but we'd appreciate if you gradually switch to [full installation](https://github.com/openai/gym#installing-everything). 30 | 31 | 32 | -------------------------------------------------------------------------------- /week1_intro/pong.py: -------------------------------------------------------------------------------- 1 | """Auxilary files for those who wanted to solve breakout with CEM or policy gradient""" 2 | import numpy as np 3 | import gym 4 | from scipy.misc import imresize 5 | from gym.core import Wrapper 6 | from gym.spaces.box import Box 7 | 8 | def make_pong(): 9 | """creates breakout env with all preprocessing done for you""" 10 | return PreprocessAtari(gym.make("PongDeterministic-v0")) 11 | 12 | class PreprocessAtari(Wrapper): 13 | def __init__(self,env,height=42,width=42, 14 | crop=lambda img: img[34:34+160],n_frames=4): 15 | """A gym wrapper that reshapes, crops and scales image into the desired shapes""" 16 | super(PreprocessAtari, self).__init__(env) 17 | self.img_size = (height,width) 18 | self.crop=crop 19 | self.observation_space = Box(0.0, 1.0, [n_frames,height,width]) 20 | self.framebuffer = np.zeros([n_frames,height,width]) 21 | def reset(self): 22 | """resets breakout, returns initial frames""" 23 | self.framebuffer = np.zeros_like(self.framebuffer) 24 | self.update_buffer(self.env.reset()) 25 | return self.framebuffer 26 | def step(self,action): 27 | """plays breakout for 1 step, returns 4-frame buffer""" 28 | new_img,r,done,info = self.env.step(action) 29 | self.update_buffer(new_img) 30 | return self.framebuffer,r,done,info 31 | 32 | ###image processing### 33 | 34 | def update_buffer(self,img): 35 | img = self.preproc_image(img) 36 | self.framebuffer = np.vstack([img[None], self.framebuffer[:-1]]) 37 | 38 | def preproc_image(self, img): 39 | """what happens to the observation""" 40 | img = self.crop(img) 41 | img = imresize(img, self.img_size).mean(-1) 42 | img = img.astype('float32')/255. 43 | return img 44 | -------------------------------------------------------------------------------- /week1_intro/project_starter_evolution_strategies.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Project :: Evolution Strategies\n", 8 | "\n", 9 | "![img](https://t4.ftcdn.net/jpg/00/17/46/81/240_F_17468143_wY3hsHyfNYoMdG9BlC56HI4JA7pNu63h.jpg)\n", 10 | "\n", 11 | "Remember the idea behind Evolution Strategies? Here's a neat [blog post](https://blog.openai.com/evolution-strategies/) about 'em.\n", 12 | "\n", 13 | "Can you reproduce their success? 
You will have to implement evolutionary strategies and see how they work.\n", 14 | "\n", 15 | "This project is optional; has several milestones each worth a number of points [and swag].\n", 16 | "\n", 17 | "__Milestones:__\n", 18 | "* [10pts] Basic prototype of evolutionary strategies that works in one thread on CartPole\n", 19 | "* [+5pts] Modify the code to make them work in parallel\n", 20 | "* [+5pts] if you can run ES distributedly on at least two PCs\n", 21 | "* [+10pts] Apply ES to play Atari Pong at least better than random\n", 22 | "* [++] Additional points for all kinds of cool stuff besides milestones\n", 23 | "\n", 24 | "__Rules:__\n", 25 | "\n", 26 | "* This is __not a mandatory assignment__, but it's a way to learn some cool things if you're getting bored with default assignments.\n", 27 | "* Once you decided to take on this project, please tell any of course staff members so that we can help ypu if you get stuck.\n", 28 | "* There's a default implementation of ES in this [openai repo](https://github.com/openai/evolution-strategies-starter). It's okay to look there if you get stuck or want to compare your solutions, but each copy-pasted chunk of code should be understood thoroughly. We'll test that with questions." 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "### Tips on implementation\n", 36 | "\n", 37 | "* It would be very convenient later if you implemented a function that takes policy weights, generates a session and returns policy changes -- so that you could then run a bunch of them in parallel.\n", 38 | "\n", 39 | "* The simplest way you can do multiprocessing is to use [joblib](https://www.google.com/search?client=ubuntu&channel=fs&q=joblib&ie=utf-8&oe=utf-8)\n", 40 | "\n", 41 | "* For joblib, make sure random variables are independent in each job. Simply add `np.random.seed()` at the beginning of your \"job\" function.\n", 42 | "\n", 43 | "Later once you got distributed, you may need a storage that gathers gradients from all workers. In such case we recommend [Redis](https://redis.io/) due to it's simplicity.\n", 44 | "\n", 45 | "Here's a speed-optimized saver/loader to store numpy arrays in Redis as strings.\n", 46 | "\n" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": { 53 | "collapsed": true 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "import joblib\n", 58 | "from six import BytesIO\n", 59 | "def dumps(data):\n", 60 | " \"\"\"converts whatever to string\"\"\"\n", 61 | " s = BytesIO()\n", 62 | " joblib.dump(data,s)\n", 63 | " return s.getvalue()\n", 64 | " \n", 65 | "def loads(self,string):\n", 66 | " \"\"\"converts string to whatever was dumps'ed in it\"\"\"\n", 67 | " return joblib.load(BytesIO(string))\n" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "### Tips on atari games\n", 75 | "* There's all the pre-processing and tuning done for you in the code below\n", 76 | " * Images rescaled to 42x42 to speed up computation\n", 77 | " * We use last 4 frames as observations to account for ball velocity\n", 78 | " * The code below requires ```pip install Image``` and ```pip install gym[atari]``` \n", 79 | " * You may also need some dependencies for gym[atari] - google \"gym install all\" dependencies or use our pre-built environment.\n", 80 | "* The recommended agent architecture is a convolutional neural network. Dense network will also do.\n", 81 | "\n", 82 | "\n", 83 | "May the force be with you!" 
84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": { 90 | "collapsed": false 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "from pong import make_pong\n", 95 | "import numpy as np\n", 96 | "\n", 97 | "env = make_pong()\n", 98 | "print(env.action_space)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": { 105 | "collapsed": false 106 | }, 107 | "outputs": [], 108 | "source": [ 109 | "#get the initial state\n", 110 | "s = env.reset()\n", 111 | "print (s.shape)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": { 118 | "collapsed": false 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "import matplotlib.pyplot as plt\n", 123 | "%matplotlib inline\n", 124 | "#plot first observation. Only one frame\n", 125 | "plt.imshow(s.swapaxes(1,2).reshape(-1,s.shape[-1]).T)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": { 132 | "collapsed": false 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "#next frame\n", 137 | "new_s,r,done, _ = env.step(env.action_space.sample())\n", 138 | "plt.imshow(new_s.swapaxes(1,2).reshape(-1,s.shape[-1]).T)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": { 145 | "collapsed": false 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "#after 10 frames\n", 150 | "for _ in range(10):\n", 151 | " new_s,r,done, _ = env.step(env.action_space.sample())\n", 152 | "\n", 153 | "plt.imshow(new_s.swapaxes(1,2).reshape(-1,s.shape[-1]).T,vmin=0)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": { 160 | "collapsed": true 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "< tons of your code here or elsewhere >" 165 | ] 166 | } 167 | ], 168 | "metadata": { 169 | "kernelspec": { 170 | "display_name": "Python 2", 171 | "language": "python", 172 | "name": "python2" 173 | }, 174 | "language_info": { 175 | "codemirror_mode": { 176 | "name": "ipython", 177 | "version": 2 178 | }, 179 | "file_extension": ".py", 180 | "mimetype": "text/x-python", 181 | "name": "python", 182 | "nbconvert_exporter": "python", 183 | "pygments_lexer": "ipython2", 184 | "version": "2.7.13" 185 | } 186 | }, 187 | "nbformat": 4, 188 | "nbformat_minor": 2 189 | } 190 | -------------------------------------------------------------------------------- /week2_value_based/README.md: -------------------------------------------------------------------------------- 1 | ## Materials 2 | * [__Lecture slides__](https://docs.google.com/presentation/d/1Tnt4w0DDCwgGIo8Dh9-004veILxHekXOlTddmz0O5Tc/edit?usp=sharing) 3 | * Our videos: [lecture](https://yadi.sk/i/PeaLQ3IG3SeZML) [seminar](https://yadi.sk/i/hrnHB9DK3SeZRC) (russian) 4 | * __[main]__ lecture by David Silver - [url](https://www.youtube.com/watch?v=Nd1-UUMVfz4) 5 | * Alternative lecture by Pieter Abbeel (english): [part 1](https://www.youtube.com/watch?v=i0o-ui1N35U), [part 2](https://www.youtube.com/watch?v=Csiiv6WGzKM) 6 | * Alternative lecture by John Schulmann (english): [video](https://www.youtube.com/watch?v=IL3gVyJMmhg) 7 | * Definitive guide in policy/value iteration from Sutton: start from page 81 [here](http://incompleteideas.net/sutton/book/bookdraft2017june19.pdf). 8 | 9 | 10 | ## Materials: planning 11 | * Planning by dynamic programming (D. 
Silver) - [video](https://www.youtube.com/watch?v=Nd1-UUMVfz4) 12 | * Planning via tree search [videos 2-6 from CS188](https://www.youtube.com/channel/UCHBzJsIcRIVuzzHVYabikTQ) 13 | * Our lecture: 14 | * Slides [part1](https://yadi.sk/i/3PM9zCP33J3ub3) (intro), [part2](https://yadi.sk/i/M03xvZ2y3JMQre) (pomdp) 15 | * [Lecture](https://yadi.sk/i/lOAUu7o13JBHFz) & [seminar](https://yadi.sk/i/bkmjEZrk3JBHGF) 16 | * Monte-carlo tree search 17 | * Udacity video on monte-carlo tree search (first part of a chain) - [video](https://www.youtube.com/watch?v=onBYsen2_eA) 18 | * Reminder: UCB-1 - [slides](https://www.cs.bham.ac.uk/internal/courses/robotics/lectures/ucb1.pdf) 19 | * Monte-carlo tree search step-by-step by J.Levine - [video](https://www.youtube.com/watch?v=UXW2yZndl7U) 20 | * Guide to MCTS (monte-carlo tree search) - [post](http://www.cameronius.com/research/mcts/about/index.html) 21 | * Another guide to MCTS - [url](https://jeffbradberry.com/posts/2015/09/intro-to-monte-carlo-tree-search/) 22 | * Integrating learning and planning (D. Silver) - [video](https://www.youtube.com/watch?v=ItMutbeOHtc) 23 | * Approximating the MCTS optimal actions - 5vision solution for deephack.RL, code by Mikhail Pavlov - [repo](https://github.com/5vision/uct_atari) 24 | 25 | 26 | 27 | ## Homework description: 28 | 29 | The main assignment is `seminar1_VI.ipynb` notebook in this week's folder. 30 | 31 | If you're interested in model-based RL at scale, go through __Materials: planning__ section and proceed with `seminar2_MCTS.ipynb` notebook. 32 | -------------------------------------------------------------------------------- /week3_model_free/README.md: -------------------------------------------------------------------------------- 1 | #### __Lecture slides__ - [here](https://yadi.sk/i/54qWKtDB3NDeuh) 2 | ### Materials 3 | * Russian materials: 4 | - Lecture - [video](https://yadi.sk/i/jcQ1Bg8n3SrhuQ) 5 | - Q-learning seminar - [video](https://yadi.sk/i/dQmolwOy3EtGNK) 6 | - Sarsa & stuff - [seminar2](https://yadi.sk/i/XbqNQmjm3ExNsq) 7 | * English materials: 8 | - Lecture by David Silver (english) - [video part I](https://www.youtube.com/watch?v=PnHCvfgC_ZA), [video part II](https://www.youtube.com/watch?v=0g4j2k_Ggc4&t=43s) 9 | - Alternative lecture by Pieter Abbeel (english) - [video](https://www.youtube.com/watch?v=ifma8G7LegE) 10 | - Alternative lecture by John Schulmann (english) - [video](https://www.youtube.com/watch?v=IL3gVyJMmhg) 11 | - Blog post on q-learning Vs SARSA - [url](https://studywolf.wordpress.com/2013/07/01/reinforcement-learning-sarsa-vs-q-learning/) 12 | 13 | ### More materials 14 | * N-step temporal difference from Sutton's book - [suttonbook](http://incompleteideas.net/book/bookdraft2018jan1.pdf) __chapter 7__ 15 | * Eligibility traces from Sutton's book - [suttonbook](http://incompleteideas.net/book/bookdraft2018jan1.pdf) __chapter 12__ 16 | * Blog post on eligibility traces - [url](http://pierrelucbacon.com/traces/) 17 | 18 | ### Assignments 19 | 20 | This week's practice will require you to pick __one of__ `./seminar_main` and `./seminar_alternative` as first part. 21 | 22 | Then `./homework` and follow instructions in `./homework/homework.ipynb` 23 | 24 | Below are some guidelines on what to do in seminar_main/_alternative. 25 | 26 | ### ./seminar_main 27 | _this assignment borrows code from awesome [cs188](http://ai.berkeley.edu/project_overview.html)_ 28 | This homework assignment works on __python2 only__. If you stick to py3, consider seminar_alternative. 
Or just install it for this homework alone and remove afterwards. 29 | 30 | This homework also requires some physical display (e.g. laptop monitor). It won't work on binder VM / headless server. Please run it on laptop or consider ./seminar_alternative 31 | 32 | 33 | * You need to implement **QLearining** algorithm. If you're running go to ```seminar_main/``` folder and open file ```qlearningAgent.py```. 34 | 35 | Once you're done, run use those commands: 36 | ``` 37 | python crawler.py # Crawler with qlearning 38 | python pacman.py -p -x -n -l 39 | python pacman.py -p PacmanQAgent -x 5000 -n 5010 -l smallGrid # example 40 | ``` 41 | * Make sure you can tune agent to beat ./run_crawler.sh 42 | * on windows, just run `python crawler.py` from cmd in the project directory 43 | * other ./run* files are mostly for your amusement. 44 | * ./run_pacman.sh will need more epochs to converge, see [comments](https://github.com/yandexdataschool/Practical_RL/blob/master/week3/seminar_main/run_pacman.sh) 45 | * on windows, just copy the type `python pacman.py -p PacmanQAgent -x 2000 -n 2010 -l smallGrid` in cmd from assignemnt dir 46 | (YSDA/HSE) Please submit only qlearningAgents.py file and include a brief text report as comments in it. 47 | 48 | ### ./seminar_alternative 49 | 50 | You'll have to implement qlearning.py just like in main seminar, but in ./seminar_alternative folder. After you're done with it, open the seminar notebook and follow instructions from there. 51 | 52 | -------------------------------------------------------------------------------- /week3_model_free/homework/expected_value_sarsa.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import random, math 3 | import numpy as np 4 | 5 | class EVSarsaAgent: 6 | def __init__(self, alpha, epsilon, discount, get_legal_actions): 7 | """ 8 | Expected Value SARSA Agent. 9 | 10 | The two main methods are 11 | - self.getAction(state) - returns agent's action in that state 12 | - self.update(state,action,nextState,reward) - returns agent's next action 13 | 14 | Instance variables you have access to 15 | - self.epsilon (exploration prob) 16 | - self.alpha (learning rate) 17 | - self.discount (discount rate aka gamma) 18 | 19 | """ 20 | 21 | self.get_legal_actions = get_legal_actions 22 | self._qvalues = defaultdict(lambda: defaultdict(lambda: 0)) 23 | self.alpha = alpha 24 | self.epsilon = epsilon 25 | self.discount = discount 26 | 27 | def get_qvalue(self, state, action): 28 | """ Returns Q(state,action) """ 29 | return self._qvalues[state][action] 30 | 31 | def set_qvalue(self,state,action,value): 32 | """ Sets the Qvalue for [state,action] to the given value """ 33 | self._qvalues[state][action] = value 34 | 35 | #---------------------START OF YOUR CODE---------------------# 36 | 37 | def get_value(self, state): 38 | """ 39 | Returns Vpi for current state under epsilon-greedy policy: 40 | V_{pi}(s) = sum _{over a_i} {pi(a_i | s) * Q(s, a_i)} 41 | 42 | Hint: all other methods from QLearningAgent are still accessible. 
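Under epsilon-greedy, the greedy action a* = argmax_a Q(s, a) has probability
(1 - epsilon) + epsilon / |A| and every other action has probability epsilon / |A|,
so the sum above simplifies to
    V_{pi}(s) = (1 - epsilon) * max_a Q(s, a) + epsilon * mean_a Q(s, a).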
43 | """ 44 | epsilon = self.epsilon 45 | possible_actions = self.get_legal_actions(state) 46 | 47 | #If there are no legal actions, return 0.0 48 | if len(possible_actions) == 0: 49 | return 0.0 50 | 51 | 52 | # 53 | possible_values = [self.get_qvalue(state,action) for action in possible_actions] 54 | index = np.argmax(possible_values) 55 | state_value = epsilon * possible_values[index] + (1 - epsilon)*(np.sum(possible_values))/len(possible_actions) 56 | 57 | return state_value 58 | 59 | def update(self, state, action, reward, next_state): 60 | """ 61 | You should do your Q-Value update here: 62 | Q(s,a) := (1 - alpha) * Q(s,a) + alpha * (r + gamma * V(s')) 63 | """ 64 | 65 | #agent parameters 66 | gamma = self.discount 67 | learning_rate = self.alpha 68 | 69 | # 70 | q_value = (1-learning_rate)*self.get_qvalue(state,action) + learning_rate*(reward + gamma*self.get_value(next_state)) 71 | 72 | self.set_qvalue(state, action, q_value) 73 | 74 | 75 | def get_best_action(self, state): 76 | """ 77 | Compute the best action to take in a state (using current q-values). 78 | """ 79 | possible_actions = self.get_legal_actions(state) 80 | 81 | #If there are no legal actions, return None 82 | if len(possible_actions) == 0: 83 | return None 84 | 85 | possible_q_values = [self.get_qvalue(state,action) for action in possible_actions] 86 | index = np.argmax(possible_q_values) 87 | best_action = possible_actions[index] 88 | 89 | return best_action 90 | 91 | def get_action(self, state): 92 | """ 93 | Compute the action to take in the current state, including exploration. 94 | With probability self.epsilon, we should take a random action. 95 | otherwise - the best policy action (self.getPolicy). 96 | 97 | Note: To pick randomly from a list, use random.choice(list). 98 | To pick True or False with a given probablity, generate uniform number in [0, 1] 99 | and compare it with your probability 100 | """ 101 | 102 | # Pick Action 103 | possible_actions = self.get_legal_actions(state) 104 | action = None 105 | 106 | #If there are no legal actions, return None 107 | if len(possible_actions) == 0: 108 | return None 109 | 110 | #agent parameters: 111 | epsilon = self.epsilon 112 | 113 | # 114 | choice = np.random.random() > epsilon 115 | 116 | if choice: 117 | chosen_action = self.get_best_action(state) 118 | else: 119 | chosen_action = random.choice(possible_actions) 120 | 121 | return chosen_action -------------------------------------------------------------------------------- /week3_model_free/homework/expected_value_sarsa_epsilon_annealing.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import random, math 3 | import numpy as np 4 | 5 | class EVSarsaAgent: 6 | def __init__(self, alpha, epsilon, discount, get_legal_actions): 7 | """ 8 | Expected Value SARSA Agent. 
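This variant additionally anneals exploration: each call to get_action multiplies
self.epsilon by 0.99, so the agent becomes greedier over time.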
9 | 10 | The two main methods are 11 | - self.getAction(state) - returns agent's action in that state 12 | - self.update(state,action,nextState,reward) - returns agent's next action 13 | 14 | Instance variables you have access to 15 | - self.epsilon (exploration prob) 16 | - self.alpha (learning rate) 17 | - self.discount (discount rate aka gamma) 18 | 19 | """ 20 | 21 | self.get_legal_actions = get_legal_actions 22 | self._qvalues = defaultdict(lambda: defaultdict(lambda: 0)) 23 | self.alpha = alpha 24 | self.epsilon = epsilon 25 | self.discount = discount 26 | 27 | def get_qvalue(self, state, action): 28 | """ Returns Q(state,action) """ 29 | return self._qvalues[state][action] 30 | 31 | def set_qvalue(self,state,action,value): 32 | """ Sets the Qvalue for [state,action] to the given value """ 33 | self._qvalues[state][action] = value 34 | 35 | #---------------------START OF YOUR CODE---------------------# 36 | 37 | def get_value(self, state): 38 | """ 39 | Returns Vpi for current state under epsilon-greedy policy: 40 | V_{pi}(s) = sum _{over a_i} {pi(a_i | s) * Q(s, a_i)} 41 | 42 | Hint: all other methods from QLearningAgent are still accessible. 43 | """ 44 | epsilon = self.epsilon 45 | possible_actions = self.get_legal_actions(state) 46 | 47 | #If there are no legal actions, return 0.0 48 | if len(possible_actions) == 0: 49 | return 0.0 50 | 51 | 52 | # 53 | possible_values = [self.get_qvalue(state,action) for action in possible_actions] 54 | index = np.argmax(possible_values) 55 | state_value = epsilon * possible_values[index] + (1 - epsilon)*(np.sum(possible_values))/len(possible_actions) 56 | 57 | return state_value 58 | 59 | def update(self, state, action, reward, next_state): 60 | """ 61 | You should do your Q-Value update here: 62 | Q(s,a) := (1 - alpha) * Q(s,a) + alpha * (r + gamma * V(s')) 63 | """ 64 | 65 | #agent parameters 66 | gamma = self.discount 67 | learning_rate = self.alpha 68 | 69 | # 70 | q_value = (1-learning_rate)*self.get_qvalue(state,action) + learning_rate*(reward + gamma*self.get_value(next_state)) 71 | 72 | self.set_qvalue(state, action, q_value) 73 | 74 | 75 | def get_best_action(self, state): 76 | """ 77 | Compute the best action to take in a state (using current q-values). 78 | """ 79 | possible_actions = self.get_legal_actions(state) 80 | 81 | #If there are no legal actions, return None 82 | if len(possible_actions) == 0: 83 | return None 84 | 85 | possible_q_values = [self.get_qvalue(state,action) for action in possible_actions] 86 | index = np.argmax(possible_q_values) 87 | best_action = possible_actions[index] 88 | 89 | return best_action 90 | 91 | def get_action(self, state): 92 | """ 93 | Compute the action to take in the current state, including exploration. 94 | With probability self.epsilon, we should take a random action. 95 | otherwise - the best policy action (self.getPolicy). 96 | 97 | Note: To pick randomly from a list, use random.choice(list). 
98 | To pick True or False with a given probablity, generate uniform number in [0, 1] 99 | and compare it with your probability 100 | """ 101 | 102 | # Pick Action 103 | possible_actions = self.get_legal_actions(state) 104 | action = None 105 | 106 | #If there are no legal actions, return None 107 | if len(possible_actions) == 0: 108 | return None 109 | 110 | #agent parameters: 111 | epsilon = self.epsilon 112 | self.epsilon = 0.99*epsilon 113 | 114 | # 115 | choice = np.random.random() > epsilon 116 | 117 | if choice: 118 | chosen_action = self.get_best_action(state) 119 | else: 120 | chosen_action = random.choice(possible_actions) 121 | 122 | return chosen_action -------------------------------------------------------------------------------- /week3_model_free/homework/q_learning_epsilon_annealing.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import random, math 3 | import numpy as np 4 | 5 | class QLearningAgent: 6 | def __init__(self, alpha, epsilon, discount, get_legal_actions): 7 | """ 8 | Q-Learning Agent 9 | based on http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 10 | Instance variables you have access to 11 | - self.epsilon (exploration prob) 12 | - self.alpha (learning rate) 13 | - self.discount (discount rate aka gamma) 14 | 15 | Functions you should use 16 | - self.get_legal_actions(state) {state, hashable -> list of actions, each is hashable} 17 | which returns legal actions for a state 18 | - self.get_qvalue(state,action) 19 | which returns Q(state,action) 20 | - self.set_qvalue(state,action,value) 21 | which sets Q(state,action) := value 22 | 23 | !!!Important!!! 24 | Note: please avoid using self._qValues directly. 25 | There's a special self.get_qvalue/set_qvalue for that. 26 | """ 27 | 28 | self.get_legal_actions = get_legal_actions 29 | self._qvalues = defaultdict(lambda: defaultdict(lambda: 0)) 30 | self.alpha = alpha 31 | self.epsilon = epsilon 32 | self.discount = discount 33 | 34 | def get_qvalue(self, state, action): 35 | """ Returns Q(state,action) """ 36 | return self._qvalues[state][action] 37 | 38 | def set_qvalue(self,state,action,value): 39 | """ Sets the Qvalue for [state,action] to the given value """ 40 | self._qvalues[state][action] = value 41 | 42 | #---------------------START OF YOUR CODE---------------------# 43 | 44 | def get_value(self, state): 45 | """ 46 | Compute your agent's estimate of V(s) using current q-values 47 | V(s) = max_over_action Q(state,action) over possible actions. 48 | Note: please take into account that q-values can be negative. 
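(For example, if every Q(s, a) is negative, V(s) is still the largest of those
negative values -- do not initialize the running maximum with 0.)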
49 | """ 50 | possible_actions = self.get_legal_actions(state) 51 | 52 | #If there are no legal actions, return 0.0 53 | if len(possible_actions) == 0: 54 | return 0.0 55 | 56 | 57 | # 58 | possible_values = [self.get_qvalue(state,action) for action in possible_actions] 59 | state_value = np.max(possible_values) 60 | 61 | return state_value 62 | 63 | def update(self, state, action, reward, next_state): 64 | """ 65 | You should do your Q-Value update here: 66 | Q(s,a) := (1 - alpha) * Q(s,a) + alpha * (r + gamma * V(s')) 67 | """ 68 | 69 | #agent parameters 70 | gamma = self.discount 71 | learning_rate = self.alpha 72 | 73 | # 74 | q_value = (1-learning_rate)*self.get_qvalue(state,action) + learning_rate*(reward + gamma*self.get_value(next_state)) 75 | 76 | self.set_qvalue(state, action, q_value) 77 | 78 | 79 | def get_best_action(self, state): 80 | """ 81 | Compute the best action to take in a state (using current q-values). 82 | """ 83 | possible_actions = self.get_legal_actions(state) 84 | 85 | #If there are no legal actions, return None 86 | if len(possible_actions) == 0: 87 | return None 88 | 89 | possible_q_values = [self.get_qvalue(state,action) for action in possible_actions] 90 | index = np.argmax(possible_q_values) 91 | best_action = possible_actions[index] 92 | 93 | return best_action 94 | 95 | def get_action(self, state): 96 | """ 97 | Compute the action to take in the current state, including exploration. 98 | With probability self.epsilon, we should take a random action. 99 | otherwise - the best policy action (self.getPolicy). 100 | 101 | Note: To pick randomly from a list, use random.choice(list). 102 | To pick True or False with a given probablity, generate uniform number in [0, 1] 103 | and compare it with your probability 104 | """ 105 | 106 | # Pick Action 107 | possible_actions = self.get_legal_actions(state) 108 | action = None 109 | 110 | #If there are no legal actions, return None 111 | if len(possible_actions) == 0: 112 | return None 113 | 114 | #agent parameters: 115 | epsilon = self.epsilon 116 | self.epsilon = 0.99 * epsilon 117 | 118 | # 119 | choice = np.random.random() > epsilon 120 | 121 | if choice: 122 | chosen_action = self.get_best_action(state) 123 | else: 124 | chosen_action = random.choice(possible_actions) 125 | 126 | return chosen_action -------------------------------------------------------------------------------- /week3_model_free/homework/qlearning.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import random, math 3 | import numpy as np 4 | 5 | class QLearningAgent: 6 | def __init__(self, alpha, epsilon, discount, get_legal_actions): 7 | """ 8 | Q-Learning Agent 9 | based on http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 10 | Instance variables you have access to 11 | - self.epsilon (exploration prob) 12 | - self.alpha (learning rate) 13 | - self.discount (discount rate aka gamma) 14 | 15 | Functions you should use 16 | - self.get_legal_actions(state) {state, hashable -> list of actions, each is hashable} 17 | which returns legal actions for a state 18 | - self.get_qvalue(state,action) 19 | which returns Q(state,action) 20 | - self.set_qvalue(state,action,value) 21 | which sets Q(state,action) := value 22 | 23 | !!!Important!!! 24 | Note: please avoid using self._qValues directly. 25 | There's a special self.get_qvalue/set_qvalue for that. 
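Typical training-loop usage, as a sketch assuming a gym-style environment:
    s = env.reset()
    a = agent.get_action(s)
    next_s, r, done, _ = env.step(a)
    agent.update(s, a, r, next_s)
    s = next_s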
26 | """ 27 | 28 | self.get_legal_actions = get_legal_actions 29 | self._qvalues = defaultdict(lambda: defaultdict(lambda: 0)) 30 | self.alpha = alpha 31 | self.epsilon = epsilon 32 | self.discount = discount 33 | 34 | def get_qvalue(self, state, action): 35 | """ Returns Q(state,action) """ 36 | return self._qvalues[state][action] 37 | 38 | def set_qvalue(self,state,action,value): 39 | """ Sets the Qvalue for [state,action] to the given value """ 40 | self._qvalues[state][action] = value 41 | 42 | #---------------------START OF YOUR CODE---------------------# 43 | 44 | def get_value(self, state): 45 | """ 46 | Compute your agent's estimate of V(s) using current q-values 47 | V(s) = max_over_action Q(state,action) over possible actions. 48 | Note: please take into account that q-values can be negative. 49 | """ 50 | possible_actions = self.get_legal_actions(state) 51 | 52 | #If there are no legal actions, return 0.0 53 | if len(possible_actions) == 0: 54 | return 0.0 55 | 56 | 57 | # 58 | possible_values = [self.get_qvalue(state,action) for action in possible_actions] 59 | state_value = np.max(possible_values) 60 | 61 | return state_value 62 | 63 | def update(self, state, action, reward, next_state): 64 | """ 65 | You should do your Q-Value update here: 66 | Q(s,a) := (1 - alpha) * Q(s,a) + alpha * (r + gamma * V(s')) 67 | """ 68 | 69 | #agent parameters 70 | gamma = self.discount 71 | learning_rate = self.alpha 72 | 73 | # 74 | q_value = (1-learning_rate)*self.get_qvalue(state,action) + learning_rate*(reward + gamma*self.get_value(next_state)) 75 | 76 | self.set_qvalue(state, action, q_value) 77 | 78 | 79 | def get_best_action(self, state): 80 | """ 81 | Compute the best action to take in a state (using current q-values). 82 | """ 83 | possible_actions = self.get_legal_actions(state) 84 | 85 | #If there are no legal actions, return None 86 | if len(possible_actions) == 0: 87 | return None 88 | 89 | possible_q_values = [self.get_qvalue(state,action) for action in possible_actions] 90 | index = np.argmax(possible_q_values) 91 | best_action = possible_actions[index] 92 | 93 | return best_action 94 | 95 | def get_action(self, state): 96 | """ 97 | Compute the action to take in the current state, including exploration. 98 | With probability self.epsilon, we should take a random action. 99 | otherwise - the best policy action (self.getPolicy). 100 | 101 | Note: To pick randomly from a list, use random.choice(list). 102 | To pick True or False with a given probablity, generate uniform number in [0, 1] 103 | and compare it with your probability 104 | """ 105 | 106 | # Pick Action 107 | possible_actions = self.get_legal_actions(state) 108 | action = None 109 | 110 | #If there are no legal actions, return None 111 | if len(possible_actions) == 0: 112 | return None 113 | 114 | #agent parameters: 115 | epsilon = self.epsilon 116 | 117 | # 118 | choice = np.random.random() > epsilon 119 | 120 | if choice: 121 | chosen_action = self.get_best_action(state) 122 | else: 123 | chosen_action = random.choice(possible_actions) 124 | 125 | return chosen_action -------------------------------------------------------------------------------- /week3_model_free/homework/sarsa.py: -------------------------------------------------------------------------------- 1 | """ 2 | Expected Value SARSA 3 | This file builds upon the same functions as Q-learning agent (qlearning.py). 4 | 5 | [assignment] 6 | The only thing you must implement is the getValue method. 
7 | - Recall that V(s) in SARSA is not the maximal but the expected Q-value. 8 | - The expectation should be done under agent's policy (e-greedy). 9 | 10 | 11 | Here's usage example: 12 | >>>from sarsa import SarsaAgent 13 | 14 | >>>agent = SarsaAgent(alpha=0.1,epsilon=0.25,discount=0.99, 15 | getLegalActions = lambda s: actions_from_that_state) 16 | >>>action = agent.getAction(state) 17 | >>>agent.update(state,action, next_state,reward) 18 | >>>agent.epsilon *= 0.99 19 | """ 20 | import random,math 21 | 22 | import numpy as np 23 | from collections import defaultdict 24 | 25 | class SarsaAgent(): 26 | """ 27 | Classical SARSA agent. 28 | 29 | The two main methods are 30 | - self.getAction(state) - returns agent's action in that state 31 | - self.update(state,action,reward,nextState,nextAction) - returns agent's next action 32 | 33 | Instance variables you have access to 34 | - self.epsilon (exploration prob) 35 | - self.alpha (learning rate) 36 | - self.discount (discount rate aka gamma) 37 | 38 | """ 39 | def __init__(self,alpha,epsilon,discount,getLegalActions): 40 | "We initialize agent and Q-values here." 41 | self.getLegalActions= getLegalActions 42 | self._qValues = defaultdict(lambda:defaultdict(lambda:0)) 43 | self.alpha = alpha 44 | self.epsilon = epsilon 45 | self.discount = discount 46 | 47 | def getQValue(self, state, action): 48 | """ 49 | Returns Q(state,action) 50 | """ 51 | return self._qValues[state][action] 52 | 53 | def setQValue(self,state,action,value): 54 | """ 55 | Sets the Qvalue for [state,action] to the given value 56 | """ 57 | self._qValues[state][action] = value 58 | 59 | #---------------------#start of your code#---------------------# 60 | 61 | def getPolicy(self, state): 62 | """ 63 | Compute the best action to take in a state. 64 | 65 | """ 66 | possibleActions = self.getLegalActions(state) 67 | 68 | #If there are no legal actions, return None 69 | if len(possibleActions) == 0: 70 | return None 71 | 72 | best_action = None 73 | 74 | "*** this code works exactly as Q-learning ***" 75 | best_action = possibleActions[np.argmax([self.getQValue(state, a) for a in possibleActions])] 76 | return best_action 77 | 78 | def getAction(self, state): 79 | """ 80 | Compute the action to take in the current state, including exploration. 81 | 82 | With probability self.epsilon, we should take a random action. 83 | otherwise - the best policy action (self.getPolicy). 
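        Written out, each legal action a is picked with probability
            epsilon / |A(s)|  +  (1 - epsilon) * [a is the greedy action]
        (a sketch; this is also the distribution the expected value is taken over).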
84 | 85 | HINT: You might want to use util.flipCoin(prob) 86 | HINT: To pick randomly from a list, use random.choice(list) 87 | 88 | """ 89 | 90 | # Pick Action 91 | possibleActions = self.getLegalActions(state) 92 | action = None 93 | 94 | #If there are no legal actions, return None 95 | if len(possibleActions) == 0: 96 | return None 97 | 98 | #agent parameters: 99 | epsilon = self.epsilon 100 | 101 | "*** Epsilon-greedy strategy exactly as Q-learning ***" 102 | if np.random.random()<=epsilon: 103 | return random.choice(possibleActions) 104 | else: 105 | action = self.getPolicy(state) 106 | return action 107 | 108 | def update(self, state, action, nextState,nextAction, reward): 109 | """ 110 | You should do your Q-Value update here 111 | 112 | NOTE: You should never call this function, 113 | it will be called on your behalf 114 | 115 | 116 | """ 117 | #agent parameters 118 | gamma = self.discount 119 | learning_rate = self.alpha 120 | 121 | "*** YOUR CODE HERE ***" 122 | reference_qvalue = 123 | 124 | updated_qvalue = (1-learning_rate) * self.getQValue(state,action) + learning_rate * reference_qvalue 125 | 126 | self.setQValue(state,action,updated_qvalue) 127 | 128 | 129 | #---------------------#end of your code#---------------------# 130 | 131 | 132 | -------------------------------------------------------------------------------- /week3_model_free/seminar_alternative/qlearning.py: -------------------------------------------------------------------------------- 1 | # qlearningAgents.py 2 | # ------------------ 3 | ## based on http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 4 | 5 | import random,math 6 | 7 | import numpy as np 8 | from collections import defaultdict 9 | 10 | class QLearningAgent(): 11 | """ 12 | Q-Learning Agent 13 | 14 | Instance variables you have access to 15 | - self.epsilon (exploration prob) 16 | - self.alpha (learning rate) 17 | - self.discount (discount rate aka gamma) 18 | 19 | Functions you should use 20 | - self.getLegalActions(state) 21 | which returns legal actions for a state 22 | - self.getQValue(state,action) 23 | which returns Q(state,action) 24 | - self.setQValue(state,action,value) 25 | which sets Q(state,action) := value 26 | 27 | !!!Important!!! 28 | NOTE: please avoid using self._qValues directly to make code cleaner 29 | """ 30 | def __init__(self,alpha,epsilon,discount,getLegalActions): 31 | "We initialize agent and Q-values here." 32 | self.getLegalActions= getLegalActions 33 | self._qValues = defaultdict(lambda:defaultdict(lambda:0)) 34 | self.alpha = alpha 35 | self.epsilon = epsilon 36 | self.discount = discount 37 | 38 | def getQValue(self, state, action): 39 | """ 40 | Returns Q(state,action) 41 | """ 42 | return self._qValues[state][action] 43 | 44 | def setQValue(self,state,action,value): 45 | """ 46 | Sets the Qvalue for [state,action] to the given value 47 | """ 48 | self._qValues[state][action] = value 49 | 50 | #---------------------#start of your code#---------------------# 51 | 52 | def getValue(self, state): 53 | """ 54 | Returns max_action Q(state,action) 55 | where the max is over legal actions. 56 | """ 57 | 58 | possibleActions = self.getLegalActions(state) 59 | #If there are no legal actions, return 0.0 60 | if len(possibleActions) == 0: 61 | return 0.0 62 | 63 | "*** YOUR CODE HERE ***" 64 | return 65 | 66 | def getPolicy(self, state): 67 | """ 68 | Compute the best action to take in a state. 
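        One possible one-liner for the blank below (a sketch):
            best_action = max(possibleActions, key=lambda a: self.getQValue(state, a))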
69 | 70 | """ 71 | possibleActions = self.getLegalActions(state) 72 | 73 | #If there are no legal actions, return None 74 | if len(possibleActions) == 0: 75 | return None 76 | 77 | best_action = None 78 | 79 | "*** YOUR CODE HERE ***" 80 | best_action = 81 | return best_action 82 | 83 | def getAction(self, state): 84 | """ 85 | Compute the action to take in the current state, including exploration. 86 | 87 | With probability self.epsilon, we should take a random action. 88 | otherwise - the best policy action (self.getPolicy). 89 | 90 | HINT: You might want to use util.flipCoin(prob) 91 | HINT: To pick randomly from a list, use random.choice(list) 92 | 93 | """ 94 | 95 | # Pick Action 96 | possibleActions = self.getLegalActions(state) 97 | action = None 98 | 99 | #If there are no legal actions, return None 100 | if len(possibleActions) == 0: 101 | return None 102 | 103 | #agent parameters: 104 | epsilon = self.epsilon 105 | 106 | "*** YOUR CODE HERE ***" 107 | 108 | return 109 | 110 | def update(self, state, action, nextState, reward): 111 | """ 112 | You should do your Q-Value update here 113 | 114 | NOTE: You should never call this function, 115 | it will be called on your behalf 116 | 117 | 118 | """ 119 | #agent parameters 120 | gamma = self.discount 121 | learning_rate = self.alpha 122 | 123 | "*** YOUR CODE HERE ***" 124 | reference_qvalue = 125 | 126 | updated_qvalue = (1-learning_rate) * self.getQValue(state,action) + learning_rate * reference_qvalue 127 | self.setQValue(state,action,updated_qvalue) 128 | 129 | 130 | #---------------------#end of your code#---------------------# 131 | 132 | 133 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/analysis.py: -------------------------------------------------------------------------------- 1 | # analysis.py 2 | # ----------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | ###################### 10 | # ANALYSIS QUESTIONS # 11 | ###################### 12 | 13 | # Change these default values to obtain the specified policies through 14 | # value iteration. 
15 | 16 | def question2a(): 17 | answerDiscount = 0.9 18 | answerNoise = 0.2 19 | answerLivingReward = 0.0 20 | return answerDiscount, answerNoise, answerLivingReward 21 | # If not possible, return 'NOT POSSIBLE' 22 | 23 | def question2b(): 24 | answerDiscount = 0.9 25 | answerNoise = 0.2 26 | answerLivingReward = 0.0 27 | return answerDiscount, answerNoise, answerLivingReward 28 | # If not possible, return 'NOT POSSIBLE' 29 | 30 | def question2c(): 31 | answerDiscount = 0.9 32 | answerNoise = 0.2 33 | answerLivingReward = 0.0 34 | return answerDiscount, answerNoise, answerLivingReward 35 | # If not possible, return 'NOT POSSIBLE' 36 | 37 | def question2d(): 38 | answerDiscount = 0.9 39 | answerNoise = 0.2 40 | answerLivingReward = 0.0 41 | return answerDiscount, answerNoise, answerLivingReward 42 | # If not possible, return 'NOT POSSIBLE' 43 | 44 | def question2e(): 45 | answerDiscount = 0.9 46 | answerNoise = 0.2 47 | answerLivingReward = 0.0 48 | return answerDiscount, answerNoise, answerLivingReward 49 | # If not possible, return 'NOT POSSIBLE' 50 | 51 | if __name__ == '__main__': 52 | print 'Answers to analysis questions:' 53 | import analysis 54 | for q in [q for q in dir(analysis) if q.startswith('question')]: 55 | response = getattr(analysis, q)() 56 | print ' Question %s:\t%s' % (q, str(response)) 57 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/environment.py: -------------------------------------------------------------------------------- 1 | # environment.py 2 | # -------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | #!/usr/bin/python 10 | 11 | class Environment: 12 | 13 | def getCurrentState(self): 14 | """ 15 | Returns the current state of enviornment 16 | """ 17 | abstract 18 | 19 | def getPossibleActions(self, state): 20 | """ 21 | Returns possible actions the agent 22 | can take in the given state. Can 23 | return the empty list if we are in 24 | a terminal state. 25 | """ 26 | abstract 27 | 28 | def doAction(self, action): 29 | """ 30 | Performs the given action in the current 31 | environment state and updates the enviornment. 32 | 33 | Returns a (reward, nextState) pair 34 | """ 35 | abstract 36 | 37 | def reset(self): 38 | """ 39 | Resets the current state to the start state 40 | """ 41 | abstract 42 | 43 | def isTerminal(self): 44 | """ 45 | Has the enviornment entered a terminal 46 | state? This means there are no successors 47 | """ 48 | state = self.getCurrentState() 49 | actions = self.getPossibleActions(state) 50 | return len(actions) == 0 51 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/featureExtractors.py: -------------------------------------------------------------------------------- 1 | # featureExtractors.py 2 | # -------------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 
7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | "Feature extractors for Pacman game states" 10 | 11 | from game import Directions, Actions 12 | import util 13 | 14 | class FeatureExtractor: 15 | def getFeatures(self, state, action): 16 | """ 17 | Returns a dict from features to counts 18 | Usually, the count will just be 1.0 for 19 | indicator functions. 20 | """ 21 | util.raiseNotDefined() 22 | 23 | class IdentityExtractor(FeatureExtractor): 24 | def getFeatures(self, state, action): 25 | feats = util.Counter() 26 | feats[(state,action)] = 1.0 27 | return feats 28 | 29 | def closestFood(pos, food, walls): 30 | """ 31 | closestFood -- this is similar to the function that we have 32 | worked on in the search project; here its all in one place 33 | """ 34 | fringe = [(pos[0], pos[1], 0)] 35 | expanded = set() 36 | while fringe: 37 | pos_x, pos_y, dist = fringe.pop(0) 38 | if (pos_x, pos_y) in expanded: 39 | continue 40 | expanded.add((pos_x, pos_y)) 41 | # if we find a food at this location then exit 42 | if food[pos_x][pos_y]: 43 | return dist 44 | # otherwise spread out from the location to its neighbours 45 | nbrs = Actions.getLegalNeighbors((pos_x, pos_y), walls) 46 | for nbr_x, nbr_y in nbrs: 47 | fringe.append((nbr_x, nbr_y, dist+1)) 48 | # no food found 49 | return None 50 | 51 | class SimpleExtractor(FeatureExtractor): 52 | """ 53 | Returns simple features for a basic reflex Pacman: 54 | - whether food will be eaten 55 | - how far away the next food is 56 | - whether a ghost collision is imminent 57 | - whether a ghost is one step away 58 | """ 59 | 60 | def getFeatures(self, state, action): 61 | # extract the grid of food and wall locations and get the ghost locations 62 | food = state.getFood() 63 | walls = state.getWalls() 64 | ghosts = state.getGhostPositions() 65 | 66 | features = util.Counter() 67 | 68 | features["bias"] = 1.0 69 | 70 | # compute the location of pacman after he takes the action 71 | x, y = state.getPacmanPosition() 72 | dx, dy = Actions.directionToVector(action) 73 | next_x, next_y = int(x + dx), int(y + dy) 74 | 75 | # count the number of ghosts 1-step away 76 | features["#-of-ghosts-1-step-away"] = sum((next_x, next_y) in Actions.getLegalNeighbors(g, walls) for g in ghosts) 77 | 78 | # if there is no danger of ghosts then add the food feature 79 | if not features["#-of-ghosts-1-step-away"] and food[next_x][next_y]: 80 | features["eats-food"] = 1.0 81 | 82 | dist = closestFood((next_x, next_y), food, walls) 83 | if dist is not None: 84 | # make the distance a number less than one otherwise the update 85 | # will diverge wildly 86 | features["closest-food"] = float(dist) / (walls.width * walls.height) 87 | features.divideAll(10.0) 88 | return features -------------------------------------------------------------------------------- /week3_model_free/seminar_main/ghostAgents.py: -------------------------------------------------------------------------------- 1 | # ghostAgents.py 2 | # -------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 
7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | from game import Agent 10 | from game import Actions 11 | from game import Directions 12 | import random 13 | from util import manhattanDistance 14 | import util 15 | 16 | class GhostAgent( Agent ): 17 | def __init__( self, index ): 18 | self.index = index 19 | 20 | def getAction( self, state ): 21 | dist = self.getDistribution(state) 22 | if len(dist) == 0: 23 | return Directions.STOP 24 | else: 25 | return util.chooseFromDistribution( dist ) 26 | 27 | def getDistribution(self, state): 28 | "Returns a Counter encoding a distribution over actions from the provided state." 29 | util.raiseNotDefined() 30 | 31 | class RandomGhost( GhostAgent ): 32 | "A ghost that chooses a legal action uniformly at random." 33 | def getDistribution( self, state ): 34 | dist = util.Counter() 35 | for a in state.getLegalActions( self.index ): dist[a] = 1.0 36 | dist.normalize() 37 | return dist 38 | 39 | class DirectionalGhost( GhostAgent ): 40 | "A ghost that prefers to rush Pacman, or flee when scared." 41 | def __init__( self, index, prob_attack=0.8, prob_scaredFlee=0.8 ): 42 | self.index = index 43 | self.prob_attack = prob_attack 44 | self.prob_scaredFlee = prob_scaredFlee 45 | 46 | def getDistribution( self, state ): 47 | # Read variables from state 48 | ghostState = state.getGhostState( self.index ) 49 | legalActions = state.getLegalActions( self.index ) 50 | pos = state.getGhostPosition( self.index ) 51 | isScared = ghostState.scaredTimer > 0 52 | 53 | speed = 1 54 | if isScared: speed = 0.5 55 | 56 | actionVectors = [Actions.directionToVector( a, speed ) for a in legalActions] 57 | newPositions = [( pos[0]+a[0], pos[1]+a[1] ) for a in actionVectors] 58 | pacmanPosition = state.getPacmanPosition() 59 | 60 | # Select best actions given the state 61 | distancesToPacman = [manhattanDistance( pos, pacmanPosition ) for pos in newPositions] 62 | if isScared: 63 | bestScore = max( distancesToPacman ) 64 | bestProb = self.prob_scaredFlee 65 | else: 66 | bestScore = min( distancesToPacman ) 67 | bestProb = self.prob_attack 68 | bestActions = [action for action, distance in zip( legalActions, distancesToPacman ) if distance == bestScore] 69 | 70 | # Construct distribution 71 | dist = util.Counter() 72 | for a in bestActions: dist[a] = bestProb / len(bestActions) 73 | for a in legalActions: dist[a] += ( 1-bestProb ) / len(legalActions) 74 | dist.normalize() 75 | return dist 76 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/how2run: -------------------------------------------------------------------------------- 1 | python pacman.py -p PacmanQAgent -x 5000 -n 5010 -l smallGrid 2 | python pacman.py -p PacmanQAgent -x 10000 -n 10010 -l mediumGrid 3 | python pacman.py -p PacmanQAgent -x 100 -n 110 -l mediumClassic 4 | python gridworld.py -a q -k 50 -n 0 -g BridgeGrid -e 1 5 | python crawler.py 6 | 7 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/keyboardAgents.py: -------------------------------------------------------------------------------- 1 | # keyboardAgents.py 2 | # ----------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. 
The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | from game import Agent 10 | from game import Directions 11 | import random 12 | 13 | class KeyboardAgent(Agent): 14 | """ 15 | An agent controlled by the keyboard. 16 | """ 17 | # NOTE: Arrow keys also work. 18 | WEST_KEY = 'a' 19 | EAST_KEY = 'd' 20 | NORTH_KEY = 'w' 21 | SOUTH_KEY = 's' 22 | STOP_KEY = 'q' 23 | 24 | def __init__( self, index = 0 ): 25 | 26 | self.lastMove = Directions.STOP 27 | self.index = index 28 | self.keys = [] 29 | 30 | def getAction( self, state): 31 | from graphicsUtils import keys_waiting 32 | from graphicsUtils import keys_pressed 33 | keys = keys_waiting() + keys_pressed() 34 | if keys != []: 35 | self.keys = keys 36 | 37 | legal = state.getLegalActions(self.index) 38 | move = self.getMove(legal) 39 | 40 | if move == Directions.STOP: 41 | # Try to move in the same direction as before 42 | if self.lastMove in legal: 43 | move = self.lastMove 44 | 45 | if (self.STOP_KEY in self.keys) and Directions.STOP in legal: move = Directions.STOP 46 | 47 | if move not in legal: 48 | move = random.choice(legal) 49 | 50 | self.lastMove = move 51 | return move 52 | 53 | def getMove(self, legal): 54 | move = Directions.STOP 55 | if (self.WEST_KEY in self.keys or 'Left' in self.keys) and Directions.WEST in legal: move = Directions.WEST 56 | if (self.EAST_KEY in self.keys or 'Right' in self.keys) and Directions.EAST in legal: move = Directions.EAST 57 | if (self.NORTH_KEY in self.keys or 'Up' in self.keys) and Directions.NORTH in legal: move = Directions.NORTH 58 | if (self.SOUTH_KEY in self.keys or 'Down' in self.keys) and Directions.SOUTH in legal: move = Directions.SOUTH 59 | return move 60 | 61 | class KeyboardAgent2(KeyboardAgent): 62 | """ 63 | A second agent controlled by the keyboard. 64 | """ 65 | # NOTE: Arrow keys also work. 66 | WEST_KEY = 'j' 67 | EAST_KEY = "l" 68 | NORTH_KEY = 'i' 69 | SOUTH_KEY = 'k' 70 | STOP_KEY = 'u' 71 | 72 | def getMove(self, legal): 73 | move = Directions.STOP 74 | if (self.WEST_KEY in self.keys) and Directions.WEST in legal: move = Directions.WEST 75 | if (self.EAST_KEY in self.keys) and Directions.EAST in legal: move = Directions.EAST 76 | if (self.NORTH_KEY in self.keys) and Directions.NORTH in legal: move = Directions.NORTH 77 | if (self.SOUTH_KEY in self.keys) and Directions.SOUTH in legal: move = Directions.SOUTH 78 | return move 79 | 80 | 81 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/layout.py: -------------------------------------------------------------------------------- 1 | # layout.py 2 | # --------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | from util import manhattanDistance 10 | from game import Grid 11 | import os 12 | import random 13 | 14 | VISIBILITY_MATRIX_CACHE = {} 15 | 16 | class Layout: 17 | """ 18 | A Layout manages the static information about the game board. 
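    Typical access pattern (a sketch; mediumClassic is one of the bundled layouts):
        lay = getLayout('mediumClassic')
        print lay.width, lay.height, lay.getNumGhosts()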
19 | """ 20 | 21 | def __init__(self, layoutText): 22 | self.width = len(layoutText[0]) 23 | self.height= len(layoutText) 24 | self.walls = Grid(self.width, self.height, False) 25 | self.food = Grid(self.width, self.height, False) 26 | self.capsules = [] 27 | self.agentPositions = [] 28 | self.numGhosts = 0 29 | self.processLayoutText(layoutText) 30 | self.layoutText = layoutText 31 | # self.initializeVisibilityMatrix() 32 | 33 | def getNumGhosts(self): 34 | return self.numGhosts 35 | 36 | def initializeVisibilityMatrix(self): 37 | global VISIBILITY_MATRIX_CACHE 38 | if reduce(str.__add__, self.layoutText) not in VISIBILITY_MATRIX_CACHE: 39 | from game import Directions 40 | vecs = [(-0.5,0), (0.5,0),(0,-0.5),(0,0.5)] 41 | dirs = [Directions.NORTH, Directions.SOUTH, Directions.WEST, Directions.EAST] 42 | vis = Grid(self.width, self.height, {Directions.NORTH:set(), Directions.SOUTH:set(), Directions.EAST:set(), Directions.WEST:set(), Directions.STOP:set()}) 43 | for x in range(self.width): 44 | for y in range(self.height): 45 | if self.walls[x][y] == False: 46 | for vec, direction in zip(vecs, dirs): 47 | dx, dy = vec 48 | nextx, nexty = x + dx, y + dy 49 | while (nextx + nexty) != int(nextx) + int(nexty) or not self.walls[int(nextx)][int(nexty)] : 50 | vis[x][y][direction].add((nextx, nexty)) 51 | nextx, nexty = x + dx, y + dy 52 | self.visibility = vis 53 | VISIBILITY_MATRIX_CACHE[reduce(str.__add__, self.layoutText)] = vis 54 | else: 55 | self.visibility = VISIBILITY_MATRIX_CACHE[reduce(str.__add__, self.layoutText)] 56 | 57 | def isWall(self, pos): 58 | x, col = pos 59 | return self.walls[x][col] 60 | 61 | def getRandomLegalPosition(self): 62 | x = random.choice(range(self.width)) 63 | y = random.choice(range(self.height)) 64 | while self.isWall( (x, y) ): 65 | x = random.choice(range(self.width)) 66 | y = random.choice(range(self.height)) 67 | return (x,y) 68 | 69 | def getRandomCorner(self): 70 | poses = [(1,1), (1, self.height - 2), (self.width - 2, 1), (self.width - 2, self.height - 2)] 71 | return random.choice(poses) 72 | 73 | def getFurthestCorner(self, pacPos): 74 | poses = [(1,1), (1, self.height - 2), (self.width - 2, 1), (self.width - 2, self.height - 2)] 75 | dist, pos = max([(manhattanDistance(p, pacPos), p) for p in poses]) 76 | return pos 77 | 78 | def isVisibleFrom(self, ghostPos, pacPos, pacDirection): 79 | row, col = [int(x) for x in pacPos] 80 | return ghostPos in self.visibility[row][col][pacDirection] 81 | 82 | def __str__(self): 83 | return "\n".join(self.layoutText) 84 | 85 | def deepCopy(self): 86 | return Layout(self.layoutText[:]) 87 | 88 | def processLayoutText(self, layoutText): 89 | """ 90 | Coordinates are flipped from the input format to the (x,y) convention here 91 | 92 | The shape of the maze. Each character 93 | represents a different type of object. 94 | % - Wall 95 | . - Food 96 | o - Capsule 97 | G - Ghost 98 | P - Pacman 99 | Other characters are ignored. 
100 | """ 101 | maxY = self.height - 1 102 | for y in range(self.height): 103 | for x in range(self.width): 104 | layoutChar = layoutText[maxY - y][x] 105 | self.processLayoutChar(x, y, layoutChar) 106 | self.agentPositions.sort() 107 | self.agentPositions = [ ( i == 0, pos) for i, pos in self.agentPositions] 108 | 109 | def processLayoutChar(self, x, y, layoutChar): 110 | if layoutChar == '%': 111 | self.walls[x][y] = True 112 | elif layoutChar == '.': 113 | self.food[x][y] = True 114 | elif layoutChar == 'o': 115 | self.capsules.append((x, y)) 116 | elif layoutChar == 'P': 117 | self.agentPositions.append( (0, (x, y) ) ) 118 | elif layoutChar in ['G']: 119 | self.agentPositions.append( (1, (x, y) ) ) 120 | self.numGhosts += 1 121 | elif layoutChar in ['1', '2', '3', '4']: 122 | self.agentPositions.append( (int(layoutChar), (x,y))) 123 | self.numGhosts += 1 124 | def getLayout(name, back = 2): 125 | if name.endswith('.lay'): 126 | layout = tryToLoad('layouts/' + name) 127 | if layout == None: layout = tryToLoad(name) 128 | else: 129 | layout = tryToLoad('layouts/' + name + '.lay') 130 | if layout == None: layout = tryToLoad(name + '.lay') 131 | if layout == None and back >= 0: 132 | curdir = os.path.abspath('.') 133 | os.chdir('..') 134 | layout = getLayout(name, back -1) 135 | os.chdir(curdir) 136 | return layout 137 | 138 | def tryToLoad(fullname): 139 | if(not os.path.exists(fullname)): return None 140 | f = open(fullname) 141 | try: return Layout([line.strip() for line in f]) 142 | finally: f.close() -------------------------------------------------------------------------------- /week3_model_free/seminar_main/layouts/capsuleClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%% 2 | %G. G ....% 3 | %.% % %%%%%% %.%%.% 4 | %.%o% % o% %.o%.% 5 | %.%%%.% %%% %..%.% 6 | %..... P %..%G% 7 | %%%%%%%%%%%%%%%%%%%% 8 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/layouts/contestClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%% 2 | %o...%........%...o% 3 | %.%%.%.%%..%%.%.%%.% 4 | %...... G GG%......% 5 | %.%.%%.%% %%%.%%.%.% 6 | %.%....% ooo%.%..%.% 7 | %.%.%%.% %% %.%.%%.% 8 | %o%......P....%....% 9 | %%%%%%%%%%%%%%%%%%%% 10 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/layouts/mediumClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%% 2 | %o...%........%....% 3 | %.%%.%.%%%%%%.%.%%.% 4 | %.%..............%.% 5 | %.%.%%.%% %%.%%.%.% 6 | %......%G G%......% 7 | %.%.%%.%%%%%%.%%.%.% 8 | %.%..............%.% 9 | %.%%.%.%%%%%%.%.%%.% 10 | %....%...P....%...o% 11 | %%%%%%%%%%%%%%%%%%%% 12 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/layouts/mediumGrid.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%% 2 | %P % 3 | % .% . % 4 | % % % 5 | % .% . 
% 6 | % G% 7 | %%%%%%%% 8 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/layouts/minimaxClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%% 2 | %.P G% 3 | % %.%G%%% 4 | %G %%% 5 | %%%%%%%%% 6 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/layouts/openClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%%%%%%% 2 | %.. P .... .... % 3 | %.. ... ... ... ... % 4 | %.. ... ... ... ... % 5 | %.. .... .... G % 6 | %.. ... ... ... ... % 7 | %.. ... ... ... ... % 8 | %.. .... .... o% 9 | %%%%%%%%%%%%%%%%%%%%%%%%% 10 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/layouts/originalClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | %............%%............% 3 | %.%%%%.%%%%%.%%.%%%%%.%%%%.% 4 | %o%%%%.%%%%%.%%.%%%%%.%%%%o% 5 | %.%%%%.%%%%%.%%.%%%%%.%%%%.% 6 | %..........................% 7 | %.%%%%.%%.%%%%%%%%.%%.%%%%.% 8 | %.%%%%.%%.%%%%%%%%.%%.%%%%.% 9 | %......%%....%%....%%......% 10 | %%%%%%.%%%%% %% %%%%%.%%%%%% 11 | %%%%%%.%%%%% %% %%%%%.%%%%%% 12 | %%%%%%.% %.%%%%%% 13 | %%%%%%.% %%%% %%%% %.%%%%%% 14 | % . %G GG G% . % 15 | %%%%%%.% %%%%%%%%%% %.%%%%%% 16 | %%%%%%.% %.%%%%%% 17 | %%%%%%.% %%%%%%%%%% %.%%%%%% 18 | %............%%............% 19 | %.%%%%.%%%%%.%%.%%%%%.%%%%.% 20 | %.%%%%.%%%%%.%%.%%%%%.%%%%.% 21 | %o..%%....... .......%%..o% 22 | %%%.%%.%%.%%%%%%%%.%%.%%.%%% 23 | %%%.%%.%%.%%%%%%%%.%%.%%.%%% 24 | %......%%....%%....%%......% 25 | %.%%%%%%%%%%.%%.%%%%%%%%%%.% 26 | %.............P............% 27 | %%%%%%%%%%%%%%%%%%%%%%%%%%%% 28 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/layouts/smallClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%% 2 | %......%G G%......% 3 | %.%%...%% %%...%%.% 4 | %.%o.%........%.o%.% 5 | %.%%.%.%%%%%%.%.%%.% 6 | %........P.........% 7 | %%%%%%%%%%%%%%%%%%%% 8 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/layouts/smallGrid.lay: -------------------------------------------------------------------------------- 1 | %%%%%%% 2 | % P % 3 | % %%% % 4 | % %. % 5 | % %%% % 6 | %. G % 7 | %%%%%%% 8 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/layouts/testClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%% 2 | % . % 3 | %.G.% 4 | % . % 5 | %. .% 6 | % % 7 | % .% 8 | % % 9 | %P .% 10 | %%%%% 11 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/layouts/trappedClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%% 2 | % P G% 3 | %G%%%%%% 4 | %.... % 5 | %%%%%%%% 6 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/layouts/trickyClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%% 2 | %o...%........%...o% 3 | %.%%.%.%%..%%.%.%%.% 4 | %.%.....%..%.....%.% 5 | %.%.%%.%% %%.%%.%.% 6 | %...... 
GGGG%.%....% 7 | %.%....%%%%%%.%..%.% 8 | %.%....% oo%.%..%.% 9 | %.%....% %%%%.%..%.% 10 | %.%...........%..%.% 11 | %.%%.%.%%%%%%.%.%%.% 12 | %o...%...P....%...o% 13 | %%%%%%%%%%%%%%%%%%%% 14 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/learningAgents.py: -------------------------------------------------------------------------------- 1 | # learningAgents.py 2 | # ----------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | from game import Directions, Agent, Actions 10 | 11 | import random,util,time 12 | 13 | class ValueEstimationAgent(Agent): 14 | """ 15 | Abstract agent which assigns values to (state,action) 16 | Q-Values for an environment. As well as a value to a 17 | state and a policy given respectively by, 18 | 19 | V(s) = max_{a in actions} Q(s,a) 20 | policy(s) = arg_max_{a in actions} Q(s,a) 21 | 22 | Both ValueIterationAgent and QLearningAgent inherit 23 | from this agent. While a ValueIterationAgent has 24 | a model of the environment via a MarkovDecisionProcess 25 | (see mdp.py) that is used to estimate Q-Values before 26 | ever actually acting, the QLearningAgent estimates 27 | Q-Values while acting in the environment. 28 | """ 29 | 30 | def __init__(self, alpha=1.0, epsilon=0.05, gamma=0.8, numTraining = 10): 31 | """ 32 | Sets options, which can be passed in via the Pacman command line using -a alpha=0.5,... 33 | alpha - learning rate 34 | epsilon - exploration rate 35 | gamma - discount factor 36 | numTraining - number of training episodes, i.e. no learning after these many episodes 37 | """ 38 | self.alpha = float(alpha) 39 | self.epsilon = float(epsilon) 40 | self.discount = float(gamma) 41 | self.numTraining = int(numTraining) 42 | 43 | #################################### 44 | # Override These Functions # 45 | #################################### 46 | def getQValue(self, state, action): 47 | """ 48 | Should return Q(state,action) 49 | """ 50 | util.raiseNotDefined() 51 | 52 | def getValue(self, state): 53 | """ 54 | What is the value of this state under the best action? 55 | Concretely, this is given by 56 | 57 | V(s) = max_{a in actions} Q(s,a) 58 | """ 59 | util.raiseNotDefined() 60 | 61 | def getPolicy(self, state): 62 | """ 63 | What is the best action to take in the state. Note that because 64 | we might want to explore, this might not coincide with getAction 65 | Concretely, this is given by 66 | 67 | policy(s) = arg_max_{a in actions} Q(s,a) 68 | 69 | If many actions achieve the maximal Q-value, 70 | it doesn't matter which is selected. 71 | """ 72 | util.raiseNotDefined() 73 | 74 | def getAction(self, state): 75 | """ 76 | state: can call state.getLegalActions() 77 | Choose an action and return it. 
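        A typical concrete override (a sketch, mirroring the Q-learning agents in
        this project) explores with some probability and is greedy otherwise:
            if util.flipCoin(self.epsilon):
                return random.choice(state.getLegalActions())
            return self.getPolicy(state)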
78 | """ 79 | util.raiseNotDefined() 80 | 81 | class ReinforcementAgent(ValueEstimationAgent): 82 | """ 83 | Abstract Reinforcemnt Agent: A ValueEstimationAgent 84 | which estimates Q-Values (as well as policies) from experience 85 | rather than a model 86 | 87 | What you need to know: 88 | - The environment will call 89 | observeTransition(state,action,nextState,deltaReward), 90 | which will call update(state, action, nextState, deltaReward) 91 | which you should override. 92 | - Use self.getLegalActions(state) to know which actions 93 | are available in a state 94 | """ 95 | #################################### 96 | # Override These Functions # 97 | #################################### 98 | 99 | def update(self, state, action, nextState, reward): 100 | """ 101 | This class will call this function, which you write, after 102 | observing a transition and reward 103 | """ 104 | util.raiseNotDefined() 105 | 106 | #################################### 107 | # Read These Functions # 108 | #################################### 109 | 110 | def getLegalActions(self,state): 111 | """ 112 | Get the actions available for a given 113 | state. This is what you should use to 114 | obtain legal actions for a state 115 | """ 116 | return self.actionFn(state) 117 | 118 | def observeTransition(self, state,action,nextState,deltaReward): 119 | """ 120 | Called by environment to inform agent that a transition has 121 | been observed. This will result in a call to self.update 122 | on the same arguments 123 | 124 | NOTE: Do *not* override or call this function 125 | """ 126 | self.episodeRewards += deltaReward 127 | self.update(state,action,nextState,deltaReward) 128 | 129 | def startEpisode(self): 130 | """ 131 | Called by environment when new episode is starting 132 | """ 133 | self.lastState = None 134 | self.lastAction = None 135 | self.episodeRewards = 0.0 136 | 137 | def stopEpisode(self): 138 | """ 139 | Called by environment when episode is done 140 | """ 141 | if self.episodesSoFar < self.numTraining: 142 | self.accumTrainRewards += self.episodeRewards 143 | else: 144 | self.accumTestRewards += self.episodeRewards 145 | self.episodesSoFar += 1 146 | if self.episodesSoFar >= self.numTraining: 147 | # Take off the training wheels 148 | self.epsilon = 0.0 # no exploration 149 | self.alpha = 0.0 # no learning 150 | 151 | def isInTraining(self): 152 | return self.episodesSoFar < self.numTraining 153 | 154 | def isInTesting(self): 155 | return not self.isInTraining() 156 | 157 | def __init__(self, actionFn = None, numTraining=100, epsilon=0.5, alpha=0.5, gamma=1): 158 | """ 159 | actionFn: Function which takes a state and returns the list of legal actions 160 | 161 | alpha - learning rate 162 | epsilon - exploration rate 163 | gamma - discount factor 164 | numTraining - number of training episodes, i.e. 
no learning after these many episodes 165 | """ 166 | if actionFn == None: 167 | actionFn = lambda state: state.getLegalActions() 168 | self.actionFn = actionFn 169 | self.episodesSoFar = 0 170 | self.accumTrainRewards = 0.0 171 | self.accumTestRewards = 0.0 172 | self.numTraining = int(numTraining) 173 | self.epsilon = float(epsilon) 174 | self.alpha = float(alpha) 175 | self.discount = float(gamma) 176 | 177 | ################################ 178 | # Controls needed for Crawler # 179 | ################################ 180 | def setEpsilon(self, epsilon): 181 | self.epsilon = epsilon 182 | 183 | def setLearningRate(self, alpha): 184 | self.alpha = alpha 185 | 186 | def setDiscount(self, discount): 187 | self.discount = discount 188 | 189 | def doAction(self,state,action): 190 | """ 191 | Called by inherited class when 192 | an action is taken in a state 193 | """ 194 | self.lastState = state 195 | self.lastAction = action 196 | 197 | ################### 198 | # Pacman Specific # 199 | ################### 200 | def observationFunction(self, state): 201 | """ 202 | This is where we ended up after our last action. 203 | The simulation should somehow ensure this is called 204 | """ 205 | if not self.lastState is None: 206 | reward = state.getScore() - self.lastState.getScore() 207 | self.observeTransition(self.lastState, self.lastAction, state, reward) 208 | return state 209 | 210 | def registerInitialState(self, state): 211 | self.startEpisode() 212 | if self.episodesSoFar == 0: 213 | print 'Beginning %d episodes of Training' % (self.numTraining) 214 | 215 | def final(self, state): 216 | """ 217 | Called by Pacman game at the terminal state 218 | """ 219 | deltaReward = state.getScore() - self.lastState.getScore() 220 | self.observeTransition(self.lastState, self.lastAction, state, deltaReward) 221 | self.stopEpisode() 222 | 223 | # Make sure we have this var 224 | if not 'episodeStartTime' in self.__dict__: 225 | self.episodeStartTime = time.time() 226 | if not 'lastWindowAccumRewards' in self.__dict__: 227 | self.lastWindowAccumRewards = 0.0 228 | self.lastWindowAccumRewards += state.getScore() 229 | 230 | NUM_EPS_UPDATE = 100 231 | if self.episodesSoFar % NUM_EPS_UPDATE == 0: 232 | print 'Reinforcement Learning Status:' 233 | windowAvg = self.lastWindowAccumRewards / float(NUM_EPS_UPDATE) 234 | if self.episodesSoFar <= self.numTraining: 235 | trainAvg = self.accumTrainRewards / float(self.episodesSoFar) 236 | print '\tCompleted %d out of %d training episodes' % ( 237 | self.episodesSoFar,self.numTraining) 238 | print '\tAverage Rewards over all training: %.2f' % ( 239 | trainAvg) 240 | else: 241 | testAvg = float(self.accumTestRewards) / (self.episodesSoFar - self.numTraining) 242 | print '\tCompleted %d test episodes' % (self.episodesSoFar - self.numTraining) 243 | print '\tAverage Rewards over testing: %.2f' % testAvg 244 | print '\tAverage Rewards for last %d episodes: %.2f' % ( 245 | NUM_EPS_UPDATE,windowAvg) 246 | print '\tEpisode took %.2f seconds' % (time.time() - self.episodeStartTime) 247 | self.lastWindowAccumRewards = 0.0 248 | self.episodeStartTime = time.time() 249 | 250 | if self.episodesSoFar == self.numTraining: 251 | msg = 'Training Done (turning off epsilon and alpha)' 252 | print '%s\n%s' % (msg,'-' * len(msg)) 253 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/mdp.py: -------------------------------------------------------------------------------- 1 | # mdp.py 2 | # ------ 3 | # Licensing Information: 
Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | import random 10 | 11 | class MarkovDecisionProcess: 12 | 13 | def getStates(self): 14 | """ 15 | Return a list of all states in the MDP. 16 | Not generally possible for large MDPs. 17 | """ 18 | abstract 19 | 20 | def getStartState(self): 21 | """ 22 | Return the start state of the MDP. 23 | """ 24 | abstract 25 | 26 | def getPossibleActions(self, state): 27 | """ 28 | Return list of possible actions from 'state'. 29 | """ 30 | abstract 31 | 32 | def getTransitionStatesAndProbs(self, state, action): 33 | """ 34 | Returns list of (nextState, prob) pairs 35 | representing the states reachable 36 | from 'state' by taking 'action' along 37 | with their transition probabilities. 38 | 39 | Note that in Q-Learning and reinforcment 40 | learning in general, we do not know these 41 | probabilities nor do we directly model them. 42 | """ 43 | abstract 44 | 45 | def getReward(self, state, action, nextState): 46 | """ 47 | Get the reward for the state, action, nextState transition. 48 | 49 | Not available in reinforcement learning. 50 | """ 51 | abstract 52 | 53 | def isTerminal(self, state): 54 | """ 55 | Returns true if the current state is a terminal state. By convention, 56 | a terminal state has zero future rewards. Sometimes the terminal state(s) 57 | may have no possible actions. It is also common to think of the terminal 58 | state as having a self-loop action 'pass' with zero reward; the formulations 59 | are equivalent. 60 | """ 61 | abstract 62 | 63 | 64 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/pacmanAgents.py: -------------------------------------------------------------------------------- 1 | # pacmanAgents.py 2 | # --------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 
7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | from pacman import Directions 10 | from game import Agent 11 | import random 12 | import game 13 | import util 14 | 15 | class LeftTurnAgent(game.Agent): 16 | "An agent that turns left at every opportunity" 17 | 18 | def getAction(self, state): 19 | legal = state.getLegalPacmanActions() 20 | current = state.getPacmanState().configuration.direction 21 | if current == Directions.STOP: current = Directions.NORTH 22 | left = Directions.LEFT[current] 23 | if left in legal: return left 24 | if current in legal: return current 25 | if Directions.RIGHT[current] in legal: return Directions.RIGHT[current] 26 | if Directions.LEFT[left] in legal: return Directions.LEFT[left] 27 | return Directions.STOP 28 | 29 | class GreedyAgent(Agent): 30 | def __init__(self, evalFn="scoreEvaluation"): 31 | self.evaluationFunction = util.lookup(evalFn, globals()) 32 | assert self.evaluationFunction != None 33 | 34 | def getAction(self, state): 35 | # Generate candidate actions 36 | legal = state.getLegalPacmanActions() 37 | if Directions.STOP in legal: legal.remove(Directions.STOP) 38 | 39 | successors = [(state.generateSuccessor(0, action), action) for action in legal] 40 | scored = [(self.evaluationFunction(state), action) for state, action in successors] 41 | bestScore = max(scored)[0] 42 | bestActions = [pair[1] for pair in scored if pair[0] == bestScore] 43 | return random.choice(bestActions) 44 | 45 | def scoreEvaluation(state): 46 | return state.getScore() -------------------------------------------------------------------------------- /week3_model_free/seminar_main/qlearningAgents.py: -------------------------------------------------------------------------------- 1 | # qlearningAgents.py 2 | # ------------------ 3 | ## based on http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 4 | 5 | from game import * 6 | from learningAgents import ReinforcementAgent 7 | from featureExtractors import * 8 | 9 | import random,util,math 10 | from collections import defaultdict 11 | 12 | class QLearningAgent(ReinforcementAgent): 13 | """ 14 | Q-Learning Agent 15 | 16 | Instance variables you have access to 17 | - self.epsilon (exploration prob) 18 | - self.alpha (learning rate) 19 | - self.discount (discount rate aka gamma) 20 | 21 | Functions you should use 22 | - self.getLegalActions(state) 23 | which returns legal actions for a state 24 | - self.getQValue(state,action) 25 | which returns Q(state,action) 26 | - self.setQValue(state,action,value) 27 | which sets Q(state,action) := value 28 | 29 | !!!Important!!! 30 | NOTE: please avoid using self._qValues directly to make code cleaner 31 | """ 32 | def __init__(self, **args): 33 | "We initialize agent and Q-values here." 34 | ReinforcementAgent.__init__(self, **args) 35 | self._qValues = defaultdict(lambda:defaultdict(lambda:0)) 36 | 37 | 38 | def getQValue(self, state, action): 39 | """ 40 | Returns Q(state,action) 41 | """ 42 | return self._qValues[state][action] 43 | 44 | def setQValue(self,state,action,value): 45 | """ 46 | Sets the Qvalue for [state,action] to the given value 47 | """ 48 | self._qValues[state][action] = value 49 | 50 | #---------------------#start of your code#---------------------# 51 | 52 | def getValue(self, state): 53 | """ 54 | Returns max_action Q(state,action) 55 | where the max is over legal actions. 
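        A minimal sketch of the blank below (after the empty-action check):
            return max(self.getQValue(state, a) for a in possibleActions)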
56 | """ 57 | 58 | possibleActions = self.getLegalActions(state) 59 | #If there are no legal actions, return 0.0 60 | if len(possibleActions) == 0: 61 | return 0.0 62 | 63 | "*** YOUR CODE HERE ***" 64 | raise NotImplementedError 65 | 66 | return 0. 67 | 68 | def getPolicy(self, state): 69 | """ 70 | Compute the best action to take in a state. 71 | 72 | """ 73 | possibleActions = self.getLegalActions(state) 74 | 75 | #If there are no legal actions, return None 76 | if len(possibleActions) == 0: 77 | return None 78 | 79 | best_action = None 80 | 81 | "*** YOUR CODE HERE ***" 82 | raise NotImplementedError 83 | 84 | return best_action 85 | 86 | def getAction(self, state): 87 | """ 88 | Compute the action to take in the current state, including exploration. 89 | 90 | With probability self.epsilon, we should take a random action. 91 | otherwise - the best policy action (self.getPolicy). 92 | 93 | HINT: You might want to use util.flipCoin(prob) 94 | HINT: To pick randomly from a list, use random.choice(list) 95 | 96 | """ 97 | 98 | # Pick Action 99 | possibleActions = self.getLegalActions(state) 100 | action = None 101 | 102 | #If there are no legal actions, return None 103 | if len(possibleActions) == 0: 104 | return None 105 | 106 | #agent parameters: 107 | epsilon = self.epsilon 108 | 109 | "*** YOUR CODE HERE ***" 110 | raise NotImplementedError 111 | 112 | return action 113 | 114 | def update(self, state, action, nextState, reward): 115 | """ 116 | You should do your Q-Value update here 117 | 118 | NOTE: You should never call this function, 119 | it will be called on your behalf 120 | 121 | 122 | """ 123 | #agent parameters 124 | gamma = self.discount 125 | learning_rate = self.alpha 126 | 127 | "*** YOUR CODE HERE ***" 128 | raise NotImplementedError 129 | 130 | reference_qvalue = PleaseImplementMe 131 | updated_qvalue = PleaseImplementMe 132 | 133 | self.setQValue(PleaseImplementMe,PleaseImplementMe,updated_qvalue) 134 | 135 | 136 | #---------------------#end of your code#---------------------# 137 | 138 | 139 | 140 | class PacmanQAgent(QLearningAgent): 141 | "Exactly the same as QLearningAgent, but with different default parameters" 142 | 143 | def __init__(self, epsilon=0.05,gamma=0.8,alpha=0.2, numTraining=0, **args): 144 | """ 145 | These default parameters can be changed from the pacman.py command line. 146 | For example, to change the exploration rate, try: 147 | python pacman.py -p PacmanQLearningAgent -a epsilon=0.1 148 | 149 | alpha - learning rate 150 | epsilon - exploration rate 151 | gamma - discount factor 152 | numTraining - number of training episodes, i.e. no learning after these many episodes 153 | """ 154 | args['epsilon'] = epsilon 155 | args['gamma'] = gamma 156 | args['alpha'] = alpha 157 | args['numTraining'] = numTraining 158 | self.index = 0 # This is always Pacman 159 | QLearningAgent.__init__(self, **args) 160 | 161 | def getAction(self, state): 162 | """ 163 | Simply calls the getAction method of QLearningAgent and then 164 | informs parent of action for Pacman. Do not change or remove this 165 | method. 
166 | """ 167 | action = QLearningAgent.getAction(self,state) 168 | self.doAction(state,action) 169 | return action 170 | 171 | 172 | 173 | class ApproximateQAgent(PacmanQAgent): 174 | pass 175 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/run_crawler.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python crawler.py 3 | 4 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/run_grid.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python gridworld.py -a q -k 100 -n 0 -g BookGrid -e 0.5 3 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/run_pacman.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python pacman.py -p PacmanQAgent -x 1000 -n 1010 -l smallGrid 3 | 4 | # -x is the amount of training epochs, -n is the total amount of epochs. 5 | # hence, if you want to spend 1337 epochs training and then play 42 for evalution, you will need 6 | # python pacman.py -p PacmanQAgent -x 1337 -n 1379 -l smallGrid 7 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/textDisplay.py: -------------------------------------------------------------------------------- 1 | # textDisplay.py 2 | # -------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 
7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | import pacman, time 10 | 11 | DRAW_EVERY = 1 12 | SLEEP_TIME = 0 # This can be overwritten by __init__ 13 | DISPLAY_MOVES = False 14 | QUIET = False # Supresses output 15 | 16 | class NullGraphics: 17 | def initialize(self, state, isBlue = False): 18 | pass 19 | 20 | def update(self, state): 21 | pass 22 | 23 | def pause(self): 24 | time.sleep(SLEEP_TIME) 25 | 26 | def draw(self, state): 27 | print state 28 | 29 | def finish(self): 30 | pass 31 | 32 | class PacmanGraphics: 33 | def __init__(self, speed=None): 34 | if speed != None: 35 | global SLEEP_TIME 36 | SLEEP_TIME = speed 37 | 38 | def initialize(self, state, isBlue = False): 39 | self.draw(state) 40 | self.pause() 41 | self.turn = 0 42 | self.agentCounter = 0 43 | 44 | def update(self, state): 45 | numAgents = len(state.agentStates) 46 | self.agentCounter = (self.agentCounter + 1) % numAgents 47 | if self.agentCounter == 0: 48 | self.turn += 1 49 | if DISPLAY_MOVES: 50 | ghosts = [pacman.nearestPoint(state.getGhostPosition(i)) for i in range(1, numAgents)] 51 | print "%4d) P: %-8s" % (self.turn, str(pacman.nearestPoint(state.getPacmanPosition()))),'| Score: %-5d' % state.score,'| Ghosts:', ghosts 52 | if self.turn % DRAW_EVERY == 0: 53 | self.draw(state) 54 | self.pause() 55 | if state._win or state._lose: 56 | self.draw(state) 57 | 58 | def pause(self): 59 | time.sleep(SLEEP_TIME) 60 | 61 | def draw(self, state): 62 | print state 63 | 64 | def finish(self): 65 | pass 66 | -------------------------------------------------------------------------------- /week4_[recap]_deep_learning/README.md: -------------------------------------------------------------------------------- 1 | __Note:__ This week's materials cover the basics of neural nets and deep learning and teach you how to use auto-diff frameworks. If you're already fluent in tensorflow OR pytorch OR theano - feel free to skip this week entirely.. 2 | 3 | ## Materials 4 | * [__Lecture slides__](https://yadi.sk/i/yAO2AJ3M3EKP8g) 5 | 6 | - __In russian:__ 7 | * Basic lecture on deep learning - [video](https://yadi.sk/i/yyHZub6R3Ej5dV) 8 | * Deep learning frameworks - [video](https://yadi.sk/i/hDIkaR4H3EtnXM) 9 | * [Pytorch tutorial](https://yadi.sk/i/O3mQ76u43So3h9) __recommended__ 10 | * [Tensorflow tutorial](https://www.youtube.com/watch?v=FQ660T4uu7k) (english only for now. 
Links are welcome) 11 | * [Theano tutorial](https://yadi.sk/i/54STsEBVpubkn) 12 | 13 | - __In english:__ 14 | * Intro to neural nets and backprop (english) - [video](https://www.youtube.com/watch?v=uXt8qF2Zzfo) 15 | * Intro to convnets - [video](https://www.youtube.com/watch?v=FmpDIaiMIeA) 16 | * Deep learning frameworks - [video](https://www.youtube.com/watch?v=Vf_-OkqbwPo) 17 | * [Tensorflow tutorial](https://www.youtube.com/watch?v=FQ660T4uu7k) 18 | * [Theano tutorial](https://www.youtube.com/watch?v=OU8I1oJ9HhI) 19 | * [Pytorch tutorial](https://www.youtube.com/watch?v=VMcRWYEKmhw) 20 | 21 | ## Bonus materials 22 | * Karpathy's course on deep learning (english) - http://cs231n.github.io/ 23 | * A neat little play-ground where you can train small NNs and see what they actually learn - [playground](http://playground.tensorflow.org/) 24 | * Nuts and Bolts of deep learning by Andrew Ng (english) - [video](https://www.youtube.com/watch?v=F1ka6a13S9I) 25 | * Deep learning philosophy: [our humble take](https://www.youtube.com/watch?v=9qyE1Ev1Xdw) (english) 26 | * Deep learning demystified - [video](https://www.youtube.com/watch?v=Q9Z20HCPnww) 27 | * Karpathy's lecture on deep learning for computer vision - https://www.youtube.com/watch?v=u6aEYuemt0M 28 | * Our humble DL course: [HSE'fall17](https://github.com/yandexdataschool/HSE_deeplearning), [Skoltech/YSDA'spring16](https://github.com/ddtm/dl-course/) courses on deep learning (english). 29 | * Srsly, just google `"deep learning %s"%s for s in what_you_want_to_know`. 30 | 31 | 32 | ### Practice 33 | From now on, we'll have two tracks: theano and tensorflow. We'll also add pytorch seminars as soon as they're ready. 34 | 35 | Please pick seminar_theano.ipynb, seminar_tensorflow.ipynb or seminar_pytorch.ipynb. 36 | 37 | __Note:__ in this and all following weeks you're only required to get through practice in _one_ of the frameworks. Looking into other alternatives is great for self-education but never mandatory. 38 | 39 | #### What to choose? 40 | * The simplest choice is PyTorch: it's basically ye olde numpy with automatic gradients and a lot of pre-implemented DL stuff... except all the functions have different names. 41 | * If you want to be familiar with production-related stuff from day 1, choose TensorFlow. It's much more convenient to deploy (to non-python or to mobiles). The catch is that all those conveniences become inconveniences once you want to write something simple in jupyter. 42 | * Theano works like tensorflow but it offers a numpy-compatible interface and comes with built-in graph optimization. The payoff is that theano is not as popular as the first two. It is also not meant as a producton framework so deploying to mobiles may be a problem. 43 | 44 | * It's not like choosing house at Hogwarts, you'll be able to switch between frameworks easily once you master the underlying principles. 
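To make the "numpy with automatic gradients" point concrete, here is a tiny sketch (it assumes `numpy` and `torch` are installed and is not tied to any of the seminar notebooks):

```python
import numpy as np
import torch

# numpy: gradients are derived by hand
x_np = np.array([1.0, 2.0, 3.0])
loss_np = (x_np ** 2).sum()
grad_np = 2 * x_np                 # d(sum x^2)/dx, computed manually

# pytorch: same math, autograd fills in the gradient
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
loss = (x ** 2).sum()
loss.backward()                    # populates x.grad with d(loss)/dx
print(grad_np)                     # [2. 4. 6.]
print(x.grad.numpy())              # [2. 4. 6.]
```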
45 | 46 | -------------------------------------------------------------------------------- /week4_[recap]_deep_learning/fix_my_nn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from lasagne.layers import *\n", 12 | "from lasagne.nonlinearities import *\n", 13 | "from lasagne import init" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": { 20 | "collapsed": false 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "nn = InputLayer([None,3,100,100])\n", 25 | "\n", 26 | "nn = Conv2DLayer(nn,num_filters=512, filter_size=(3,3),\n", 27 | " W = init.Constant(0))\n", 28 | "\n", 29 | "nn = Conv2DLayer(nn,num_filters=128,filter_size=(3,3),\n", 30 | " W = init.Constant(0))\n", 31 | "\n", 32 | "nn = Conv2DLayer(nn,num_filters=32,filter_size=(3,3),\n", 33 | " W = init.Constant(0))\n", 34 | "\n", 35 | "nn = Pool2DLayer(nn,pool_size=(6,6),mode='max')\n", 36 | "\n", 37 | "nn = Conv2DLayer(nn,num_filters=8,filter_size=(10,10),\n", 38 | " W = init.Normal(std=0.01))\n", 39 | "\n", 40 | "nn = Conv2DLayer(nn,num_filters=8,filter_size=(10,10),\n", 41 | " W = init.Normal(std=0.01))\n", 42 | "\n", 43 | "nn = Pool2DLayer(nn,pool_size=(3,3),mode='max')\n", 44 | "\n", 45 | "nn = DenseLayer(nn,512,nonlinearity=softmax)\n", 46 | "\n", 47 | "nn = DropoutLayer(nn,p=0.5)\n", 48 | "\n", 49 | "nn = DenseLayer(nn,512,nonlinearity=softmax)\n", 50 | "\n", 51 | "nn = DenseLayer(nn,10,nonlinearity=sigmoid)\n", 52 | "\n", 53 | "nn = DropoutLayer(nn,p=0.5)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "```\n", 61 | "\n", 62 | "```\n", 63 | "\n", 64 | "```\n", 65 | "\n", 66 | "```\n", 67 | "\n", 68 | "```\n", 69 | "\n", 70 | "```\n", 71 | "\n", 72 | "```\n", 73 | "\n", 74 | "```\n", 75 | "\n", 76 | "```\n", 77 | "\n", 78 | "```\n", 79 | "\n", 80 | "```\n", 81 | "\n", 82 | "```\n", 83 | "\n", 84 | "```\n", 85 | "\n", 86 | "```\n", 87 | "\n", 88 | "```\n", 89 | "\n", 90 | "```\n", 91 | "\n", 92 | "```\n", 93 | "\n", 94 | "```\n", 95 | "\n", 96 | "```\n", 97 | "\n", 98 | "```\n", 99 | "\n", 100 | "```\n", 101 | "\n", 102 | "```\n", 103 | "\n", 104 | "```\n", 105 | "\n", 106 | "```\n", 107 | "\n", 108 | "```\n", 109 | "\n", 110 | "```\n", 111 | "\n", 112 | "```\n", 113 | "\n", 114 | "```\n", 115 | "\n", 116 | "```\n", 117 | "\n", 118 | "```\n", 119 | "\n", 120 | "\n", 121 | "# Book of grudges\n", 122 | "* zero init for weights will cause symmetry effect\n", 123 | "* Too many filters for first 3x3 convolution - will lead to enormous matrix while there's just not enough relevant combinations of 3x3 images (overkill).\n", 124 | "* Usually the further you go, the more filters you need.\n", 125 | "* large filters (10x10 is generally a bad pactice, and you definitely need more than 10 of them\n", 126 | "* the second of 10x10 convolution gets 8x6x6 image as input, so it's technically unable to perform such convolution.\n", 127 | "* Softmax nonlinearity effectively makes only 1 or a few neurons from the entire layer to \"fire\", rendering 512-neuron layer almost useless. Softmax at the output layer is okay though\n", 128 | "* Dropout after probability prediciton is just lame. A few random classes get probability of 0, so your probabilities no longer sum to 1 and crossentropy goes -inf." 
129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "collapsed": true 136 | }, 137 | "outputs": [], 138 | "source": [] 139 | } 140 | ], 141 | "metadata": { 142 | "kernelspec": { 143 | "display_name": "Python [Root]", 144 | "language": "python", 145 | "name": "Python [Root]" 146 | }, 147 | "language_info": { 148 | "codemirror_mode": { 149 | "name": "ipython", 150 | "version": 2 151 | }, 152 | "file_extension": ".py", 153 | "mimetype": "text/x-python", 154 | "name": "python", 155 | "nbconvert_exporter": "python", 156 | "pygments_lexer": "ipython2", 157 | "version": "2.7.12" 158 | } 159 | }, 160 | "nbformat": 4, 161 | "nbformat_minor": 0 162 | } 163 | -------------------------------------------------------------------------------- /week4_[recap]_deep_learning/mnist.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import time 4 | 5 | import numpy as np 6 | 7 | __doc__="""taken from https://github.com/Lasagne/Lasagne/blob/master/examples/mnist.py""" 8 | 9 | def load_dataset(): 10 | # We first define a download function, supporting both Python 2 and 3. 11 | if sys.version_info[0] == 2: 12 | from urllib import urlretrieve 13 | else: 14 | from urllib.request import urlretrieve 15 | 16 | def download(filename, source='http://yann.lecun.com/exdb/mnist/'): 17 | print("Downloading %s" % filename) 18 | urlretrieve(source + filename, filename) 19 | 20 | # We then define functions for loading MNIST images and labels. 21 | # For convenience, they also download the requested files if needed. 22 | import gzip 23 | 24 | def load_mnist_images(filename): 25 | if not os.path.exists(filename): 26 | download(filename) 27 | # Read the inputs in Yann LeCun's binary format. 28 | with gzip.open(filename, 'rb') as f: 29 | data = np.frombuffer(f.read(), np.uint8, offset=16) 30 | # The inputs are vectors now, we reshape them to monochrome 2D images, 31 | # following the shape convention: (examples, channels, rows, columns) 32 | data = data.reshape(-1, 1, 28, 28) 33 | # The inputs come as bytes, we convert them to float32 in range [0,1]. 34 | # (Actually to range [0, 255/256], for compatibility to the version 35 | # provided at http://deeplearning.net/data/mnist/mnist.pkl.gz.) 36 | return data / np.float32(256) 37 | 38 | def load_mnist_labels(filename): 39 | if not os.path.exists(filename): 40 | download(filename) 41 | # Read the labels in Yann LeCun's binary format. 42 | with gzip.open(filename, 'rb') as f: 43 | data = np.frombuffer(f.read(), np.uint8, offset=8) 44 | # The labels are vectors of integers now, that's exactly what we want. 45 | return data 46 | 47 | # We can now download and read the training and test set images and labels. 48 | X_train = load_mnist_images('train-images-idx3-ubyte.gz') 49 | y_train = load_mnist_labels('train-labels-idx1-ubyte.gz') 50 | X_test = load_mnist_images('t10k-images-idx3-ubyte.gz') 51 | y_test = load_mnist_labels('t10k-labels-idx1-ubyte.gz') 52 | 53 | # We reserve the last 10000 training examples for validation. 54 | X_train, X_val = X_train[:-10000], X_train[-10000:] 55 | y_train, y_val = y_train[:-10000], y_train[-10000:] 56 | 57 | # We just return all the arrays in order, as expected in main(). 58 | # (It doesn't matter how we do this as long as we can read them again.) 
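    # Returned arrays: X_* are float32 of shape (n_examples, 1, 28, 28) with values in [0, 1),
    # y_* are integer class labels 0..9 of shape (n_examples,);
    # the split is 50000 train / 10000 validation / 10000 test examples.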
59 | return X_train, y_train, X_val, y_val, X_test, y_test 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /week4_[recap]_deep_learning/notmnist.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from scipy.misc import imread,imresize 4 | from sklearn.model_selection import train_test_split 5 | from glob import glob 6 | 7 | def load_notmnist(path='./notMNIST_small',letters='ABCDEFGHIJ', 8 | img_shape=(28,28),test_size=0.25,one_hot=False): 9 | 10 | # download data if it's missing. If you have any problems, go to the urls and load it manually. 11 | if not os.path.exists(path): 12 | print("Downloading data...") 13 | assert os.system('curl http://yaroslavvb.com/upload/notMNIST/notMNIST_small.tar.gz > notMNIST_small.tar.gz') == 0 14 | print("Extracting ...") 15 | assert os.system('tar -zxvf notMNIST_small.tar.gz > untar_notmnist.log') == 0 16 | 17 | data,labels = [],[] 18 | print("Parsing...") 19 | for img_path in glob(os.path.join(path,'*/*')): 20 | class_i = img_path.split(os.sep)[-2] 21 | if class_i not in letters: continue 22 | try: 23 | data.append(imresize(imread(img_path), img_shape)) 24 | labels.append(class_i,) 25 | except: 26 | print("found broken img: %s [it's ok if <10 images are broken]" % img_path) 27 | 28 | data = np.stack(data)[:,None].astype('float32') 29 | data = (data - np.mean(data)) / np.std(data) 30 | 31 | #convert classes to ints 32 | letter_to_i = {l:i for i,l in enumerate(letters)} 33 | labels = np.array(list(map(letter_to_i.get, labels))) 34 | 35 | if one_hot: 36 | labels = (np.arange(np.max(labels) + 1)[None,:] == labels[:, None]).astype('float32') 37 | 38 | #split into train/test 39 | X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=test_size, random_state=42) 40 | 41 | print("Done") 42 | return X_train, y_train, X_test, y_test 43 | 44 | -------------------------------------------------------------------------------- /week4_approx_rl/README.md: -------------------------------------------------------------------------------- 1 | ## Materials 2 | * [__lecture slides I__](https://yadi.sk/i/kGPiXpse3NR3n8), [__slides II__](https://yadi.sk/i/H07O_XEh3NR3oV) 3 | * Our [lecture](https://yadi.sk/i/AHDU2p_j3FT3nr), [second lecture](https://yadi.sk/i/yBO0q4mI3GAxYd), [seminar](https://yadi.sk/i/EeUeheri3FT3ra) (russian) 4 | 5 | 6 | * David Silver lecture - [video](https://www.youtube.com/watch?v=UoPei5o4fps) 7 | * More practical and less theoretical lecture from MIT 6.S191 - [video](https://www.youtube.com/watch?v=xWe58WGWmlk) 8 | * Understanding approximate q-learning - [url](https://danieltakeshi.github.io/2016/10/31/going-deeper-into-reinforcement-learning-understanding-q-learning-and-linear-function-approximation/) 9 | * Karpathy's post on approximate RL - [url](http://karpathy.github.io/2016/05/31/rl/) 10 | 11 | ## More materials 12 | * __[recommended]__ How to _actually_ do deep reinforcement learning by J. Schulman - [pdf](http://rll.berkeley.edu/deeprlcourse/docs/nuts-and-bolts.pdf) 13 | * __[recommended]__ An overview of deep reinforcement learning - [arxiv](https://arxiv.org/pdf/1701.07274v1.pdf) 14 | * DQN and modiffications - lecture by J. 
Schulman - [video](https://www.youtube.com/watch?v=h1-pj4Y9-kM) 15 | * * interactive demos in your browser: [demo1](http://cs.stanford.edu/people/karpathy/convnetjs/demo/rldemo.html)(karpathy), [demo2](http://janhuenermann.com/projects/learning-to-drive)(Hünermann) 16 | * Reinforcement learning architectures list - [repo](https://github.com/5vision/deep-reinforcement-learning-networks) 17 | * Article on dueling DQN - [arxiv](https://arxiv.org/pdf/1511.06581.pdf) 18 | * Article on double DQN - [arxiv](https://arxiv.org/abs/1509.06461) 19 | * Article on prioritized experience replay - [arxiv](https://arxiv.org/abs/1511.05952) 20 | * Article on bootstrap DQN - [pdf](https://papers.nips.cc/paper/6501-deep-exploration-via-bootstrapped-dqn.pdf), [summary](http://pemami4911.github.io/paper-summaries/2016/08/16/Deep-exploration.html) 21 | * Article on asynchronuous methods in deep RL - [arxiv](https://arxiv.org/abs/1602.01783) 22 | * Successor representations for reinforcement learning - [article](https://arxiv.org/abs/1606.02396), [video](https://www.youtube.com/watch?v=kNqXCn7K-BM&feature=youtu.be) 23 | * Video on asynchronuous methods (Mnih) - [video](https://www.youtube.com/watch?v=9sx1_u2qVhQ) 24 | 25 | ## DQN tutorials 26 | * [in pytorch] A great series starting from simple DQN to all the cool new stuff - [url](https://github.com/higgsfield/RL-Adventure) 27 | * A guide to deep RL from ~scratch (nervana blog) - [url](https://www.nervanasys.com/demystifying-deep-reinforcement-learning/) 28 | * Building deep q-network from ~scratch (blog) - [url](https://jaromiru.com/2016/09/27/lets-make-a-dqn-theory/) 29 | * Another guide guide to DQN from ~scratch (blog) - [url](https://rubenfiszel.github.io/posts/rl4j/2016-08-24-Reinforcement-Learning-and-DQN.html) 30 | 31 | 32 | ## Practice 33 | 34 | From now on, we have two tracks, theano and tensorflow. We'll also add pytorch support soon. 35 | 36 | You can choose whichever track you want, but unless you're expertly familiar with your framework, we recommend you to start by completing the task in lasagne and only then reproduce your solution in your chosen framework. 37 | 38 | Begin with `seminar_.ipynb` and then proceed with `homework_.ipynb`. 39 | 40 | __Note: you're not required to submit assignments in all three frameworks. Pick one and go with it. Maybe switch it occasionally if you want more challenge. 
__ 41 | -------------------------------------------------------------------------------- /week4_approx_rl/framebuffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym.spaces.box import Box 3 | from gym.core import Wrapper 4 | class FrameBuffer(Wrapper): 5 | def __init__(self, env, n_frames=4, dim_order='tensorflow'): 6 | """A gym wrapper that reshapes, crops and scales image into the desired shapes""" 7 | super(FrameBuffer, self).__init__(env) 8 | self.dim_order = dim_order 9 | if dim_order == 'tensorflow': 10 | height, width, n_channels = env.observation_space.shape 11 | obs_shape = [height, width, n_channels * n_frames] 12 | elif dim_order == 'pytorch': 13 | n_channels, height, width = env.observation_space.shape 14 | obs_shape = [n_channels * n_frames, height, width] 15 | else: 16 | raise ValueError('dim_order should be "tensorflow" or "pytorch", got {}'.format(dim_order)) 17 | self.observation_space = Box(0.0, 1.0, obs_shape) 18 | self.framebuffer = np.zeros(obs_shape, 'float32') 19 | 20 | def reset(self): 21 | """resets breakout, returns initial frames""" 22 | self.framebuffer = np.zeros_like(self.framebuffer) 23 | self.update_buffer(self.env.reset()) 24 | return self.framebuffer 25 | 26 | def step(self, action): 27 | """plays breakout for 1 step, returns frame buffer""" 28 | new_img, reward, done, info = self.env.step(action) 29 | self.update_buffer(new_img) 30 | return self.framebuffer, reward, done, info 31 | 32 | def update_buffer(self, img): 33 | if self.dim_order == 'tensorflow': 34 | offset = self.env.observation_space.shape[-1] 35 | axis = -1 36 | cropped_framebuffer = self.framebuffer[:,:,:-offset] 37 | elif self.dim_order == 'pytorch': 38 | offset = self.env.observation_space.shape[0] 39 | axis = 0 40 | cropped_framebuffer = self.framebuffer[:-offset] 41 | self.framebuffer = np.concatenate([img, cropped_framebuffer], axis = axis) 42 | -------------------------------------------------------------------------------- /week4_approx_rl/replay_buffer.py: -------------------------------------------------------------------------------- 1 | # This code is shamelessly stolen from https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py 2 | import numpy as np 3 | import random 4 | 5 | class ReplayBuffer(object): 6 | def __init__(self, size): 7 | """Create Replay buffer. 8 | Parameters 9 | ---------- 10 | size: int 11 | Max number of transitions to store in the buffer. When the buffer 12 | overflows the old memories are dropped. 
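        Example (illustrative sketch)
        -----------------------------
        buf = ReplayBuffer(10 ** 4)
        buf.add(obs_t, action, reward, obs_tp1, done)
        obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = buf.sample(32)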
13 | """ 14 | self._storage = [] 15 | self._maxsize = size 16 | self._next_idx = 0 17 | 18 | def __len__(self): 19 | return len(self._storage) 20 | 21 | def add(self, obs_t, action, reward, obs_tp1, done): 22 | data = (obs_t, action, reward, obs_tp1, done) 23 | 24 | if self._next_idx >= len(self._storage): 25 | self._storage.append(data) 26 | else: 27 | self._storage[self._next_idx] = data 28 | self._next_idx = (self._next_idx + 1) % self._maxsize 29 | 30 | def _encode_sample(self, idxes): 31 | obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], [] 32 | for i in idxes: 33 | data = self._storage[i] 34 | obs_t, action, reward, obs_tp1, done = data 35 | obses_t.append(np.array(obs_t, copy=False)) 36 | actions.append(np.array(action, copy=False)) 37 | rewards.append(reward) 38 | obses_tp1.append(np.array(obs_tp1, copy=False)) 39 | dones.append(done) 40 | return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones) 41 | 42 | def sample(self, batch_size): 43 | """Sample a batch of experiences. 44 | Parameters 45 | ---------- 46 | batch_size: int 47 | How many transitions to sample. 48 | Returns 49 | ------- 50 | obs_batch: np.array 51 | batch of observations 52 | act_batch: np.array 53 | batch of actions executed given obs_batch 54 | rew_batch: np.array 55 | rewards received as results of executing act_batch 56 | next_obs_batch: np.array 57 | next set of observations seen after executing act_batch 58 | done_mask: np.array 59 | done_mask[i] = 1 if executing act_batch[i] resulted in 60 | the end of an episode and 0 otherwise. 61 | """ 62 | idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)] 63 | return self._encode_sample(idxes) 64 | -------------------------------------------------------------------------------- /week4_approx_rl/seminar_lasagne.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Approximate q-learning\n", 8 | "\n", 9 | "In this notebook you will teach a lasagne neural network to do Q-learning." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "__Frameworks__ - we'll accept this homework in any deep learning framework. For example, it translates to TensorFlow almost line-to-line. However, we recommend you to stick to theano/lasagne unless you're certain about your skills in the framework of your choice." 
17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": { 23 | "collapsed": true 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "%env THEANO_FLAGS='floatX=float32'\n", 28 | "import os\n", 29 | "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\"))==0:\n", 30 | " !bash ../xvfb start\n", 31 | " %env DISPLAY=:1" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": { 38 | "collapsed": true 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "import gym\n", 43 | "import numpy as np, pandas as pd\n", 44 | "import matplotlib.pyplot as plt\n", 45 | "%matplotlib inline" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": { 52 | "collapsed": true, 53 | "scrolled": false 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "env = gym.make(\"CartPole-v0\").env\n", 58 | "env.reset()\n", 59 | "n_actions = env.action_space.n\n", 60 | "state_dim = env.observation_space.shape\n", 61 | "\n", 62 | "plt.imshow(env.render(\"rgb_array\"))" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "# Approximate (deep) Q-learning: building the network\n", 70 | "\n", 71 | "In this section we will build and train naive Q-learning with theano/lasagne" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "First step is initializing input variables" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": { 85 | "collapsed": true 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "import theano\n", 90 | "import theano.tensor as T\n", 91 | "\n", 92 | "#create input variables. We'll support multiple states at once\n", 93 | "\n", 94 | "\n", 95 | "current_states = T.matrix(\"states[batch,units]\")\n", 96 | "actions = T.ivector(\"action_ids[batch]\")\n", 97 | "rewards = T.vector(\"rewards[batch]\")\n", 98 | "next_states = T.matrix(\"next states[batch,units]\")\n", 99 | "is_end = T.ivector(\"vector[batch] where 1 means that session just ended\")" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": { 106 | "collapsed": true 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "import lasagne\n", 111 | "from lasagne.layers import *\n", 112 | "\n", 113 | "#input layer\n", 114 | "l_states = InputLayer((None,)+state_dim)\n", 115 | "\n", 116 | "\n", 117 | "\n", 118 | "\n", 119 | "\n", 120 | "#output layer\n", 121 | "l_qvalues = DenseLayer(,num_units=n_actions,nonlinearity=None)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "#### Predicting Q-values for `current_states`" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "collapsed": true 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "#get q-values for ALL actions in current_states\n", 140 | "predicted_qvalues = get_output(l_qvalues,{l_states:current_states})" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": { 147 | "collapsed": true 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "#compiling agent's \"GetQValues\" function\n", 152 | "get_qvalues = " 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": { 159 | "collapsed": true 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "#select q-values for chosen actions\n", 164 | 
"predicted_qvalues_for_actions = predicted_qvalues[T.arange(actions.shape[0]),actions]" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "#### Loss function and `update`\n", 172 | "Here we write a function similar to `agent.update`." 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": { 179 | "collapsed": true 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "#predict q-values for next states\n", 184 | "predicted_next_qvalues = get_output(l_qvalues,{l_states:})\n", 185 | "\n", 186 | "\n", 187 | "#Computing target q-values under \n", 188 | "gamma = 0.99\n", 189 | "target_qvalues_for_actions = \n", 190 | "\n", 191 | "#zero-out q-values at the end\n", 192 | "target_qvalues_for_actions = (1-is_end)*target_qvalues_for_actions\n", 193 | "\n", 194 | "#don't compute gradient over target q-values (consider constant)\n", 195 | "target_qvalues_for_actions = theano.gradient.disconnected_grad(target_qvalues_for_actions)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": { 202 | "collapsed": true 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "\n", 207 | "#mean squared error loss function\n", 208 | "loss = \n" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": { 215 | "collapsed": true 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "#all network weights\n", 220 | "all_weights = get_all_params(l_qvalues,trainable=True)\n", 221 | "\n", 222 | "#network updates. Note the small learning rate (for stability)\n", 223 | "updates = lasagne.updates.sgd(loss,all_weights,learning_rate=1e-4)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": { 230 | "collapsed": true 231 | }, 232 | "outputs": [], 233 | "source": [ 234 | "#Training function that resembles agent.update(state,action,reward,next_state) \n", 235 | "#with 1 more argument meaning is_end\n", 236 | "train_step = theano.function([current_states,actions,rewards,next_states,is_end],\n", 237 | " updates=updates)" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "### Playing the game" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": { 251 | "collapsed": true 252 | }, 253 | "outputs": [], 254 | "source": [ 255 | "epsilon = 0.25 #initial epsilon\n", 256 | "\n", 257 | "def generate_session(t_max=1000):\n", 258 | " \"\"\"play env with approximate q-learning agent and train it at the same time\"\"\"\n", 259 | " \n", 260 | " total_reward = 0\n", 261 | " s = env.reset()\n", 262 | " \n", 263 | " for t in range(t_max):\n", 264 | " \n", 265 | " #get action q-values from the network\n", 266 | " q_values = get_qvalues([s])[0] \n", 267 | " \n", 268 | " a = \n", 269 | " \n", 270 | " new_s,r,done,info = env.step(a)\n", 271 | " \n", 272 | " #train agent one step. 
Note that we use one-element arrays instead of scalars \n", 273 | " #because that's what function accepts.\n", 274 | " train_step([s],[a],[r],[new_s],[done])\n", 275 | " \n", 276 | " total_reward+=r\n", 277 | " \n", 278 | " s = new_s\n", 279 | " if done: break\n", 280 | " \n", 281 | " return total_reward\n", 282 | " " 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": { 289 | "collapsed": true 290 | }, 291 | "outputs": [], 292 | "source": [ 293 | "for i in range(100):\n", 294 | " \n", 295 | " rewards = [generate_session() for _ in range(100)] #generate new sessions\n", 296 | " \n", 297 | " epsilon*=0.95\n", 298 | " \n", 299 | " print (\"mean reward:%.3f\\tepsilon:%.5f\"%(np.mean(rewards),epsilon))\n", 300 | "\n", 301 | " if np.mean(rewards) > 300:\n", 302 | " print (\"You Win!\")\n", 303 | " break\n", 304 | " \n", 305 | " assert epsilon!=0, \"Please explore environment\"" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "### Video" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": { 319 | "collapsed": true 320 | }, 321 | "outputs": [], 322 | "source": [ 323 | "epsilon=0 #Don't forget to reset epsilon back to initial value if you want to go on training" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": { 330 | "collapsed": true 331 | }, 332 | "outputs": [], 333 | "source": [ 334 | "#record sessions\n", 335 | "import gym.wrappers\n", 336 | "\n", 337 | "env = gym.wrappers.Monitor(gym.make(\"CartPole-v0\"),directory=\"videos\",force=True)\n", 338 | "sessions = [generate_session() for _ in range(100)]\n", 339 | "env.close()" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": { 346 | "collapsed": true 347 | }, 348 | "outputs": [], 349 | "source": [ 350 | "#show video\n", 351 | "from IPython.display import HTML\n", 352 | "import os\n", 353 | "\n", 354 | "video_names = list(filter(lambda s:s.endswith(\".mp4\"),os.listdir(\"./videos/\")))\n", 355 | "\n", 356 | "HTML(\"\"\"\n", 357 | "\n", 360 | "\"\"\".format(\"./videos/\"+video_names[-1])) #this may or may not be _last_ video. Try other indices" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "metadata": { 367 | "collapsed": true 368 | }, 369 | "outputs": [], 370 | "source": [] 371 | } 372 | ], 373 | "metadata": { 374 | "kernelspec": { 375 | "display_name": "Python 3", 376 | "language": "python", 377 | "name": "python3" 378 | }, 379 | "language_info": { 380 | "codemirror_mode": { 381 | "name": "ipython", 382 | "version": 3 383 | }, 384 | "file_extension": ".py", 385 | "mimetype": "text/x-python", 386 | "name": "python", 387 | "nbconvert_exporter": "python", 388 | "pygments_lexer": "ipython3", 389 | "version": "3.6.2" 390 | } 391 | }, 392 | "nbformat": 4, 393 | "nbformat_minor": 1 394 | } 395 | -------------------------------------------------------------------------------- /week5_explore/README.md: -------------------------------------------------------------------------------- 1 | ### Slides - [here](https://yadi.sk/i/H0zVBROe3TWWHz) 2 | 3 | ## Exploration and exploitation 4 | * [__main__] David Silver lecture on exploration and expoitation - [video](https://www.youtube.com/watch?v=sGuiWX07sKw) 5 | * Alternative lecture by J. Schulman - [video](https://www.youtube.com/watch?v=SfCa1HQMkuw) 6 | * Alternative lecture by N. 
de Freitas (with bayesian opt) - [video](https://www.youtube.com/watch?v=vz3D36VXefI) 7 | * Our lectures (russian) 8 | - "mathematical" lecture (by Alexander Vorobev) '17 - [slides](https://yadi.sk/i/JAeItALT3JmvCL), [video](https://yadi.sk/i/bVHmu9gt3Hi9Ym) 9 | - "engineering" lecture '18 - [video](https://yadi.sk/i/_myWJ13O3TdzXo) 10 | 11 | 12 | 13 | ## More materials 14 | * Gittins Index - the less heuristical approach to bandit exploration - [article](http://www.ece.mcgill.ca/~amahaj1/projects/bandits/book/2013-bandit-computations.pdf) 15 | * "Deep" version: variational information maximizing exploration - [video](https://www.youtube.com/watch?v=sRIjxxjVrnY) 16 | * Same topics in russian - [video](https://yadi.sk/i/_2_0yqeW3HDbcn) 17 | * Lecture covering intrinsically motivated reinforcement learning - [video](https://www.youtube.com/watch?v=aJI_9SoBDaQ) 18 | * [Slides](https://yadi.sk/i/8sx42nau3HEYKg) 19 | * Same topics in russian - [video](https://www.youtube.com/watch?v=WCE9hhPbCmc) 20 | * Note: UCB-1 is not for bernoulli rewards, but for arbitrary r in [0,1], so you can just scale any reward to [0,1] to obtain a peace of mind. It's derived directly from Hoeffding's inequality. 21 | 22 | ## Seminar 23 | In this seminar, you'll be solvilg basic and contextual bandits with uncertainty-based exploration like Bayesian UCB and Thompson Sampling. 24 | 25 | You will also need Bayesian Neural Networks. You will need theano/lasagne for this one: 26 | ``` 27 | # either 28 | conda install Theano 29 | # or 30 | pip install --upgrade https://github.com/Theano/Theano/archive/master.zip 31 | # and then lasagne 32 | pip install --upgrade https://github.com/Lasagne/Lasagne/archive/master.zip 33 | ``` 34 | 35 | Everything else is in the notebook :) 36 | -------------------------------------------------------------------------------- /week5_explore/action_rewards.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/week5_explore/action_rewards.npy -------------------------------------------------------------------------------- /week5_explore/all_states.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/week5_explore/all_states.npy -------------------------------------------------------------------------------- /week5_explore/bayes.py: -------------------------------------------------------------------------------- 1 | """ 2 | A single-file module that makes your lasagne network into a bayesian neural net. 3 | Originally created by github.com/ferrine , rewritten by github.com/justheuristic for simplicity 4 | 5 | See example in the notebook 6 | """ 7 | 8 | import numpy as np 9 | 10 | from theano import tensor as T 11 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 12 | 13 | import lasagne 14 | from lasagne import init 15 | from lasagne.random import get_rng 16 | 17 | from functools import wraps 18 | 19 | __all__ = ['NormalApproximation','get_var_cost','bbpwrap'] 20 | 21 | 22 | 23 | class NormalApproximation(object): 24 | def __init__(self, mu=0, std=np.exp(-3),seed=None): 25 | """ 26 | Approximation that samples network weights from factorized normal distribution. 
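        (Bayes-by-backprop style: each weight tensor gets a learned mean `mu` and a raw scale `rho`,
        the posterior std is log(1 + exp(rho)), and weights are sampled via the reparameterization trick.)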
27 | 28 | :param mu: prior mean for gaussian weights 29 | :param std: prior std for gaussian weights 30 | :param seed: random seed 31 | """ 32 | self.prior_mu = mu 33 | self.prior_std = std 34 | self.srng = RandomStreams(seed or get_rng().randint(1, 2147462579)) 35 | 36 | def log_normal(self,x, mean, std, eps=0.0): 37 | """computes log-proba of normal distribution""" 38 | std += eps 39 | return - 0.5 * np.log(2 * np.pi) - T.log(T.abs_(std)) - (x - mean) ** 2 / (2 * std ** 2) 40 | 41 | def log_prior(self, weights): 42 | """ 43 | Logarithm of prior probabilities for weights: 44 | log P(weights) aka log P(theta) 45 | """ 46 | return self.log_normal(weights, self.prior_mu, self.prior_std) 47 | 48 | def log_posterior_approx(self,weights, mean, rho): 49 | """ 50 | Logarithm of ELBO on posterior probabilities: 51 | log q(weights|learned mu and rho) aka log q(theta|x) 52 | """ 53 | std = T.log1p(T.exp(rho)) #rho to std 54 | return self.log_normal(weights, mean, std) 55 | 56 | def __call__(self, layer, spec, shape, name=None, **tags): 57 | # case when user uses default init specs 58 | assert tags.get('variational',False) == True, "Please declare param as variational to avoid confusion" 59 | 60 | if not isinstance(spec, dict): 61 | initial_rho = np.log(np.expm1(self.prior_std)) #std to rho 62 | assert np.isfinite(initial_rho),"too small std to initialize correctly. Please pass explicit"\ 63 | " initializer (dict with {'mu':mu_init, 'rho':rho_init})." 64 | spec = {'mu': spec,'rho':init.Constant(initial_rho)} 65 | 66 | 67 | mu_spec,rho_spec = spec['mu'],spec['rho'] 68 | 69 | rho = layer.add_param(rho_spec, shape,name=(name or 'unk')+'.rho', **tags) 70 | mean = layer.add_param(mu_spec, shape,name=(name or 'unk')+'.mu', **tags) 71 | 72 | #Reparameterization trick 73 | e = self.srng.normal(shape, std=1) 74 | W = mean + T.log1p(T.exp(rho)) * e 75 | 76 | #KL divergence KL(q,p) = E_(w~q(w|x)) [log q(w|x) - log P(w)] aka variational cost 77 | q_p = T.sum(self.log_posterior_approx(W, mean, rho) - self.log_prior(W)) 78 | 79 | #accumulate variational cost 80 | layer._bbwrap_var_cost += q_p 81 | return W 82 | 83 | 84 | 85 | def get_var_cost(layer_or_layers,treat_as_input=None): 86 | """ 87 | Returns total variational cost aka KL(q(theta|x)||p(theta)) for all layers in the network 88 | 89 | :param layer_or_layers: top layer(s) of your network, just like with lasagne.layers.get_output 90 | :param treat_as_input: don't accumulate over layers below these layers. 
See same param for lasagne.layers.get_all_layers 91 | 92 | Alternatively, one can manually get weights for one layer via layer.get_var_cost() 93 | """ 94 | cost = 0 95 | for layer in lasagne.layers.get_all_layers(layer_or_layers,treat_as_input): 96 | if hasattr(layer, 'get_var_cost'): #if layer is bayesian or pretends so 97 | cost += layer.get_var_cost() 98 | return cost 99 | 100 | def bbpwrap(approximation=NormalApproximation()): 101 | """ 102 | A decorator that makes arbitrary lasagne layer into a bayesian network layer: 103 | BayesDenseLayer = bbwrap()(DenseLayer) 104 | or more verbosely, 105 | @bbpwrap(NormalApproximation(pstd=0.01)) 106 | BayesDenseLayer(DenseLayer): 107 | pass 108 | 109 | """ 110 | 111 | def decorator(cls): 112 | def add_param_wrap(add_param): 113 | @wraps(add_param) 114 | def wrapped(self, spec, shape, name=None, **tags): 115 | # we should take care about some user specification 116 | # to avoid bbp hook just set tags['variational'] = True 117 | if not tags.get('trainable', True) or tags.get('variational', False): 118 | return add_param(self, spec, shape, name, **tags) 119 | else: 120 | # we declare that params we add next 121 | # are the ones we need to fit the distribution 122 | # they don't need to be regularized, strictly 123 | tags['variational'] = True 124 | tags['regularizable'] = False 125 | param = self.approximation(self, spec, shape, name, **tags) 126 | return param 127 | return wrapped 128 | 129 | def get_var_cost(self): 130 | """ 131 | Returns total variational cost aka KL(q(theta|x)||p(theta)) for this layer. 132 | Alternatively, use function get_var_cost(layer) to get total cost for all layers below this one. 133 | """ 134 | return self._bbwrap_var_cost 135 | 136 | 137 | cls.approximation = approximation 138 | cls._bbwrap_var_cost=0 139 | cls.add_param = add_param_wrap(cls.add_param) 140 | cls.get_var_cost = get_var_cost 141 | return cls 142 | 143 | 144 | return decorator 145 | -------------------------------------------------------------------------------- /week5_explore/bnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/week5_explore/bnn.png -------------------------------------------------------------------------------- /week5_explore/river_swim.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/week5_explore/river_swim.png -------------------------------------------------------------------------------- /week6_policy_based/README.md: -------------------------------------------------------------------------------- 1 | ## Materials 2 | * [Slides](https://docviewer.yandex.ru/?url=ya-disk-public%3A%2F%2FG3IXcG62RwNUGSSos%2BuGhtgXNfsBjP9RxUtUfgCffIk%3D%3A%2Flecture6.pdf&name=lecture6.pdf&c=58c876c4863a) 3 | * Video lecture by D. Silver - [video](https://www.youtube.com/watch?v=KHZVXao4qXs) 4 | * Our [lecture](https://yadi.sk/i/yPIPkO_f3TPsNK), [seminar(pytorch)](https://yadi.sk/i/flW8ezGk3TPsQ5), [seminar(theano)](https://yadi.sk/i/8f9NX_E73GKBkT) 5 | * Alternative lecture by J. Schulman part 1 - [video](https://www.youtube.com/watch?v=BB-BhTn6DCM) 6 | * Alternative lecture by J. 
Schulman part 2 - [video](https://www.youtube.com/watch?v=Wnl-Qh2UHGg) 7 | 8 | 9 | ## More materials 10 | * Actually proving the policy gradient for discounted rewards - [article](https://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf) 11 | * On variance of policy gradient and optimal baselines: [article](https://papers.nips.cc/paper/4264-analysis-and-improvement-of-policy-gradient-estimation.pdf), another [article](https://arxiv.org/pdf/1301.2315.pdf) 12 | * Generalized Advantage Estimation - a way you can speed up training for homework_*.ipynb - [article](https://arxiv.org/abs/1506.02438) 13 | 14 | 15 | * Generalizing log-derivative trick - [url](http://blog.shakirm.com/2015/11/machine-learning-trick-of-the-day-5-log-derivative-trick/) 16 | * Combining policy gradient and q-learning - [arxiv](https://arxiv.org/abs/1611.01626) 17 | * Bayesian perspective on why reparameterization & logderivative tricks matter (Vetrov's take) - [pdf](https://www.sdsj.ru/slides/Vetrov.pdf) 18 | * Adversarial review of policy gradient - [blog](http://www.argmin.net/2018/02/20/reinforce/) 19 | 20 | 21 | ## Homework 22 | 23 | As usual, pick reinfoce_.ipynb for starters and then proceed with homework_.ipynb. 24 | 25 | -------------------------------------------------------------------------------- /week6_policy_based/atari_util.py: -------------------------------------------------------------------------------- 1 | """Auxilary files for those who wanted to solve breakout with CEM or policy gradient""" 2 | import numpy as np 3 | import gym 4 | from scipy.misc import imresize 5 | from gym.core import Wrapper 6 | from gym.spaces.box import Box 7 | 8 | class PreprocessAtari(Wrapper): 9 | def __init__(self, env, height=42, width=42, color=False, crop=lambda img: img, 10 | n_frames=4, dim_order='theano', reward_scale=1,): 11 | """A gym wrapper that reshapes, crops and scales image into the desired shapes""" 12 | super(PreprocessAtari, self).__init__(env) 13 | assert dim_order in ('theano', 'tensorflow') 14 | self.img_size = (height, width) 15 | self.crop=crop 16 | self.color=color 17 | self.dim_order = dim_order 18 | self.reward_scale = reward_scale 19 | 20 | n_channels = (3 * n_frames) if color else n_frames 21 | obs_shape = [n_channels,height,width] if dim_order == 'theano' else [height,width,n_channels] 22 | self.observation_space = Box(0.0, 1.0, obs_shape) 23 | self.framebuffer = np.zeros(obs_shape, 'float32') 24 | 25 | def reset(self): 26 | """resets breakout, returns initial frames""" 27 | self.framebuffer = np.zeros_like(self.framebuffer) 28 | self.update_buffer(self.env.reset()) 29 | return self.framebuffer 30 | 31 | def step(self,action): 32 | """plays breakout for 1 step, returns frame buffer""" 33 | new_img, reward, done, info = self.env.step(action) 34 | self.update_buffer(new_img) 35 | return self.framebuffer, reward * self.reward_scale, done, info 36 | 37 | ### image processing ### 38 | 39 | def update_buffer(self,img): 40 | img = self.preproc_image(img) 41 | offset = 3 if self.color else 1 42 | if self.dim_order == 'theano': 43 | axis = 0 44 | cropped_framebuffer = self.framebuffer[:-offset] 45 | else: 46 | axis = -1 47 | cropped_framebuffer = self.framebuffer[:,:,:-offset] 48 | self.framebuffer = np.concatenate([img, cropped_framebuffer], axis = axis) 49 | 50 | def preproc_image(self, img): 51 | """what happens to the observation""" 52 | img = self.crop(img) 53 | img = imresize(img, self.img_size) 54 | if not self.color: 55 | img = 
img.mean(-1, keepdims=True) 56 | if self.dim_order == 'theano': 57 | img = img.transpose([2,0,1]) # [h, w, c] to [c, h, w] 58 | img = img.astype('float32') / 255. 59 | return img 60 | -------------------------------------------------------------------------------- /week7_[recap]_rnn/README.md: -------------------------------------------------------------------------------- 1 | ## Materials 2 | * [Slides](https://yadi.sk/i/-Iqdhg483GDyoN) 3 | * CS231 lecture on RNNs - [video](https://www.youtube.com/watch?v=iX5V1WpxxkY) 4 | * Our [lecture](https://yadi.sk/i/XHmT5hO53GcCKV), [seminar(pytorch)](https://yadi.sk/i/nCch5I8S3TsXh5), [seminar(theano)](https://yadi.sk/i/19twHESN3GcGKQ) (both russian) 5 | * [alternative] Brief lecture on RNN by nervana - [video](https://www.youtube.com/watch?v=Ukgii7Yd_cU) 6 | * [alternative] More detailed lecture by Y. Bengio - [video](https://www.youtube.com/watch?v=xK-bzjIQkmM) 7 | * Great reading by Karpathy - [blog post](http://karpathy.github.io/2015/05/21/rnn-effectiveness/) 8 | * LSTM explained in detail by colah - [blog post](http://colah.github.io/posts/2015-08-Understanding-LSTMs/) 9 | 10 | ## More materials 11 | * Seq2seq lecture - [video](https://www.youtube.com/watch?v=G5RY_SUJih4) 12 | * "Awesome rnn" entry point - [repo](https://github.com/kjw0612/awesome-rnn) 13 | * OpenAI research on sentiment analysis that sheds some light on what's inside LSTM language model. 14 | 15 | # Homework description 16 | 17 | This week's practice gets you acquainted with basics of recurrent neural networks. For simplicity, we'll train them on character language modelling task. Pick any one of `seminar_lasagne`, `seminar_lasagne_ingraph` or `seminar_tf`. 18 | 19 | As for difference btwn `seminar_lasagne` and `seminar_lasagne_ingraph` - ingraph version shows a lower-level interface to recurrent neural networks. It also requires you to install `pip install https://github.com/yandexdataschool/agentnet/archive/master.zip`. Out-of-graph version cover higher-level syntax from native lasagne. 20 | -------------------------------------------------------------------------------- /week7_[recap]_rnn/rnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/week7_[recap]_rnn/rnn.png -------------------------------------------------------------------------------- /week7_pomdp/README.md: -------------------------------------------------------------------------------- 1 | # Materials 2 | [lecture slides](https://yadi.sk/d/RGx8BUCr3Gq6DC) 3 | 4 | _Links on all articles mentioned during the lecture could be found in "References" at the very end of the presentation slides. All other interesing links which contribute to the topic of POMDP are presented below_ 5 | 6 | ## Basics 7 | * Our [lecture](https://yadi.sk/i/AHzpTjiT3U8L8e) and [seminar](https://yadi.sk/i/Ka-I7nBp3U8LAG) (russian) 8 | * A lecture on basics by Andrew NG (english, LQ) - [video](https://www.youtube.com/watch?v=yCqPMD6coO8) 9 | * A lecture on basics by 5vision (russian) - [video](https://www.youtube.com/watch?v=_dkaynuKUFE) 10 | * _[alternative]_ Chalkboard-style 2-part lecture by B. Ravindran. - [part1](https://www.youtube.com/watch?v=9G_KevA8DFY), [part2](https://www.youtube.com/watch?v=dMOUp7YzUpQ) 11 | * _[alternative]_ Yet another mini-lecture touching on POMDP by S.S. 
Baveja - [video](https://www.youtube.com/watch?v=SE56KgF7aVc) 12 | 13 | ## POMDP Learning 14 | * DRQN lecture by Fritz448 (russian) - [video](https://www.youtube.com/watch?v=bE5DIJvZexc) 15 | * [Data efficient learning in continous POMDP](https://arxiv.org/abs/1602.02523v1) 16 | * [Managing wind farms with bayesian POMDP](http://ascelibrary.org/doi/abs/10.1061/(ASCE)CP.1943-5487.0000390) 17 | * [Bayesian learning and decision-making in dynamic environments](http://www.jmlr.org/papers/volume12/ross11a/ross11a.pdf) 18 | 19 | 20 | 21 | 22 | --- 23 | 24 | # Practice 25 | 26 | 27 | The assignment is platform and framewerk independent, so choose the framework that suits you best, but pay attention on how many you will need to implement youself in case of nonstandart ones. 28 | -------------------------------------------------------------------------------- /week7_pomdp/atari_util.py: -------------------------------------------------------------------------------- 1 | """Auxilary files for those who wanted to solve breakout with CEM or policy gradient""" 2 | import numpy as np 3 | import gym 4 | from scipy.misc import imresize 5 | from gym.core import Wrapper 6 | from gym.spaces.box import Box 7 | 8 | class PreprocessAtari(Wrapper): 9 | def __init__(self, env, height=42, width=42, color=False, 10 | crop=lambda img: img, n_frames=4, dim_order='theano'): 11 | """A gym wrapper that reshapes, crops and scales image into the desired shapes""" 12 | super(PreprocessAtari, self).__init__(env) 13 | assert dim_order in ('theano', 'tensorflow') 14 | self.img_size = (height,width) 15 | self.crop=crop 16 | self.color=color 17 | self.dim_order = dim_order 18 | 19 | n_channels = (3 * n_frames) if color else n_frames 20 | obs_shape = [n_channels,height,width] if dim_order == 'theano' else [height,width,n_channels] 21 | self.observation_space = Box(0.0, 1.0, obs_shape) 22 | self.framebuffer = np.zeros(obs_shape, 'float32') 23 | 24 | def reset(self): 25 | """resets breakout, returns initial frames""" 26 | self.framebuffer = np.zeros_like(self.framebuffer) 27 | self.update_buffer(self.env.reset()) 28 | return self.framebuffer 29 | 30 | def step(self,action): 31 | """plays breakout for 1 step, returns frame buffer""" 32 | new_img,r,done,info = self.env.step(action) 33 | self.update_buffer(new_img) 34 | return self.framebuffer,r,done,info 35 | 36 | ### image processing ### 37 | 38 | def update_buffer(self,img): 39 | img = self.preproc_image(img) 40 | offset = 3 if self.color else 1 41 | if self.dim_order == 'theano': 42 | axis = 0 43 | cropped_framebuffer = self.framebuffer[:-offset] 44 | else: 45 | axis = -1 46 | cropped_framebuffer = self.framebuffer[:,:,:-offset] 47 | self.framebuffer = np.concatenate([img, cropped_framebuffer], axis = axis) 48 | 49 | def preproc_image(self, img): 50 | """what happens to the observation""" 51 | img = self.crop(img) 52 | img = imresize(img, self.img_size) 53 | if not self.color: 54 | img = img.mean(-1, keepdims=True) 55 | if self.dim_order == 'theano': 56 | img = img.transpose([2,0,1]) # [h, w, c] to [c, h, w] 57 | img = img.astype('float32')/255. 58 | return img 59 | -------------------------------------------------------------------------------- /week7_pomdp/env_pool.py: -------------------------------------------------------------------------------- 1 | """ 2 | A thin wrapper for openAI gym environments that maintains a set of parallel games and has a method to generate 3 | interaction sessions given agent one-step applier function. 
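Typical use (sketch): pool = EnvPool(agent, make_env, n_parallel_games=10), then
obs_seq, action_seq, reward_seq, is_alive_seq = pool.interact(n_steps=100).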
4 | """ 5 | 6 | import numpy as np 7 | 8 | # A whole lot of space invaders 9 | class EnvPool(object): 10 | def __init__(self, agent, make_env, n_parallel_games=1): 11 | """ 12 | A special class that handles training on multiple parallel sessions 13 | and is capable of some auxilary actions like evaluating agent on one game session (See .evaluate()). 14 | 15 | :param agent: Agent which interacts with the environment. 16 | :param make_env: Factory that produces environments OR a name of the gym environment. 17 | :param n_games: Number of parallel games. One game by default. 18 | :param max_size: Max pool size by default (if appending sessions). By default, pool is not constrained in size. 19 | """ 20 | # Create atari games. 21 | self.agent = agent 22 | self.make_env = make_env 23 | self.envs = [self.make_env() for _ in range(n_parallel_games)] 24 | 25 | # Initial observations. 26 | self.prev_observations = [env.reset() for env in self.envs] 27 | 28 | # Agent memory variables (if you use recurrent networks). 29 | self.prev_memory_states = agent.get_initial_state(n_parallel_games) 30 | 31 | # Whether particular session has just been terminated and needs restarting. 32 | self.just_ended = [False] * len(self.envs) 33 | 34 | def interact(self, n_steps=100, verbose=False): 35 | """Generate interaction sessions with ataries (openAI gym atari environments) 36 | Sessions will have length n_steps. Each time one of games is finished, it is immediately getting reset 37 | and this time is recorded in is_alive_log (See returned values). 38 | 39 | :param n_steps: Length of an interaction. 40 | :returns: observation_seq, action_seq, reward_seq, is_alive_seq 41 | :rtype: a bunch of tensors [batch, tick, ...] 42 | """ 43 | 44 | def env_step(i, action): 45 | if not self.just_ended[i]: 46 | new_observation, cur_reward, is_done, info = self.envs[i].step(action) 47 | if is_done: 48 | # Game ends now, will finalize on next tick. 49 | self.just_ended[i] = True 50 | 51 | # note: is_alive=True in any case because environment is still alive (last tick alive) in our notation. 52 | return new_observation, cur_reward, True, info 53 | else: 54 | # Reset environment, get new observation to be used on next tick. 55 | new_observation = self.envs[i].reset() 56 | 57 | # Reset memory for new episode. 58 | initial_memory_state = self.agent.get_initial_state(batch_size=1) 59 | for m_i in range(len(new_memory_states)): 60 | new_memory_states[m_i][i] = initial_memory_state[m_i][0] 61 | 62 | if verbose: 63 | print("env %i reloaded" % i) 64 | 65 | self.just_ended[i] = False 66 | 67 | return new_observation, 0, False, {'end': True} 68 | 69 | history_log = [] 70 | 71 | for i in range(n_steps - 1): 72 | new_memory_states, readout = self.agent.step(self.prev_memory_states, self.prev_observations) 73 | actions = self.agent.sample_actions(readout) 74 | 75 | new_observations, cur_rewards, is_alive, infos = zip(*map(env_step, range(len(self.envs)), actions)) 76 | 77 | # Append data tuple for this tick. 78 | history_log.append((self.prev_observations, actions, cur_rewards, is_alive)) 79 | 80 | self.prev_observations = new_observations 81 | self.prev_memory_states = new_memory_states 82 | 83 | #add last observation 84 | dummy_actions = [0] * len(self.envs) 85 | dummy_rewards = [0] * len(self.envs) 86 | dummy_mask = [1] * len(self.envs) 87 | history_log.append((self.prev_observations, dummy_actions, dummy_rewards, dummy_mask)) 88 | 89 | # cast to numpy arrays, transpose from [time, batch, ...] to [batch, time, ...] 
90 | history_log = [np.array(tensor).swapaxes(0, 1) for tensor in zip(*history_log)] 91 | observation_seq, action_seq, reward_seq, is_alive_seq = history_log 92 | 93 | return observation_seq, action_seq, reward_seq, is_alive_seq -------------------------------------------------------------------------------- /week7_pomdp/img1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/week7_pomdp/img1.jpg -------------------------------------------------------------------------------- /week7_pomdp/img2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/week7_pomdp/img2.jpg -------------------------------------------------------------------------------- /week7_pomdp/img3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/week7_pomdp/img3.jpg -------------------------------------------------------------------------------- /week8_scst/README.md: -------------------------------------------------------------------------------- 1 | ## Materials 2 | * [Slides](https://yadi.sk/i/2oUkKL8m3UFFe8) 3 | * Our [lecture](https://yadi.sk/i/hmjUfKht3UNCSq) & [seminar](https://yadi.sk/i/dTkWTFNj3UNCTv) (russian) 4 | * English lectures 5 | * Lecture by Mohammad Norouzi - [cs294 video](https://www.youtube.com/watch?v=fZNyHoXgV7M&index=24&list=PLkFD6_40KJIwTmSbCv9OVJB3YaO4sFwkX) 6 | * Optional lecture on conversation systems - [video](https://www.youtube.com/watch?v=2tKNpzUvDc4 ) 7 | * Will hopefully record our lecture in english soon! 8 | * Self-critical sequence traning [original article](https://arxiv.org/abs/1612.00563) 9 | 10 | ## Practice 11 | As usual, go to practice_{your framework}.ipynb above and follow instructions from there. [pytorch](https://github.com/yandexdataschool/Practical_RL/blob/master/week8_scst/practice_torch.ipynb), [tensorflow](https://github.com/yandexdataschool/Practical_RL/blob/master/week8_scst/practice_tf.ipynb), [theano](https://github.com/yandexdataschool/Practical_RL/blob/master/week8_scst/practice_theano.ipynb) 12 | 13 | Binder quickstart (lasts 1 hour): [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/yandexdataschool/Practical_RL/master) 14 | 15 | ## More materials 16 | * An [awesome post](http://distill.pub/2016/augmented-rnns/) explaining attention and long-term memory models. 17 | * [BLEU](http://www.aclweb.org/anthology/P02-1040.pdf) and [CIDEr](https://arxiv.org/pdf/1411.5726.pdf) articles. 18 | * Image captioning 19 | * MSCOCO captioning [challenge](http://mscoco.org/dataset/#captions-challenge2015) 20 | * Captioning baseline [notebook](https://github.com/yandexdataschool/HSE_deeplearning/blob/master/week7/captioning_solution_ars.ipynb) 21 | * Other articles on reinforcement learning for natural language: 22 | * [task-oriented conversation system](https://arxiv.org/abs/1703.07055) 23 | * [generating dialogues](https://arxiv.org/abs/1606.01541) 24 | * [sequential adversarial networks](https://arxiv.org/abs/1609.05473) (a.k.a. 
SeqGAN) 25 | * A large overview for machine translation (touching on RL, including RL failures) - [arxiv](https://arxiv.org/abs/1609.08144) 26 | * How _not_ to evaluate conversation models - [arxiv](https://arxiv.org/abs/1603.08023) 27 | * Overview of other non-games applications ("that article again") - [arxiv](https://arxiv.org/abs/1701.07274) 28 | 29 | -------------------------------------------------------------------------------- /week8_scst/basic_model_tf.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import keras.layers as L 3 | 4 | # This code implements a single-GRU seq2seq model. You will have to improve it later in the assignment. 5 | # Note 1: when using several recurrent layers TF can mixed up the weights of different recurrent layers. 6 | # In that case, make sure you both create AND use each rnn/gru/lstm/custom layer in a unique variable scope 7 | # e.g. with tf.variable_scope("first_lstm"): new_cell, new_out = self.lstm_1(...) 8 | # with tf.variable_scope("second_lstm"): new_cell2, new_out2 = self.lstm_2(...) 9 | # Note 2: everything you need for decoding should be stored in model state (output list of both encode and decode) 10 | # e.g. for attention, you should store all encoder sequence and input mask there in addition to lstm/gru states. 11 | 12 | class BasicTranslationModel: 13 | def __init__(self, name, inp_voc, out_voc, 14 | emb_size, hid_size,): 15 | 16 | self.name = name 17 | self.inp_voc = inp_voc 18 | self.out_voc = out_voc 19 | 20 | with tf.variable_scope(name): 21 | self.emb_inp = L.Embedding(len(inp_voc), emb_size) 22 | self.emb_out = L.Embedding(len(out_voc), emb_size) 23 | self.enc0 = tf.nn.rnn_cell.GRUCell(hid_size) 24 | self.dec_start = L.Dense(hid_size) 25 | self.dec0 = tf.nn.rnn_cell.GRUCell(hid_size) 26 | self.logits = L.Dense(len(out_voc)) 27 | 28 | 29 | # run on dummy output to .build all layers (and therefore create weights) 30 | inp = tf.placeholder('int32', [None, None]) 31 | out = tf.placeholder('int32', [None, None]) 32 | h0 = self.encode(inp) 33 | h1 = self.decode(h0,out[:,0]) 34 | # h2 = self.decode(h1,out[:,1]) etc. 
35 | 36 | self.weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name) 37 | 38 | 39 | def encode(self, inp, **flags): 40 | """ 41 | Takes symbolic input sequence, computes initial state 42 | :param inp: matrix of input tokens [batch, time] 43 | :return: a list of initial decoder state tensors 44 | """ 45 | inp_lengths = infer_length(inp, self.inp_voc.eos_ix) 46 | inp_emb = self.emb_inp(inp) 47 | 48 | _, enc_last = tf.nn.dynamic_rnn( 49 | self.enc0, inp_emb, 50 | sequence_length=inp_lengths, 51 | dtype = inp_emb.dtype) 52 | 53 | dec_start = self.dec_start(enc_last) 54 | return [dec_start] 55 | 56 | def decode(self, prev_state, prev_tokens, **flags): 57 | """ 58 | Takes previous decoder state and tokens, returns new state and logits 59 | :param prev_state: a list of previous decoder state tensors 60 | :param prev_tokens: previous output tokens, an int vector of [batch_size] 61 | :return: a list of next decoder state tensors, a tensor of logits [batch,n_tokens] 62 | """ 63 | 64 | [prev_dec] = prev_state 65 | 66 | prev_emb = self.emb_out(prev_tokens[:,None])[:,0] 67 | 68 | new_dec_out,new_dec_state = self.dec0(prev_emb, prev_dec) 69 | 70 | output_logits = self.logits(new_dec_out) 71 | 72 | return [new_dec_state], output_logits 73 | 74 | def symbolic_score(self, inp, out, eps=1e-30, **flags): 75 | """ 76 | Takes symbolic int32 matrices of hebrew words and their english translations. 77 | Computes the log-probabilities of all possible english characters given english prefices and hebrew word. 78 | :param inp: input sequence, int32 matrix of shape [batch,time] 79 | :param out: output sequence, int32 matrix of shape [batch,time] 80 | :return: log-probabilities of all possible english characters of shape [bath,time,n_tokens] 81 | 82 | NOTE: log-probabilities time axis is synchronized with out 83 | In other words, logp are probabilities of __current__ output at each tick, not the next one 84 | therefore you can get likelihood as logprobas * tf.one_hot(out,n_tokens) 85 | """ 86 | first_state = self.encode(inp,**flags) 87 | 88 | batch_size = tf.shape(inp)[0] 89 | bos = tf.fill([batch_size],self.out_voc.bos_ix) 90 | first_logits = tf.log(tf.one_hot(bos, len(self.out_voc)) + eps) 91 | 92 | def step(blob, y_prev): 93 | h_prev = blob[:-1] 94 | h_new, logits = self.decode(h_prev, y_prev, **flags) 95 | return list(h_new) + [logits] 96 | 97 | results = tf.scan(step,initializer=list(first_state)+[first_logits], 98 | elems=tf.transpose(out)) 99 | 100 | # gather state and logits, each of shape [time,batch,...] 101 | states_seq, logits_seq = results[:-1], results[-1] 102 | 103 | # add initial state and logits 104 | logits_seq = tf.concat((first_logits[None], logits_seq),axis=0) 105 | states_seq = [tf.concat((init[None], states), axis=0) 106 | for init, states in zip(first_state, states_seq)] 107 | 108 | #convert from [time,batch,...] to [batch,time,...] 109 | logits_seq = tf.transpose(logits_seq, [1, 0, 2]) 110 | states_seq = [tf.transpose(states, [1, 0] + list(range(2, states.shape.ndims))) 111 | for states in states_seq] 112 | 113 | return tf.nn.log_softmax(logits_seq) 114 | 115 | def symbolic_translate(self, inp, greedy=False, max_len = None, eps = 1e-30, **flags): 116 | """ 117 | takes symbolic int32 matrix of hebrew words, produces output tokens sampled 118 | from the model and output log-probabilities for all possible tokens at each tick. 119 | :param inp: input sequence, int32 matrix of shape [batch,time] 120 | :param greedy: if greedy, takes token with highest probablity at each tick. 
121 | Otherwise samples proportionally to probability. 122 | :param max_len: max length of output, defaults to 2 * input length 123 | :return: output tokens int32[batch,time] and 124 | log-probabilities of all tokens at each tick, [batch,time,n_tokens] 125 | """ 126 | first_state = self.encode(inp, **flags) 127 | 128 | batch_size = tf.shape(inp)[0] 129 | bos = tf.fill([batch_size],self.out_voc.bos_ix) 130 | first_logits = tf.log(tf.one_hot(bos, len(self.out_voc)) + eps) 131 | max_len = tf.reduce_max(tf.shape(inp)[1])*2 132 | 133 | def step(blob,t): 134 | h_prev, y_prev = blob[:-2], blob[-1] 135 | h_new, logits = self.decode(h_prev, y_prev, **flags) 136 | y_new = tf.argmax(logits,axis=-1) if greedy else tf.multinomial(logits,1)[:,0] 137 | return list(h_new) + [logits, tf.cast(y_new,y_prev.dtype)] 138 | 139 | results = tf.scan(step, initializer=list(first_state) + [first_logits, bos], 140 | elems=[tf.range(max_len)]) 141 | 142 | # gather state, logits and outs, each of shape [time,batch,...] 143 | states_seq, logits_seq, out_seq = results[:-2], results[-2], results[-1] 144 | 145 | # add initial state, logits and out 146 | logits_seq = tf.concat((first_logits[None],logits_seq),axis=0) 147 | out_seq = tf.concat((bos[None], out_seq), axis=0) 148 | states_seq = [tf.concat((init[None], states), axis=0) 149 | for init, states in zip(first_state, states_seq)] 150 | 151 | #convert from [time,batch,...] to [batch,time,...] 152 | logits_seq = tf.transpose(logits_seq, [1, 0, 2]) 153 | out_seq = tf.transpose(out_seq) 154 | states_seq = [tf.transpose(states, [1, 0] + list(range(2, states.shape.ndims))) 155 | for states in states_seq] 156 | 157 | return out_seq, tf.nn.log_softmax(logits_seq) 158 | 159 | 160 | 161 | ### Utility functions ### 162 | 163 | def initialize_uninitialized(sess = None): 164 | """ 165 | Initialize unitialized variables, doesn't affect those already initialized 166 | :param sess: in which session to initialize stuff. 
Defaults to tf.get_default_session() 167 | """ 168 | sess = sess or tf.get_default_session() 169 | global_vars = tf.global_variables() 170 | is_not_initialized = sess.run([tf.is_variable_initialized(var) for var in global_vars]) 171 | not_initialized_vars = [v for (v, f) in zip(global_vars, is_not_initialized) if not f] 172 | 173 | if len(not_initialized_vars): 174 | sess.run(tf.variables_initializer(not_initialized_vars)) 175 | 176 | def infer_length(seq, eos_ix, time_major=False, dtype=tf.int32): 177 | """ 178 | compute length given output indices and eos code 179 | :param seq: tf matrix [time,batch] if time_major else [batch,time] 180 | :param eos_ix: integer index of end-of-sentence token 181 | :returns: lengths, int32 vector of shape [batch] 182 | """ 183 | axis = 0 if time_major else 1 184 | is_eos = tf.cast(tf.equal(seq, eos_ix), dtype) 185 | count_eos = tf.cumsum(is_eos,axis=axis,exclusive=True) 186 | lengths = tf.reduce_sum(tf.cast(tf.equal(count_eos,0),dtype),axis=axis) 187 | return lengths 188 | 189 | def infer_mask(seq, eos_ix, time_major=False, dtype=tf.float32): 190 | """ 191 | compute mask given output indices and eos code 192 | :param seq: tf matrix [time,batch] if time_major else [batch,time] 193 | :param eos_ix: integer index of end-of-sentence token 194 | :returns: mask, float32 matrix with '0's and '1's of same shape as seq 195 | """ 196 | axis = 0 if time_major else 1 197 | lengths = infer_length(seq, eos_ix, time_major=time_major) 198 | mask = tf.sequence_mask(lengths, maxlen=tf.shape(seq)[axis], dtype=dtype) 199 | if time_major: mask = tf.transpose(mask) 200 | return mask 201 | 202 | 203 | def select_values_over_last_axis(values, indices): 204 | """ 205 | Auxiliary function to select logits corresponding to chosen tokens. 206 | :param values: logits for all actions: float32[batch,tick,action] 207 | :param indices: action ids int32[batch,tick] 208 | :returns: values selected for the given actions: float[batch,tick] 209 | """ 210 | assert values.shape.ndims == 3 and indices.shape.ndims == 2 211 | batch_size, seq_len = tf.shape(indices)[0], tf.shape(indices)[1] 212 | batch_i = tf.tile(tf.range(0,batch_size)[:, None],[1,seq_len]) 213 | time_i = tf.tile(tf.range(0,seq_len)[None, :],[batch_size,1]) 214 | indices_nd = tf.stack([batch_i, time_i, indices], axis=-1) 215 | 216 | return tf.gather_nd(values,indices_nd) 217 | 218 | 219 | 220 | -------------------------------------------------------------------------------- /week8_scst/basic_model_theano.py: -------------------------------------------------------------------------------- 1 | # code by https://github.com/deniskamazur 2 | 3 | from lasagne.layers import * 4 | import theano.tensor as T 5 | import theano 6 | 7 | from agentnet.memory import LSTMCell, GRUCell, AttentionLayer 8 | from agentnet import Recurrence 9 | from agentnet.learning.generic import get_mask_by_eos 10 | from agentnet.resolver import ProbabilisticResolver 11 | from agentnet.utils import reapply 12 | 13 | 14 | class BasicTranslationModel: 15 | def __init__(self, inp_voc, out_voc, emb_size, hid_size, **kwargs): 16 | """ 17 | A simple interface for mt 18 | :param emb_size: Embedding size 19 | :param hid_size: Number of LSTM units 20 | :param bidereactional: If the nLSTM layers should be bidirectional 21 | :param input_dropout: Dropout after embedding layer 22 | :param recurrent_dropout: Dropout after each LSTM iteration 23 | :param rdo_size: If int - use dense layer after neck in decoder, if none don't 24 | :param peepholes: 
http://colah.github.io/posts/2015-08-Understanding-LSTMs/img/LSTM3-var-peepholes.png 25 | :param kwargs: recurrence flags 26 | """ 27 | self.inp_voc = inp_voc 28 | self.out_voc = out_voc 29 | # encode input sequence 30 | class encoder: 31 | # intput layers 32 | inp = InputLayer((None, None)) 33 | mask = ExpressionLayer(inp, lambda x: get_mask_by_eos(T.eq(x, self.out_voc.eos_ix))) 34 | 35 | # embed the tokens 36 | emb = EmbeddingLayer(inp, input_size=len(inp_voc), 37 | output_size=emb_size) 38 | 39 | rnn_fw = GRULayer(emb, num_units=hid_size, mask_input=mask, 40 | only_return_final=True) 41 | 42 | dec_start = DenseLayer(rnn_fw,hid_size,nonlinearity=None) 43 | 44 | # make encoder a public field 45 | self.encoder = encoder 46 | 47 | # decoder the encoded sequence 48 | class decoder: 49 | # decoder previous memory and tokens 50 | prev_hid = InputLayer((None, hid_size), name='prev hidden state') 51 | inp = InputLayer((None,), name="prev phoneme") 52 | 53 | emb = EmbeddingLayer(inp, len(out_voc), emb_size) 54 | 55 | new_hid = GRUCell(prev_hid, emb) 56 | 57 | logits = DenseLayer(new_hid, len(out_voc), nonlinearity=None) 58 | 59 | probs = NonlinearityLayer(logits, nonlinearity=T.nnet.softmax) 60 | logprobs = NonlinearityLayer(logits, nonlinearity=T.nnet.logsoftmax) 61 | out = ProbabilisticResolver(probs, assume_normalized=True) 62 | 63 | state_dict = { 64 | new_hid: prev_hid, 65 | # ^^^ this reads "at next step, new_hid will become prev_hid" 66 | # if you add any more recurrent memory units, 67 | # please make sure they're here 68 | } 69 | 70 | init_dict = { 71 | new_hid:encoder.dec_start 72 | # ^^^ this reads "before first step, new_hid is set to outputs of dec_start" 73 | # if you add any more recurrent memory units with non-zero init 74 | # please make sure they're here 75 | } 76 | 77 | nonseq_dict = { 78 | # here you can add anything encoder needs that's gonna be same across time-steps 79 | } 80 | 81 | self.decoder = decoder 82 | 83 | top_layers = [encoder.dec_start,decoder.out] + list(decoder.state_dict.keys()) 84 | self.weights = get_all_params(top_layers, trainable=True) 85 | 86 | def symbolic_score(self, inp, out, eps=1e-30, **flags): 87 | """ 88 | Takes symbolic int32 matrices of hebrew words and their english translations. 89 | Computes the log-probabilities of all possible english characters given english prefices and hebrew word. 
90 | :param inp: input sequence, int32 matrix of shape [batch,time] 91 | :param out: output sequence, int32 matrix of shape [batch,time] 92 | :return: log-probabilities of all possible english characters of shape [bath,time,n_tokens] 93 | 94 | NOTE: log-probabilities time axis is synchronized with out 95 | In other words, logp are probabilities of __current__ output at each tick, not the next one 96 | therefore you can get likelihood as logprobas * tf.one_hot(out,n_tokens) 97 | """ 98 | 99 | l_output_sequence = InputLayer([None,None]) 100 | 101 | # Defining custom recurrent layer out of decoder 102 | rec = Recurrence( 103 | state_variables=self.decoder.state_dict, 104 | state_init=self.decoder.init_dict, 105 | input_sequences={self.decoder.inp:l_output_sequence}, 106 | input_nonsequences=self.decoder.nonseq_dict, 107 | tracked_outputs=self.decoder.logprobs, 108 | unroll_scan=False 109 | ) 110 | 111 | feed_dict = { 112 | self.encoder.inp: inp, 113 | l_output_sequence: out 114 | } 115 | logprobs = get_output(rec[self.decoder.logprobs], feed_dict, 116 | recurrence_flags=flags, **flags) 117 | 118 | self.auto_updates = rec.get_automatic_updates() 119 | if len(self.auto_updates) != 0: 120 | print("symbolic_score: Please collect auto_updates of random states " 121 | "after you called symbolic_score (available at model.auto_updates)!") 122 | 123 | 124 | first_logprobs = T.zeros_like(logprobs[:,:1]) 125 | logprobs = T.concatenate([first_logprobs,logprobs[:,:-1]],axis=1) 126 | 127 | return logprobs 128 | 129 | 130 | 131 | def symbolic_translate(self, inp, greedy=False, max_len = None, 132 | unroll_scan=False, eps = 1e-30, **flags): 133 | """ 134 | takes symbolic int32 matrix of hebrew words, produces output tokens sampled 135 | from the model and output log-probabilities for all possible tokens at each tick. 136 | :param inp: input sequence, int32 matrix of shape [batch,time] 137 | :param greedy: if greedy, takes token with highest probablity at each tick. 138 | Otherwise samples proportionally to probability. 139 | :param max_len: max length of output, defaults to 2 * input length 140 | :param unroll_scan: if True, compiles longer but runs faster. 
141 | requires max_len to be constant 142 | :return: output tokens int32[batch,time] and 143 | log-probabilities of all tokens at each tick, [batch,time,n_tokens] 144 | """ 145 | if unroll_scan: 146 | assert isinstance(max_len,int), "if scan is unrolled, max_len must be a constant integer" 147 | 148 | max_len = max_len if max_len is not None else 2 * inp.shape[1] 149 | 150 | # initial output tokens (BOS) 151 | bos = T.zeros_like(inp[:, 0]) + self.out_voc.bos_ix 152 | l_start = InputLayer((None,),bos) 153 | 154 | # Defining custom recurrent layer out of decoder 155 | rec = Recurrence( 156 | state_variables=merge_dicts(self.decoder.state_dict, 157 | {self.decoder.out: self.decoder.inp}), 158 | state_init=merge_dicts(self.decoder.init_dict, {self.decoder.out: l_start}), 159 | input_nonsequences=self.decoder.nonseq_dict, 160 | tracked_outputs=(self.decoder.out, self.decoder.probs, self.decoder.logprobs), 161 | n_steps=max_len, 162 | unroll_scan=unroll_scan 163 | ) 164 | 165 | translations, logprobs = get_output(rec[self.decoder.out, self.decoder.logprobs], 166 | {self.encoder.inp:inp, 167 | l_start:bos}, 168 | recurrence_flags=dict(flags,greedy=greedy), 169 | **flags) 170 | 171 | self.auto_updates = rec.get_automatic_updates() 172 | if len(self.auto_updates) != 0: 173 | print("symbolic_translate: Please collect auto_updates of random states " 174 | "after you called symbolic_translate (available at model.auto_updates)!") 175 | 176 | # add first step (bos) 177 | translations = T.concatenate([bos[:,None],translations],axis=1) 178 | first_logprobs = T.zeros_like(logprobs[:,:1]) 179 | logprobs = T.concatenate([first_logprobs,logprobs],axis=1) 180 | 181 | return translations,logprobs 182 | 183 | 184 | def merge_dicts(*dicts, **kwargs): 185 | """ 186 | Melts several dicts into one. Useful when messing with feed dicts 187 | :param dicts: dictionaries 188 | :param check_conflicts: if True, raises error if several dicts have the same key 189 | Otherwise uses the key from the latest dict in *dicts 190 | :return: a dict that contains k-v pairs from all *dicts 191 | """ 192 | merged_dict = {} 193 | for d in dicts: 194 | merged_dict.update(d) 195 | if kwargs.get('check_conflicts'): 196 | assert len(merged_dict) == sum(map(len, dicts)), 'dicts have duplicate keys' 197 | return merged_dict 198 | 199 | -------------------------------------------------------------------------------- /week8_scst/basic_model_torch.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | 6 | # Note: unlike official pytorch tutorial, this model doesn't process one sample at a time 7 | # because it's slow on GPU. instead it uses masks just like ye olde theano/tensorflow. 8 | # it doesn't use torch.nn.utils.rnn.pack_paded_sequence because reasons. 
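# A minimal usage sketch of the masking idea from the note above: average the log-likelihood of
# the reference tokens over real (non-padded) positions only. It assumes `logp_seq` is the
# [batch, time, n_tokens] output of BasicTranslationModel.forward below and relies on the
# infer_mask helper defined at the bottom of this file; the function name itself is illustrative.
def masked_loglikelihood_sketch(logp_seq, out, eos_ix):
    """Mean log-probability of reference tokens, ignoring all positions after the first EOS."""
    mask = infer_mask(out, eos_ix)                                    # float mask [batch, time]
    logp_out = torch.gather(logp_seq, 2, out.unsqueeze(2)).squeeze(2) # logp of chosen tokens [batch, time]
    return torch.sum(logp_out * mask) / torch.sum(mask)              # average over real tokens only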
9 | 10 | class BasicTranslationModel(nn.Module): 11 | def __init__(self, inp_voc, out_voc, 12 | emb_size, hid_size,): 13 | super(self.__class__, self).__init__() 14 | self.inp_voc = inp_voc 15 | self.out_voc = out_voc 16 | 17 | self.emb_inp = nn.Embedding(len(inp_voc), emb_size) 18 | self.emb_out = nn.Embedding(len(out_voc), emb_size) 19 | self.enc0 = nn.GRU(emb_size, hid_size, batch_first=True) 20 | self.dec_start = nn.Linear(hid_size, hid_size) 21 | self.dec0 = nn.GRUCell(emb_size, hid_size) 22 | self.logits = nn.Linear(hid_size, len(out_voc)) 23 | 24 | def encode(self, inp, **flags): 25 | """ 26 | Takes symbolic input sequence, computes initial state 27 | :param inp: a vector of input tokens (Variable, int64, 1d) 28 | :return: a list of initial decoder state tensors 29 | """ 30 | inp_emb = self.emb_inp(inp) 31 | enc_seq, _ = self.enc0(inp_emb) 32 | 33 | # select last element w.r.t. mask 34 | end_index = infer_length(inp, self.inp_voc.eos_ix) 35 | end_index[end_index >= inp.shape[1]] = inp.shape[1] - 1 36 | enc_last = enc_seq[range(0, enc_seq.shape[0]), end_index.detach(), :] 37 | 38 | dec_start = self.dec_start(enc_last) 39 | return [dec_start] 40 | 41 | def decode(self, prev_state, prev_tokens, **flags): 42 | """ 43 | Takes previous decoder state and tokens, returns new state and logits 44 | :param prev_state: a list of previous decoder state tensors 45 | :param prev_tokens: previous output tokens, an int vector of [batch_size] 46 | :return: a list of next decoder state tensors, a tensor of logits [batch,n_tokens] 47 | """ 48 | [prev_dec] = prev_state 49 | 50 | prev_emb = self.emb_out(prev_tokens) 51 | new_dec_state = self.dec0(prev_emb, prev_dec) 52 | output_logits = self.logits(new_dec_state) 53 | 54 | return [new_dec_state], output_logits 55 | 56 | def forward(self, inp, out, eps=1e-30, **flags): 57 | """ 58 | Takes symbolic int32 matrices of hebrew words and their english translations. 59 | Computes the log-probabilities of all possible english characters given english prefices and hebrew word. 60 | :param inp: input sequence, int32 matrix of shape [batch,time] 61 | :param out: output sequence, int32 matrix of shape [batch,time] 62 | :return: log-probabilities of all possible english characters of shape [bath,time,n_tokens] 63 | 64 | Note: log-probabilities time axis is synchronized with out 65 | In other words, logp are probabilities of __current__ output at each tick, not the next one 66 | therefore you can get likelihood as logprobas * tf.one_hot(out,n_tokens) 67 | """ 68 | batch_size = inp.shape[0] 69 | bos = Variable(torch.LongTensor([self.out_voc.bos_ix] * batch_size)) 70 | logits_seq = [torch.log(to_one_hot(bos, len(self.out_voc)) + eps)] 71 | 72 | hid_state = self.encode(inp, **flags) 73 | for x_t in out.transpose(0,1)[:-1]: 74 | hid_state, logits = self.decode(hid_state, x_t, **flags) 75 | logits_seq.append(logits) 76 | 77 | return F.log_softmax(torch.stack(logits_seq, dim=1), dim=-1) 78 | 79 | def translate(self, inp, greedy=False, max_len = None, eps = 1e-30, **flags): 80 | """ 81 | takes symbolic int32 matrix of hebrew words, produces output tokens sampled 82 | from the model and output log-probabilities for all possible tokens at each tick. 83 | :param inp: input sequence, int32 matrix of shape [batch,time] 84 | :param greedy: if greedy, takes token with highest probablity at each tick. 85 | Otherwise samples proportionally to probability. 
86 | :param max_len: max length of output, defaults to 2 * input length 87 | :return: output tokens int32[batch,time] and 88 | log-probabilities of all tokens at each tick, [batch,time,n_tokens] 89 | """ 90 | batch_size = inp.shape[0] 91 | bos = Variable(torch.LongTensor([self.out_voc.bos_ix] * batch_size)) 92 | mask = Variable(torch.ones(batch_size).type(torch.ByteTensor)) 93 | logits_seq = [torch.log(to_one_hot(bos, len(self.out_voc)) + eps)] 94 | out_seq = [bos] 95 | 96 | hid_state = self.encode(inp, **flags) 97 | while True: 98 | hid_state, logits = self.decode(hid_state, out_seq[-1], **flags) 99 | if greedy: 100 | _, y_t = torch.max(logits, dim=-1) 101 | else: 102 | probs = F.softmax(logits, dim=-1) 103 | y_t = torch.multinomial(probs, 1)[:, 0] 104 | 105 | logits_seq.append(logits) 106 | out_seq.append(y_t) 107 | mask &= y_t != self.out_voc.eos_ix 108 | 109 | if not mask.any(): break 110 | if max_len and len(out_seq) >= max_len: break 111 | 112 | return torch.stack(out_seq, 1), F.log_softmax(torch.stack(logits_seq, 1), dim=-1) 113 | 114 | 115 | 116 | ### Utility functions ### 117 | 118 | def infer_mask(seq, eos_ix, batch_first=True, include_eos=True, type=torch.FloatTensor): 119 | """ 120 | compute mask given output indices and eos code 121 | :param seq: matrix of token indices, [batch,time] if batch_first else [time,batch] 122 | :param eos_ix: integer index of end-of-sentence token 123 | :param include_eos: if True, the time-step where eos first occurs has mask = 1 124 | :returns: mask, float32 matrix with '0's and '1's of same shape as seq 125 | """ 126 | assert seq.dim() == 2 127 | is_eos = (seq == eos_ix).type(torch.FloatTensor) 128 | if include_eos: 129 | if batch_first: 130 | is_eos = torch.cat((is_eos[:,:1]*0, is_eos[:, :-1]), dim=1) 131 | else: 132 | is_eos = torch.cat((is_eos[:1,:]*0, is_eos[:-1, :]), dim=0) 133 | count_eos = torch.cumsum(is_eos, dim=1 if batch_first else 0) 134 | mask = count_eos == 0 135 | return mask.type(type) 136 | 137 | def infer_length(seq, eos_ix, batch_first=True, include_eos=True, type=torch.LongTensor): 138 | """ 139 | compute lengths given output indices and eos code 140 | :param seq: matrix of token indices, [batch,time] if batch_first else [time,batch] 141 | :param eos_ix: integer index of end-of-sentence token 142 | :param include_eos: if True, the time-step where eos first occurs counts towards the length 143 | :returns: lengths, int64 vector of shape [batch] 144 | """ 145 | mask = infer_mask(seq, eos_ix, batch_first, include_eos, type) 146 | return torch.sum(mask, dim=1 if batch_first else 0) 147 | 148 | 149 | def to_one_hot(y, n_dims=None): 150 | """ Take integer y (tensor or variable) with n dims and convert it to 1-hot representation with n+1 dims.
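Example: to_one_hot(torch.LongTensor([2, 0]), n_dims=3) -> [[0, 0, 1], [1, 0, 0]]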
""" 151 | y_tensor = y.data if isinstance(y, Variable) else y 152 | y_tensor = y_tensor.type(torch.LongTensor).view(-1, 1) 153 | n_dims = n_dims if n_dims is not None else int(torch.max(y_tensor)) + 1 154 | y_one_hot = torch.zeros(y_tensor.size()[0], n_dims).scatter_(1, y_tensor, 1) 155 | y_one_hot = y_one_hot.view(*y.shape, -1) 156 | return Variable(y_one_hot) if isinstance(y, Variable) else y_one_hot 157 | 158 | -------------------------------------------------------------------------------- /week8_scst/bonus.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Week8 bonus descriptions\n", 8 | "\n", 9 | "Here are some cool mini-projects you can try to dive deeper into the topic.\n", 10 | "\n", 11 | "## More metrics: BLEU (5+ pts)\n", 12 | "\n", 13 | "Pick BLEU or any other relevant metric, e.g. BLEU (e.g. from `nltk.bleu_score`).\n", 14 | "* Train model to maximize BLEU directly\n", 15 | "* How does levenshtein behave when maximizing BLEU and vice versa?\n", 16 | "* Compare this with how they behave when optimizing likelihood. \n", 17 | "\n", 18 | "(use default parameters for bleu: 4-gram, uniform weights)\n", 19 | "\n", 20 | "## Actor-critic (5+++ pts)\n", 21 | "\n", 22 | "While self-critical training provides a large reduction of gradient variance, it has a few drawbacks:\n", 23 | "- It requires a lot of additional computation during training\n", 24 | "- It doesn't adjust V(s) between decoder steps. (one value per sequence)\n", 25 | "\n", 26 | "There's a more general way of doing the same thing: learned baselines, also known as __advantage actor-critic__.\n", 27 | "\n", 28 | "There are two main ways to apply that:\n", 29 | "- __naive way__: compute V(s) once per training example.\n", 30 | " - This only requires additional 1-unit linear dense layer that grows out of encoder, estimating V(s)\n", 31 | " - (implement this to get main points)\n", 32 | "- __every step__: compute V(s) on each decoder step\n", 33 | " - Again it's just an 1-unit dense layer (no nonlinearity), but this time it's inside decoder recurrence.\n", 34 | " - (+3 pts additional for this guy)\n", 35 | "\n", 36 | "In both cases, you should train V(s) to minimize squared error $(V(s) - R(s,a))^2$ with R being actual levenshtein.\n", 37 | "You can then use $ A(s,a) = (R(s,a) - const(V(s))) $ for policy gradient.\n", 38 | "\n", 39 | "There's also one particularly interesting approach (+5 additional pts):\n", 40 | "- __combining SCST and actor-critic__:\n", 41 | " - compute baseline $V(s)$ via self-critical sequence training (just like in main assignment)\n", 42 | " - learn correction $ C(s,a_{:t}) = R(s,a) - V(s) $ by minimizing $(R(s,a) - V(s) - C(s,a_{:t}))^2 $\n", 43 | " - use $ A(s,a_{:t}) = R(s,a) - V(s) - const(C(s,a_{:t})) $\n", 44 | "\n", 45 | "\n", 46 | "\n", 47 | "## Implement attention (5+++ pts)\n", 48 | "\n", 49 | "Some seq2seq tasks can benefit from the attention mechanism. In addition to taking the _last_ time-step of encoder hidden state, we can allow decoder to peek on any time-step of his choice.\n", 50 | "\n", 51 | "![img](https://s30.postimg.org/f8um3kt5d/google_seq2seq_attention.gif)\n", 52 | "\n", 53 | "\n", 54 | "#### Recommended steps:\n", 55 | "__1)__ Modify encoder-decoder\n", 56 | "\n", 57 | "Learn to feed the entire encoder into the decoder. 
You can do so by sending encoder rnn layer directly into decoder (make sure there's no `only_return_final=True` for encoder rnn layer).\n", 58 | "\n", 59 | "```\n", 60 | "class decoder:\n", 61 | " ...\n", 62 | " encoder_rnn_input = InputLayer(encoder.rnn.output_shape, name='encoder rnn input for decoder')\n", 63 | " ...\n", 64 | " \n", 65 | "#decoder Recurrence\n", 66 | "rec = Recurrence(...,\n", 67 | " input_nonsequences = {decoder.encoder_rnn_input: encoder.rnn},\n", 68 | " )\n", 69 | "\n", 70 | "```\n", 71 | "\n", 72 | "For starters, you can take it's last tick (via SliceLayer) inside the decoder step and feed it as input to make sure it works.\n", 73 | "\n", 74 | "__2)__ Implement attention mechanism\n", 75 | "\n", 76 | "Next thing we'll need is to implement the math of attention.\n", 77 | "\n", 78 | "The simplest way to do so is to write a special layer. We gave you a prototype and some tests below.\n", 79 | "\n", 80 | "__3)__ Use attention inside decoder\n", 81 | "\n", 82 | "That's almost it! Now use `AttentionLayer` inside the decoder and feed it to back to lstm/gru/rnn (see code demo below).\n", 83 | "\n", 84 | "Train the full network just like you did before attention.\n", 85 | "\n", 86 | "__More points__ will be awwarded for comparing learning results of attention Vs no attention.\n", 87 | "\n", 88 | "__Bonus bonus:__ visualize attention vectors (>= +3 points)\n", 89 | "\n", 90 | "The best way to make sure your attention actually works is to visualize it.\n", 91 | "\n", 92 | "A simple way to do so is to obtain attention vectors from each tick (values __right after softmax__, not the layer outputs) and drawing those as images.\n", 93 | "\n", 94 | "#### step-by-step guide:\n", 95 | "- split AttentionLayer into two layers: _\"from start to softmax\"_ and _\"from softmax to output\"_\n", 96 | "- add outputs of the first layer to recurrence's `tracked_outputs`\n", 97 | "- compile a function that computes them\n", 98 | "- plt.imshow(them)\n", 99 | "\n", 100 | "\n" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "collapsed": true 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "import numpy as np\n", 112 | "import theano,lasagne\n", 113 | "import theano.tensor as T\n", 114 | "from lasagne import init\n", 115 | "from lasagne.layers import *" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "class AttentionLayer(MergeLayer):\n", 125 | " def __init__(self,decoder_h,encoder_rnn):\n", 126 | " #sanity checks\n", 127 | " assert len(decoder_h.output_shape)==2,\"please feed decoder 1 step activation as first param \"\n", 128 | " assert len(encoder_rnn.output_shape)==3, \"please feed full encoder rnn sequence as second param\"\n", 129 | " \n", 130 | " self.decoder_num_units = decoder_h.output_shape[-1]\n", 131 | " self.encoder_num_units = encoder.output_shape[-1]\n", 132 | "\n", 133 | " #Here you should initialize all trainable parameters.\n", 134 | " #\n", 135 | " \n", 136 | " #use this syntax:\n", 137 | " self.add_param(spec=init.Normal(std=0.01), #or other initializer\n", 138 | " shape=,\n", 139 | " name='')\n", 140 | " \n", 141 | " \n", 142 | " MergeLayer.__init__(self,[decoder_h,encoder_rnn],name=\"attention\")\n", 143 | " \n", 144 | " \n", 145 | " def get_output_shape_for(self,input_shapes,**kwargs):\n", 146 | " \"\"\"return matrix of shape [batch_size, encoder num units]\"\"\"\n", 147 | " return 
(None,self.encoder_num_units)\n", 148 | " \n", 149 | " def get_output_for(self,inputs,**kwargs):\n", 150 | " \"\"\"\n", 151 | " takes (decoder_h, encoder_seq)\n", 152 | " decoder_h has shape [batch_size, decoder num_units]\n", 153 | " encoder_seq has shape [batch_size, sequence_length, encoder num_units]\n", 154 | " \n", 155 | " returns attention output: matrix of shape [batch_size, encoder num units]\n", 156 | " \n", 157 | " please read comments carefully before you start implementing\n", 158 | " \"\"\"\n", 159 | " decoder_h,encoder_seq = inputs\n", 160 | " \n", 161 | " #get symbolic batch-size / seq length. Also don't forget self.decoder_num_units above\n", 162 | " batch_size,seq_length,_ = tuple(encoder_seq.shape)\n", 163 | " \n", 164 | " #here's a recommended step-by-step guide for attention mechanism. \n", 165 | " #You are free to ignore it alltogether if you so wish\n", 166 | " \n", 167 | " #we repeat decoder activations to allign with encoder\n", 168 | " decoder_h_repeated = \n", 170 | " \n", 171 | " # ^--shape=[batch,seq_length,decoder_n_units]\n", 172 | " \n", 173 | " encoder_and_decoder_together = \n", 174 | " # ^--shape=[batch,seq_length,enc_n_units+dec_n_units]\n", 175 | " \n", 176 | " #here we flatten the tensor to simplify\n", 177 | " encoder_and_decoder_flat = T.reshape(encoder_and_decoder_together,(-1,encoder_and_decoder_together.shape[-1]))\n", 178 | " # ^--shape=[batch*seq_length,enc_n_units+dec_n_units]\n", 179 | " \n", 180 | " #here you use encoder_and_decoder_flat and some learned weights to predict attention logits\n", 181 | " #don't use softmax yet\n", 182 | " \n", 183 | " attention_logits_flat = \n", 184 | " # ^--shape=[batch*seq_length,1]\n", 185 | " \n", 186 | " \n", 187 | " #here we reshape flat logits back into correct form\n", 188 | " assert attention_logits_flat.ndim==2\n", 189 | " attention_logits = attention_logits_flat.reshape((batch_size,seq_length))\n", 190 | " # ^--shape=[batch,seq_length]\n", 191 | " \n", 192 | " #here we apply softmax :)\n", 193 | " attention = T.nnet.softmax(attention_logits)\n", 194 | " # ^--shape=[batch,seq_length]\n", 195 | " \n", 196 | " #here we compute output\n", 197 | " output = (attention[:,:,None]*encoder_seq).sum(axis=1) #sum over seq_length\n", 198 | " # ^--shape=[batch,enc_n_units]\n", 199 | " \n", 200 | " return output\n" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "#demo code\n", 210 | "\n", 211 | "from numpy.random import randn\n", 212 | "\n", 213 | "dec_h_prev = InputLayer((None,50),T.constant(randn(5,50)),name='decoder h mock')\n", 214 | "\n", 215 | "enc = InputLayer((None,None,32),T.constant(randn(5,20,32)),name='encoder sequence mock')\n", 216 | "\n", 217 | "attention = AttentionLayer(dec_h_prev,enc)\n", 218 | "\n", 219 | "#now you can use attention as additonal input to your decoder\n", 220 | "#LSTMCell(prev_cell,prev_out,input_or_inputs=(usual_input,attention))\n", 221 | "\n", 222 | "\n", 223 | "#sanity check\n", 224 | "demo_output = get_output(attention).eval()\n", 225 | "print 'actual shape:',demo_output.shape\n", 226 | "assert demo_output.shape == (5,32)\n", 227 | "assert np.isfinite(demo_output)\n", 228 | "\n" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": { 235 | "collapsed": true 236 | }, 237 | "outputs": [], 238 | "source": [] 239 | } 240 | ], 241 | "metadata": { 242 | "kernelspec": { 243 | "display_name": "Python 2", 244 | "language": "python", 245 
| "name": "python2" 246 | }, 247 | "language_info": { 248 | "codemirror_mode": { 249 | "name": "ipython", 250 | "version": 2 251 | }, 252 | "file_extension": ".py", 253 | "mimetype": "text/x-python", 254 | "name": "python", 255 | "nbconvert_exporter": "python", 256 | "pygments_lexer": "ipython2", 257 | "version": "2.7.13" 258 | } 259 | }, 260 | "nbformat": 4, 261 | "nbformat_minor": 2 262 | } 263 | -------------------------------------------------------------------------------- /week8_scst/voc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Vocab: 4 | def __init__(self, tokens, bos="__BOS__", eos="__EOS__", sep=''): 5 | """ 6 | A special class that handles tokenizing and detokenizing 7 | """ 8 | assert bos in tokens, eos in tokens 9 | self.tokens = tokens 10 | self.token_to_ix = {t:i for i,t in enumerate(tokens)} 11 | 12 | self.bos = bos 13 | self.bos_ix = self.token_to_ix[bos] 14 | self.eos = eos 15 | self.eos_ix = self.token_to_ix[eos] 16 | self.sep = sep 17 | 18 | def __len__(self): 19 | return len(self.tokens) 20 | 21 | @staticmethod 22 | def from_lines(lines, bos="__BOS__", eos="__EOS__", sep=''): 23 | flat_lines = sep.join(list(lines)) 24 | flat_lines = list(flat_lines.split(sep)) if sep != '' else list(flat_lines) 25 | tokens = list(set(sep.join(flat_lines))) 26 | tokens = [t for t in tokens if t not in (bos,eos) and len(t) != 0] 27 | tokens = [bos,eos] + tokens 28 | return Vocab(tokens,bos,eos,sep) 29 | 30 | def tokenize(self,string): 31 | """converts string to a list of tokens""" 32 | tokens = list(filter(len,string.split(self.sep))) \ 33 | if self.sep != '' else list(string) 34 | return [self.bos] + tokens + [self.eos] 35 | 36 | def to_matrix(self, lines, max_len=None): 37 | """ 38 | convert variable length token sequences into fixed size matrix 39 | example usage: 40 | >>>print( as_matrix(words[:3],source_to_ix)) 41 | [[15 22 21 28 27 13 -1 -1 -1 -1 -1] 42 | [30 21 15 15 21 14 28 27 13 -1 -1] 43 | [25 37 31 34 21 20 37 21 28 19 13]] 44 | """ 45 | max_len = max_len or max(map(len, lines)) + 2 # 2 for bos and eos 46 | 47 | matrix = np.zeros((len(lines), max_len), dtype='int32') + self.eos_ix 48 | for i, seq in enumerate(lines): 49 | tokens = self.tokenize(seq) 50 | row_ix = list(map(self.token_to_ix.get, tokens))[:max_len] 51 | matrix[i, :len(row_ix)] = row_ix 52 | 53 | return matrix 54 | 55 | def to_lines(self, matrix, crop=True): 56 | """ 57 | Convert matrix of token ids into strings 58 | :param matrix: matrix of tokens of int32, shape=[batch,time] 59 | :param crop: if True, crops BOS and EOS from line 60 | :return: 61 | """ 62 | lines = [] 63 | for line_ix in map(list,matrix): 64 | if crop: 65 | if line_ix[0] == self.bos_ix: 66 | line_ix = line_ix[1:] 67 | if self.eos_ix in line_ix: 68 | line_ix = line_ix[:line_ix.index(self.eos_ix)] 69 | line = self.sep.join(self.tokens[i] for i in line_ix) 70 | lines.append(line) 71 | return lines 72 | -------------------------------------------------------------------------------- /week9_policy_II/README.md: -------------------------------------------------------------------------------- 1 | * [__slides #1 (trpo)__](https://docs.google.com/presentation/d/15Z_AVBsO9VuOSZ5uY-Q4by3tHKiRSENchhAKHhCxIOc/present?token=AC4w5VgM6o7lCOmwtNFI3lfzyPv2PHOpRQ%3A1511795215658&includes_info_params=1#slide=id.g1d8d5bc58c_0_4) 2 | * [__slides #2 (dpg)__](https://yadi.sk/i/uV6IA-C23UTn7c) 3 | 4 | ## Materials 5 | This section covers some steroids for policy gradient methods, along with 
a cool general trick called trust regions. 6 | 7 | * Lecture on NPG and TRPO by J. Schulman - [video](https://www.youtube.com/watch?v=_t5fpZuuf-4) 8 | * Alternative lecture on TRPO and open problems by... J. Schulman - [video](https://www.youtube.com/watch?v=gb5Q2XL5c8A) 9 | * Our videos: [lecture](https://yadi.sk/i/OP0B1BEj3UcmW9), [seminar(pytorch)](https://yadi.sk/i/D8mHrKM63UcmWh) [seminar(theano)](https://yadi.sk/i/b0ol2gUV3HiKKJ) (russian) 10 | * Original articles - [TRPO](https://arxiv.org/abs/1502.05477), [NPG](https://papers.nips.cc/paper/2073-a-natural-policy-gradient.pdf) 11 | 12 | ## Practice 13 | Go to `seminar_TRPO_<framework>.ipynb` for the framework of your choice and follow the instructions in the notebook. 14 | 15 | 16 | ## More: Reinforcement learning in large/continuous action spaces 17 | While you already know algorithms that can work with continuous action spaces, it can't hurt to learn something more specialized. 18 | * Lecture by J. Schulman - [video](https://www.youtube.com/watch?v=jmMsNQ2eug4) 19 | * Q-learning with normalized advantage functions - [article](https://arxiv.org/abs/1603.00748), [code1](https://github.com/carpedm20/NAF-tensorflow), [code2](http://bit.ly/2qx2087) 20 | * Deterministic policy gradient - [article](https://arxiv.org/pdf/1512.07679.pdf), [post+code](https://yanpanlau.github.io/2016/10/11/Torcs-Keras.html) 21 | * Stochastic value gradient - [article](https://arxiv.org/abs/1510.09142) 22 | * Embedding large discrete action spaces for RL - [article](https://arxiv.org/pdf/1512.07679.pdf) 23 | * Lecture by A. Seleznev, 5vision (russian) - [video](https://www.youtube.com/watch?v=j1L2FnanXPo&t=119m45s) 24 | 25 | -------------------------------------------------------------------------------- /xvfb: -------------------------------------------------------------------------------- 1 | #taken from https://gist.github.com/jterrace/2911875 2 | XVFB=/usr/bin/Xvfb 3 | XVFBARGS=":1 -screen 0 1024x768x24 -ac +extension GLX +render -noreset" 4 | PIDFILE=./xvfb.pid 5 | case "$1" in 6 | start) 7 | echo -n "Starting virtual X frame buffer: Xvfb" 8 | start-stop-daemon --start --quiet --pidfile $PIDFILE --make-pidfile --background --exec $XVFB -- $XVFBARGS 9 | echo "." 10 | ;; 11 | stop) 12 | echo -n "Stopping virtual X frame buffer: Xvfb" 13 | start-stop-daemon --stop --quiet --pidfile $PIDFILE 14 | echo "."
15 | ;; 16 | restart) 17 | $0 stop 18 | $0 start 19 | ;; 20 | *) 21 | echo "Usage: /etc/init.d/xvfb {start|stop|restart}" 22 | exit 1 23 | esac 24 | 25 | exit 0 26 | -------------------------------------------------------------------------------- /yet_another_week/README.md: -------------------------------------------------------------------------------- 1 | __Our slides:__ __[inverse/imitation rl](https://yadi.sk/i/ngB_BcNx3UggK6);__ __[multi-agent 101](https://yadi.sk/i/XrFgmdCy3Vtd4k);__ __[multi-agent 102](https://docs.google.com/presentation/d/1AiSZnWGHWU_34QZ0fqdCGIXWUX_bC-0zz3kc3VVcPCA/edit?usp=sharing);__ __[hierarchical rl](https://yadi.sk/i/LkNiKxMz3Vtcr3)__ 2 | 3 | 4 | This week contains several sections covering advanced topics in RL, along with less advanced topics that we couldn't squeeze into the main track. 5 | 6 | ## Other 7 | * Learning by imitation - [video](https://www.youtube.com/watch?v=kl_G95uKTHw), [assignment](http://rll.berkeley.edu/deeprlcourse/docs/hw1.pdf) (berkeley cs294) 8 | * Inverse reinforcement learning 9 | * Lecture by Chelsea Finn - [video](https://www.youtube.com/watch?v=d9DlQSJQAoI) 10 | * Udacity videos - [video](https://www.youtube.com/watch?v=h7uGyBcIeII) 11 | * Distributional RL - [video](https://www.youtube.com/watch?v=bsuvM1jO-4w) 12 | * Knowledge transfer in RL - [video](https://www.youtube.com/watch?v=Hx4XpVdJOI0) (berkeley cs294) 13 | * Hierarchical reinforcement learning 14 | * Cool article (Fe__U__dal networks) - [arxiv](https://arxiv.org/abs/1703.01161) 15 | * Short lecture by Roy Fox - [video](https://www.youtube.com/watch?v=x_QjJry0hTc) 16 | * Multi-Agent reinforcement learning 17 | * Lecture by Thore Graepel - the role of multi-agent learning in AI research - [video](https://www.youtube.com/watch?v=CvL-KV3IBcM) 18 | * Lecture by Balaraman Ravindran - [video](https://www.youtube.com/watch?v=K5MlmO0UJtI) 19 | 20 | ## A list of lists 21 | * [awesome_rl](https://github.com/aikorea/awesome-rl/) - a curated list of resources dedicated to reinforcement learning. 22 | * [junhyukoh's list](https://github.com/junhyukoh/deep-reinforcement-learning-papers) 23 | * [muupan's list](https://github.com/muupan/deep-reinforcement-learning-papers) 24 | * Courses: 25 | * [CS294: deep reinforcement learning](http://rll.berkeley.edu/deeprlcourse/) 26 | * [Silver's RL course](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html) 27 | * [Sutton's book, 2nd edition](http://incompleteideas.net/sutton/book/the-book-2nd.html) 28 | * [Implementations of many basic RL algorithms (raw and/or tensorflow)](https://github.com/dennybritz/reinforcement-learning) 29 | * Reddit: [General ML](https://www.reddit.com/r/MachineLearning/), [RL](https://www.reddit.com/r/reinforcementlearning/), [CS294](https://www.reddit.com/r/berkeleydeeprlcourse/) 30 | * [This great link you could have contributed] 31 | 32 | -------------------------------------------------------------------------------- /yet_another_week/_resource/README.md: -------------------------------------------------------------------------------- 1 | This is a utility folder to store images and other resources used in notebooks.
2 | -------------------------------------------------------------------------------- /yet_another_week/_resource/a3c_scheme.odp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/a3c_scheme.odp -------------------------------------------------------------------------------- /yet_another_week/_resource/conv_salary_architecture.odp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/conv_salary_architecture.odp -------------------------------------------------------------------------------- /yet_another_week/_resource/conv_salary_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/conv_salary_architecture.png -------------------------------------------------------------------------------- /yet_another_week/_resource/do_something_scst.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/do_something_scst.png -------------------------------------------------------------------------------- /yet_another_week/_resource/dqn_arch.odp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/dqn_arch.odp -------------------------------------------------------------------------------- /yet_another_week/_resource/dqn_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/dqn_arch.png -------------------------------------------------------------------------------- /yet_another_week/_resource/env_pool.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/env_pool.png -------------------------------------------------------------------------------- /yet_another_week/_resource/exp_replay.odp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/exp_replay.odp -------------------------------------------------------------------------------- /yet_another_week/_resource/exp_replay.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/exp_replay.png -------------------------------------------------------------------------------- /yet_another_week/_resource/nerd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/nerd.png 
-------------------------------------------------------------------------------- /yet_another_week/_resource/nnet_arch.odp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/nnet_arch.odp -------------------------------------------------------------------------------- /yet_another_week/_resource/nnet_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/nnet_arch.png -------------------------------------------------------------------------------- /yet_another_week/_resource/pomdp_arch.odp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/pomdp_arch.odp -------------------------------------------------------------------------------- /yet_another_week/_resource/pomdp_arch.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/pomdp_arch.pdf -------------------------------------------------------------------------------- /yet_another_week/_resource/pomdp_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/pomdp_arch.png -------------------------------------------------------------------------------- /yet_another_week/_resource/pomdp_img1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/pomdp_img1.jpg -------------------------------------------------------------------------------- /yet_another_week/_resource/pomdp_img2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/pomdp_img2.jpg -------------------------------------------------------------------------------- /yet_another_week/_resource/pomdp_img3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/pomdp_img3.jpg -------------------------------------------------------------------------------- /yet_another_week/_resource/qlearning_scheme.odp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/qlearning_scheme.odp -------------------------------------------------------------------------------- /yet_another_week/_resource/qlearning_scheme.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/qlearning_scheme.png -------------------------------------------------------------------------------- 
/yet_another_week/_resource/rollout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/rollout.png -------------------------------------------------------------------------------- /yet_another_week/_resource/scheme.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/scheme.png -------------------------------------------------------------------------------- /yet_another_week/_resource/target_net.odp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/target_net.odp -------------------------------------------------------------------------------- /yet_another_week/_resource/target_net.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/target_net.png -------------------------------------------------------------------------------- /yet_another_week/_resource/training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/training.png -------------------------------------------------------------------------------- /youtube_dl_lectures.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #this script utilizes https://github.com/rg3/youtube-dl/ to download __ENGLISH__ lecture materials in the respective folders. 3 | #you can install youtube-dl via `pip install --upgrade youtube-dl` if you don't have it already. 4 | #WARNING! the full script downloads gigabytes of mp4! 
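#Optional tip: to save bandwidth you can cap the resolution with youtube-dl's format selector,
#for example (illustrative, reusing the week0 lecture URL from below):
# youtube-dl -f 'best[height<=480]' https://www.youtube.com/watch?v=2pWv7GOvuf0 --output week0_intro/Lecture1_Silver.mp4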
5 | 6 | #week0 7 | youtube-dl https://www.youtube.com/watch?v=2pWv7GOvuf0 --output week0_intro/Lecture1_Silver.mp4 8 | youtube-dl https://www.youtube.com/watch?v=lfHX2hHRMVQ --output week0_intro/Lecture2_Silver_optional.mp4 9 | 10 | #week1 11 | youtube-dl https://www.youtube.com/watch?v=aUrX-rP_ss4 --output week1_blackbox/Lecture_Schulman.mp4 12 | 13 | #week2 14 | youtube-dl https://www.youtube.com/watch?v=Nd1-UUMVfz4 --output week2_value_based/Lecture_Silver.mp4 15 | youtube-dl https://www.youtube.com/watch?v=i0o-ui1N35U --output week2_value_based/Alternative_lecture_Abbeel_part1.mp4 16 | youtube-dl https://www.youtube.com/watch?v=Csiiv6WGzKM --output week2_value_based/Alternative_lecture_Abbeel_part2.mp4 17 | youtube-dl https://www.youtube.com/watch?v=IL3gVyJMmhg --output week2_value_based/Alternative_lecture_Schulman.mp4 18 | 19 | #week3 20 | youtube-dl https://www.youtube.com/watch?v=PnHCvfgC_ZA --output week3_model_free/Lecture_Silver_part1.mp4 21 | youtube-dl https://www.youtube.com/watch?v=0g4j2k_Ggc4 --output week3_model_free/Lecture_Silver_part2.mp4 22 | youtube-dl https://www.youtube.com/watch?v=ifma8G7LegE --output week3_model_free/Alternative_lecture_Abbeel.mp4 23 | youtube-dl https://www.youtube.com/watch?v=IL3gVyJMmhg --output week3_model_free/Alternative_lecture_Schulmann.mp4 24 | 25 | #week3.5 26 | youtube-dl https://www.youtube.com/watch?v=uXt8qF2Zzfo --output week4_\[recap\]_deep_learning/Lecture_basics.mp4 27 | youtube-dl https://www.youtube.com/watch?v=FmpDIaiMIeA --output week4_\[recap\]_deep_learning/Lecture_convnets.mp4 28 | youtube-dl https://www.youtube.com/watch?v=OU8I1oJ9HhI --output week4_\[recap\]_deep_learning/Tutorial_theano.mp4 29 | 30 | #week4 31 | youtube-dl https://www.youtube.com/watch?v=UoPei5o4fps --output week4_approx_rl/Lecture_Silver.mp4 32 | youtube-dl https://www.youtube.com/watch?v=h1-pj4Y9-kM --output week4_approx_rl/Lecture_Schulman.mp4 33 | 34 | #week5 35 | youtube-dl https://www.youtube.com/watch?v=sGuiWX07sKw --output week5_explore/Lecture_Silver.mp4 36 | youtube-dl https://www.youtube.com/watch?v=SfCa1HQMkuw --output week5_explore/Lecture_Schulmann.mp4 37 | 38 | #week6 39 | youtube-dl https://www.youtube.com/watch?v=KHZVXao4qXs --output week6_policy_based/Lecture_Silver.mp4 40 | youtube-dl https://www.youtube.com/watch?v=BB-BhTn6DCM --output week6_policy_based/Alternative_lecture_Schulman_part1.mp4 41 | youtube-dl https://www.youtube.com/watch?v=Wnl-Qh2UHGg --output week6_policy_based/Alternative_lecture_Schulman_part2.mp4 42 | 43 | #week6.5 44 | youtube-dl https://www.youtube.com/watch?v=iX5V1WpxxkY --output week7_\[recap\]_rnn/Lecture_cs231.mp4 45 | youtube-dl https://www.youtube.com/watch?v=Ukgii7Yd_cU --output week7_\[recap\]_rnn/Alternative_lecture_nervana.mp4 46 | youtube-dl https://www.youtube.com/watch?v=xK-bzjIQkmM --output week7_\[recap\]_rnn/Alternative_lecture_Bengio.mp4 47 | youtube-dl https://www.youtube.com/watch?v=G5RY_SUJih4 --output week7_\[recap\]_rnn/Bonus_lecture_seq2seq.mp4 48 | 49 | #week7 50 | youtube-dl https://www.youtube.com/watch?v=yCqPMD6coO8 --output week7_pomdp/Lecture_Ng.mp4 51 | 52 | #week8 53 | #TODO 54 | 55 | #week9 56 | youtube-dl https://www.youtube.com/watch?v=_t5fpZuuf-4 --output week9_policy_II/Lecture_Schulmann.mp4 57 | 58 | --------------------------------------------------------------------------------