├── .gitignore ├── Amazon GPU howto.md ├── Dockerfile ├── LICENSE.md ├── README.md ├── docker ├── Dockerfile ├── README.md └── run_jupyter.sh ├── week1_intro ├── README.md ├── crossentropy_method.ipynb ├── pong.py ├── primer_python_for_ml │ ├── recap_ml.ipynb │ └── train.csv ├── project_starter_evolution_strategies.ipynb └── seminar_gym_interface.ipynb ├── week2_value_based ├── README.md ├── mdp.py ├── seminar1_VI.ipynb └── seminar2_MCTS.ipynb ├── week3_model_free ├── README.md ├── homework │ ├── expected_value_sarsa.py │ ├── expected_value_sarsa_epsilon_annealing.py │ ├── homework.ipynb │ ├── q_learning_epsilon_annealing.py │ ├── qlearning.py │ └── sarsa.py ├── seminar_alternative │ ├── qlearning.py │ └── seminar.ipynb └── seminar_main │ ├── analysis.py │ ├── crawler.py │ ├── environment.py │ ├── featureExtractors.py │ ├── game.py │ ├── ghostAgents.py │ ├── graphicsCrawlerDisplay.py │ ├── graphicsDisplay.py │ ├── graphicsGridworldDisplay.py │ ├── graphicsUtils.py │ ├── gridworld.py │ ├── how2run │ ├── keyboardAgents.py │ ├── layout.py │ ├── layouts │ ├── capsuleClassic.lay │ ├── contestClassic.lay │ ├── mediumClassic.lay │ ├── mediumGrid.lay │ ├── minimaxClassic.lay │ ├── openClassic.lay │ ├── originalClassic.lay │ ├── smallClassic.lay │ ├── smallGrid.lay │ ├── testClassic.lay │ ├── trappedClassic.lay │ └── trickyClassic.lay │ ├── learningAgents.py │ ├── mdp.py │ ├── pacman.py │ ├── pacmanAgents.py │ ├── qlearningAgents.py │ ├── run_crawler.sh │ ├── run_grid.sh │ ├── run_pacman.sh │ ├── textDisplay.py │ ├── textGridworldDisplay.py │ └── util.py ├── week4_[recap]_deep_learning ├── README.md ├── fix_my_nn.ipynb ├── mnist.py ├── notmnist.py ├── practice_lasagne.ipynb ├── practice_tensorflow.ipynb └── seminar_pytorch.ipynb ├── week4_approx_rl ├── README.md ├── framebuffer.py ├── homework_lasagne.ipynb ├── homework_pytorch.ipynb ├── homework_tf.ipynb ├── replay_buffer.py ├── seminar_lasagne.ipynb ├── seminar_pytorch.ipynb └── seminar_tf.ipynb ├── week5_explore ├── README.md ├── action_rewards.npy ├── all_states.npy ├── bayes.py ├── bnn.png ├── river_swim.png └── week5.ipynb ├── week6_policy_based ├── README.md ├── atari_util.py ├── homework_lasagne.ipynb ├── homework_tensorflow.ipynb ├── reinforce_lasagne.ipynb ├── reinforce_pytorch.ipynb └── reinforce_tensorflow.ipynb ├── week7_[recap]_rnn ├── README.md ├── mtg_card_names.txt ├── names ├── rnn.png ├── seminar_lasagne.ipynb ├── seminar_lasagne_ingraph.ipynb ├── seminar_pytorch.ipynb └── seminar_tf.ipynb ├── week7_pomdp ├── README.md ├── atari_util.py ├── env_pool.py ├── homework_common_part2.ipynb ├── img1.jpg ├── img2.jpg ├── img3.jpg ├── practice_pytorch.ipynb ├── practice_tensorflow.ipynb ├── practice_theano.ipynb └── theano_optional_recurrence_tutorial.ipynb ├── week8_scst ├── README.md ├── basic_model_tf.py ├── basic_model_theano.py ├── basic_model_torch.py ├── bonus.ipynb ├── he-pron-wiktionary.txt ├── main_dataset.txt ├── practice_tf.ipynb ├── practice_theano.ipynb ├── practice_torch.ipynb ├── scheme.svg └── voc.py ├── week9_policy_II ├── README.md ├── seminar_TRPO_pytorch.ipynb ├── seminar_TRPO_tensorflow.ipynb └── seminar_TRPO_theano.ipynb ├── xvfb ├── yet_another_week ├── README.md └── _resource │ ├── README.md │ ├── a3c_scheme.odp │ ├── conv_salary_architecture.odp │ ├── conv_salary_architecture.png │ ├── do_something_scst.png │ ├── dqn_arch.odp │ ├── dqn_arch.png │ ├── env_pool.png │ ├── exp_replay.odp │ ├── exp_replay.png │ ├── nerd.png │ ├── nnet_arch.odp │ ├── nnet_arch.png │ ├── pomdp_arch.odp │ ├── pomdp_arch.pdf │ ├── 
pomdp_arch.png │ ├── pomdp_img1.jpg │ ├── pomdp_img2.jpg │ ├── pomdp_img3.jpg │ ├── qlearning_scheme.odp │ ├── qlearning_scheme.pgm │ ├── qlearning_scheme.png │ ├── rollout.png │ ├── scheme.png │ ├── target_net.odp │ ├── target_net.png │ └── training.png └── youtube_dl_lectures.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # node and NPM 2 | npm-debug.log 3 | node_modules 4 | ..bfg-report 5 | 6 | # swap files 7 | *~ 8 | *.swp 9 | 10 | 11 | 12 | env.sh 13 | # Byte-compiled / optimized / DLL files 14 | __pycache__/ 15 | *.py[cod] 16 | 17 | # C extensions 18 | *.so 19 | 20 | # Distribution / packaging 21 | .Python 22 | env/ 23 | bin/ 24 | build/ 25 | develop-eggs/ 26 | dist/ 27 | eggs/ 28 | lib/ 29 | lib64/ 30 | parts/ 31 | sdist/ 32 | var/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg/ 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | 49 | # Translations 50 | *.mo 51 | 52 | # Mr Developer 53 | .mr.developer.cfg 54 | .project 55 | .pydevproject 56 | .idea 57 | .ipynb_checkpoints 58 | 59 | # Rope 60 | .ropeproject 61 | 62 | # Django stuff: 63 | *.log 64 | *.pot 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | docs/tmp* 69 | 70 | # OS X garbage 71 | .DS_Store 72 | 73 | # Debian things 74 | debian/reproducible-experiment-platform 75 | debian/files 76 | *.substvars 77 | *.debhelper.log 78 | -------------------------------------------------------------------------------- /Amazon GPU howto.md: -------------------------------------------------------------------------------- 1 | # How to set up GPU on EC2 instance 2 | 3 | ## Create EC2 instance 4 | 5 | Use `p2.xlarge` instance type and `ami-e00a8180` AMI image. 
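If you prefer the command line to the web console, the same instance can be launched with the AWS CLI. This is only a sketch: the key pair name and security group id below are placeholders for resources you have to create yourself first.

```bash
# launch a p2.xlarge from the AMI mentioned above
aws ec2 run-instances \
    --image-id ami-e00a8180 \
    --instance-type p2.xlarge \
    --key-name <your-key-pair> \
    --security-group-ids <your-security-group-id>
```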
[Details](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html) 6 | 7 | Open ports `22` (ssh) and `80` (http) on your freshly created instance, 8 | you create a [security group](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-network-security.html) 9 | and attach it your instance to get ports open 10 | 11 | ## Launch notebook 12 | 13 | Instance you have created contains all you need: fresh versions of theano, lasagne, CUDA driver and cuDNN, 14 | just lunch ipython and get hands dirty: 15 | 16 | ```bash 17 | $ sudo su 18 | $ export THEANO_FLAGS='cuda.root=/usr/local/cuda,device=gpu,floatX=float32' 19 | $ export PATH=/usr/local/cuda-8.0/bin${PATH:+:${PATH}} 20 | $ jupyter notebook 21 | ``` 22 | 23 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM andrewosh/binder-base 2 | MAINTAINER Alexander Panin 3 | USER root 4 | 5 | RUN echo "deb http://archive.ubuntu.com/ubuntu trusty-backports main restricted universe multiverse" >> /etc/apt/sources.list 6 | RUN apt-get -qq update 7 | 8 | RUN apt-get install -y gcc-4.9 g++-4.9 libstdc++6 wget unzip 9 | RUN apt-get install -y libopenblas-dev liblapack-dev libsdl2-dev libboost-all-dev graphviz 10 | RUN apt-get install -y cmake zlib1g-dev libjpeg-dev 11 | RUN apt-get install -y xvfb libav-tools xorg-dev python-opengl python3-opengl 12 | RUN apt-get -y install swig3.0 13 | RUN ln -s /usr/bin/swig3.0 /usr/bin/swig 14 | 15 | 16 | USER main 17 | RUN pip install --upgrade pip==9.0.3 18 | RUN pip install --upgrade --ignore-installed setuptools #fix https://github.com/tensorflow/tensorflow/issues/622 19 | RUN pip install --upgrade sklearn tqdm nltk editdistance joblib graphviz 20 | 21 | # install all gym stuff except mujoco - it fails at "import importlib.util" (no module named util) 22 | RUN pip install --upgrade gym 23 | RUN pip install --upgrade gym[atari] 24 | RUN pip install --upgrade gym[box2d] 25 | 26 | RUN pip install --upgrade http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp27-cp27mu-linux_x86_64.whl 27 | RUN pip install --upgrade torchvision 28 | RUN pip install --upgrade keras 29 | RUN pip install --upgrade https://github.com/Theano/Theano/archive/master.zip 30 | RUN pip install --upgrade https://github.com/Lasagne/Lasagne/archive/master.zip 31 | RUN pip install --upgrade https://github.com/yandexdataschool/AgentNet/archive/master.zip 32 | RUN pip install gym_pull 33 | RUN pip install ppaquette-gym-doom 34 | 35 | 36 | 37 | 38 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade pip==9.0.3 39 | 40 | # fix https://github.com/tensorflow/tensorflow/issues/622 41 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade --ignore-installed setuptools 42 | 43 | # python3: fix `GLIBCXX_3.4.20' not found - conda's libgcc blocked system's gcc-4.9 and libstdc++6 44 | RUN bash -c "conda update -y conda && source activate python3 && conda uninstall -y libgcc && source deactivate" 45 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade matplotlib numpy scipy pandas graphviz 46 | 47 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade sklearn tqdm nltk editdistance joblib 48 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade --ignore-installed setuptools #fix https://github.com/tensorflow/tensorflow/issues/622 49 | 50 | # install all gym stuff except mujoco - it fails at "mjmodel.h: no such file or directory" 51 | RUN 
/home/main/anaconda/envs/python3/bin/pip install --upgrade gym 52 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade gym[atari] 53 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade gym[box2d] 54 | 55 | 56 | 57 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp35-cp35m-linux_x86_64.whl 58 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade torchvision 59 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade keras 60 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade https://github.com/Theano/Theano/archive/master.zip 61 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade https://github.com/Lasagne/Lasagne/archive/master.zip 62 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade https://github.com/yandexdataschool/AgentNet/archive/master.zip 63 | 64 | #install TF after everything else not to break python3's pyglet with python2's tensorflow 65 | RUN pip install --upgrade tensorflow==1.4.0 66 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade tensorflow==1.4.0 67 | #TODO py3 doom once it's no longer broken 68 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Practical_RL 2 | A course on reinforcement learning in the wild. 3 | Taught on-campus at [HSE](https://cs.hse.ru) and [YSDA](https://yandexdataschool.com/) and maintained to be friendly to online students (both english and russian). 4 | 5 | 6 | #### Manifesto: 7 | * __Optimize for the curious.__ For all the materials that aren’t covered in detail there are links to more information and related materials (D.Silver/Sutton/blogs/whatever). Assignments will have bonus sections if you want to dig deeper. 8 | * __Practicality first.__ Everything essential to solving reinforcement learning problems is worth mentioning. We won't shun away from covering tricks and heuristics. 
For every major idea there should be a lab that makes you to “feel” it on a practical problem. 9 | * __Git-course.__ Know a way to make the course better? Noticed a typo in a formula? Found a useful link? Made the code more readable? Made a version for alternative framework? You're awesome! [Pull-request](https://help.github.com/articles/about-pull-requests/) it! 10 | 11 | # Course info 12 | * Lecture slides are [here](https://yadi.sk/d/loPpY45J3EAYfU). 13 | * Telegram chat room for YSDA & HSE students is [here](https://t.me/rlspring18) 14 | * Grading rules for YSDA & HSE students is [here](https://github.com/yandexdataschool/Practical_RL/wiki/Homeworks-and-grading) 15 | * Online student __[survival guide](https://github.com/yandexdataschool/Practical_RL/wiki/Online-student's-survival-guide)__ 16 | * Installing the libraries - [guide and issues thread](https://github.com/yandexdataschool/Practical_RL/issues/1) 17 | * Magical button that launches you into course environment: 18 | * [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/yandexdataschool/Practical_RL/master) - comes will all libraries pre-installed. May be down time to time. 19 | * If it's down, try [__google colab__](https://colab.research.google.com/) or [__azure notebooks__](http://notebooks.azure.com/). Those last longer, but they will require you to run installer commands (see ./Dockerfile). 20 | * Anonymous [feedback form](https://docs.google.com/forms/d/e/1FAIpQLSdurWw97Sm9xCyYwC8g3iB5EibITnoPJW2IkOVQYE_kcXPh6Q/viewform) for everything that didn't go through e-mail. 21 | * [About the course](https://github.com/yandexdataschool/Practical_RL/wiki/Practical-RL) 22 | 23 | # Additional materials 24 | * A large list of RL materials - [awesome rl](https://github.com/aikorea/awesome-rl) 25 | * [RL reading group](https://github.com/yandexdataschool/Practical_RL/wiki/RL-reading-group) 26 | 27 | 28 | # Syllabus 29 | 30 | The syllabus is approximate: the lectures may occur in a slightly different order and some topics may end up taking two weeks. 31 | 32 | * [__week1__](https://github.com/yandexdataschool/Practical_RL/tree/master/week1_intro) RL as blackbox optimization 33 | * Lecture: RL problems around us. Decision processes. Stochastic optimization, Crossentropy method. Parameter space search vs action space search. 34 | * Seminar: Welcome into openai gym. Tabular CEM for Taxi-v0, deep CEM for box2d environments. 35 | * Homework description - see week1/README.md. 36 | * ** YSDA Deadline: 2018.02.26 23.59** 37 | * ** HSE Deadline: 2018.01.28 23:59** 38 | 39 | * [__week2__](https://github.com/yandexdataschool/Practical_RL/tree/master/week2_value_based) Value-based methods 40 | * Lecture: Discounted reward MDP. Value-based approach. Value iteration. Policy iteration. Discounted reward fails. 41 | * Seminar: Value iteration. 42 | * Homework description - see week2/README.md. 43 | * ** HSE Deadline: 2018.02.11 23:59** 44 | * ** YSDA Deadline: part1 2018.03.05 23.59, part2 2018.03.12 23.59** 45 | 46 | 47 | * [__week3__](https://github.com/yandexdataschool/Practical_RL/tree/master/week3_model_free) Model-free reinforcement learning 48 | * Lecture: Q-learning. SARSA. Off-policy Vs on-policy algorithms. N-step algorithms. TD(Lambda). 49 | * Seminar: Qlearning Vs SARSA Vs Expected Value SARSA 50 | * Homework description - see week3/README.md. 
51 | * **HSE Deadline: 2018.02.15 23:59** 52 | * ** YSDA Deadline: 2018.03.12 23.59** 53 | 54 | * [__week4_recap__](https://github.com/yandexdataschool/Practical_RL/tree/master/week4_%5Brecap%5D_deep_learning) - deep learning recap 55 | * Lecture: Deep learning 101 56 | * Seminar: Simple image classification with convnets 57 | 58 | * [__week4__](https://github.com/yandexdataschool/Practical_RL/tree/master/week4_approx_rl) Approximate reinforcement learning 59 | * Lecture: Infinite/continuous state space. Value function approximation. Convergence conditions. Multiple agents trick; experience replay, target networks, double/dueling/bootstrap DQN, etc. 60 | * Seminar: Approximate Q-learning with experience replay. (CartPole, Atari) 61 | * **HSE Deadline: 2018.03.04 23:30** 62 | * ** YSDA Deadline: 2018.03.20 23.30** 63 | 64 | * [__week5__](https://github.com/yandexdataschool/Practical_RL/tree/master/week5_explore) Exploration in reinforcement learning 65 | * Lecture: Contextual bandits. Thompson Sampling, UCB, bayesian UCB. Exploration in model-based RL, MCTS. "Deep" heuristics for exploration. 66 | * Seminar: bayesian exploration for contextual bandits. UCB for MCTS. 67 | 68 | * ** YSDA Deadline: 2018.03.30 23.30** 69 | 70 | * [__week6__](https://github.com/yandexdataschool/Practical_RL/tree/master/week6_policy_based) Policy gradient methods I 71 | * Lecture: Motivation for policy-based, policy gradient, logderivative trick, REINFORCE/crossentropy method, variance reduction(baseline), advantage actor-critic (incl. GAE) 72 | * Seminar: REINFORCE, advantage actor-critic 73 | 74 | * [__week7_recap__](https://github.com/yandexdataschool/Practical_RL/tree/master/week7_%5Brecap%5D_rnn) Recurrent neural networks recap 75 | * Lecture: Problems with sequential data. Recurrent neural netowks. Backprop through time. Vanishing & exploding gradients. LSTM, GRU. Gradient clipping 76 | * Seminar: character-level RNN language model 77 | 78 | * [__week7__](https://github.com/yandexdataschool/Practical_RL/tree/master/week7_pomdp) Partially observable MDPs 79 | * Lecture: POMDP intro. POMDP learning (agents with memory). POMDP planning (POMCP, etc) 80 | * Seminar: Deep kung-fu & doom with recurrent A3C and DRQN 81 | 82 | * [__week8__](https://github.com/yandexdataschool/Practical_RL/tree/master/week8_scst) Applications II 83 | * Lecture: Reinforcement Learning as a general way to optimize non-differentiable loss. G2P, machine translation, conversation models, image captioning, discrete GANs. Self-critical sequence training. 84 | * Seminar: Simple neural machine translation with self-critical sequence training 85 | 86 | * [__week9__](https://github.com/yandexdataschool/Practical_RL/tree/master/week9_policy_II) Policy gradient methods II 87 | * Lecture: Trust region policy optimization. NPO/PPO. Deterministic policy gradient. DDPG. Bonus: DPG for discrete action spaces. 88 | * Seminar: Approximate TRPO for simple robotic tasks. 
89 | 90 | * [Some after-course bonus materials](https://github.com/yandexdataschool/Practical_RL/tree/master/yet_another_week) 91 | 92 | 93 | # Course staff 94 | Course materials and teaching by: _[unordered]_ 95 | - [Pavel Shvechikov](https://github.com/bestxolodec) - lectures, seminars, hw checkups, reading group 96 | - [Oleg Vasilev](https://github.com/Omrigan) - seminars, hw checkups, technical support 97 | - [Alexander Fritsler](https://github.com/Fritz449) - lectures, seminars, hw checkups 98 | - [Nikita Putintsev](https://github.com/qwasser) - seminars, hw checkups, organizing our hot mess 99 | - [Fedor Ratnikov](https://github.com/justheuristic/) - lectures, seminars, hw checkups 100 | - [Alexey Umnov](https://github.com/alexeyum) - seminars, hw checkups 101 | 102 | # Contributions 103 | * Using pictures from [Berkeley AI course](http://ai.berkeley.edu/home.html) 104 | * Massively refering to [CS294](http://rll.berkeley.edu/deeprlcourse/) 105 | * Sevaral tensorflow assignments by [Scitator](https://github.com/Scitator) 106 | * A lot of fixes from [arogozhnikov](https://github.com/arogozhnikov) 107 | * Other awesome people: see github contributors 108 | 109 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | LABEL maintainer "Alexander Panin , Dmitry Mittov " 3 | 4 | 5 | RUN echo "deb http://archive.ubuntu.com/ubuntu trusty-backports main restricted universe multiverse" >> /etc/apt/sources.list && \ 6 | apt-get -qq update && \ 7 | apt-get install -y cmake \ 8 | wget \ 9 | unzip \ 10 | git \ 11 | zlib1g-dev \ 12 | libjpeg-dev \ 13 | xvfb \ 14 | libav-tools \ 15 | xorg-dev \ 16 | python-opengl \ 17 | swig3.0 \ 18 | python-dev \ 19 | python3-dev \ 20 | python-pip \ 21 | python3-pip \ 22 | libopenblas-dev \ 23 | liblapack-dev \ 24 | libsdl2-dev \ 25 | libboost-all-dev \ 26 | graphviz \ 27 | gcc \ 28 | g++ && \ 29 | ln -s /usr/bin/swig3.0 /usr/bin/swig 30 | 31 | RUN pip install --upgrade pip==9.0.3 && \ 32 | pip install --upgrade numpy scipy && \ 33 | pip install --upgrade sklearn \ 34 | jupyter \ 35 | tqdm \ 36 | graphviz \ 37 | gym gym[box2d] gym[atari] \ 38 | matplotlib \ 39 | seaborn && \ 40 | pip install --upgrade https://github.com/Theano/Theano/archive/master.zip \ 41 | https://github.com/Lasagne/Lasagne/archive/master.zip \ 42 | https://github.com/yandexdataschool/AgentNet/archive/master.zip \ 43 | tensorflow \ 44 | http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp27-cp27mu-linux_x86_64.whl \ 45 | torchvision \ 46 | keras 47 | 48 | RUN pip install --upgrade gym_pull ppaquette-gym-doom 49 | 50 | 51 | RUN pip3 install --upgrade pip==9.0.3 && \ 52 | pip3 install --upgrade numpy scipy && \ 53 | pip3 install --upgrade sklearn \ 54 | jupyter \ 55 | tqdm \ 56 | graphviz \ 57 | gym gym[box2d] gym[atari] \ 58 | matplotlib \ 59 | seaborn && \ 60 | pip3 install --upgrade https://github.com/Theano/Theano/archive/master.zip \ 61 | https://github.com/Lasagne/Lasagne/archive/master.zip \ 62 | https://github.com/yandexdataschool/AgentNet/archive/master.zip \ 63 | http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp35-cp35m-linux_x86_64.whl \ 64 | torchvision \ 65 | tensorflow \ 66 | keras && \ 67 | python3 -m ipykernel.kernelspec 68 | 69 | 70 | EXPOSE 8888 71 | VOLUME /notebooks 72 | WORKDIR /notebooks 73 | 74 | COPY run_jupyter.sh / 75 | CMD ["/run_jupyter.sh"] 76 | 
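# Typical usage, as a sketch -- the image tag and the host notebooks path are
# placeholders; see docker/README.md below for the full instructions:
#   docker build -t practical_rl .
#   docker run --rm -it -v /path/to/your/notebooks:/notebooks -p 8888:8888 practical_rl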
-------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | To simplify installation process, you can deploy a container (~virtual machine) with all dependencies pre-installed. 2 | 3 | _tl;dr [dockerhub url](https://hub.docker.com/r/justheuristic/practical_rl/)_ 4 | 5 | ## Install Docker 6 | 7 | We recommend you to use either native docker (recommended for linux) or kitematic(recommended for windows). 8 | * Installing [kitematic](https://kitematic.com/), a simple interface to docker (all platforms) 9 | * Pure docker: Guide for [windows](https://docs.docker.com/docker-for-windows/), [linux](https://docs.docker.com/engine/installation/), or [macOS](https://docs.docker.com/docker-for-mac/). 10 | 11 | Below are the instructions for both approaches. 12 | 13 | ## Kitematic 14 | Find justheuristic/practical_rl in the search menu. Download and launch the container. 15 | 16 | Click on "web preview" screen in the top-right __or__ go to settings, ports and find at which port your jupyter is located, usually 32***. 17 | 18 | ## Native 19 | `docker run -it -v :/notebooks -p :8888 justheuristic/practical_rl sh ../run_jupyter.sh` 20 | 21 | `docker run -it -v /Users/mittov/Documents/shad/semester4/:/notebooks -p 8888:8888 justheuristic/practical_rl sh ../run_jupyter.sh` 22 | 23 | ## Manual 24 | Build container 25 | 26 | `$ docker build -t rl .` 27 | 28 | 29 | Run it 30 | 31 | `$ docker run --rm -it -v :/notebooks -p :8888 dl` 32 | 33 | examples: 34 | 35 | `$ docker run --rm -it -v /Users/mittov/Documents/shad/semester4/:/notebooks -p 8888:8888 dl` 36 | 37 | Copy the token from console and run 38 | http://localhost:8888/?token= 39 | -------------------------------------------------------------------------------- /docker/run_jupyter.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | jupyter notebook --no-browser --allow-root --ip 0.0.0.0 3 | 4 | -------------------------------------------------------------------------------- /week1_intro/README.md: -------------------------------------------------------------------------------- 1 | ## Materials: 2 | * [__Lecture slides__](https://yadi.sk/i/sbc0ZCKx3RRGbW) 3 | * __Russian:__ 4 | * Intro to RL - [video](https://yadi.sk/i/bMo0qa-x3DoqkS) 5 | * Blackbox optiization - [video](https://yadi.sk/i/5yf_4oGI3EDJhJ) 6 | * Seminar - [video](https://yadi.sk/i/dPsWYMK13EDJj7) _only covering crossentropy method_ 7 | 8 | * __English:__ 9 | * [__main__] Video-intro by David Silver (english) - [video](https://www.youtube.com/watch?v=2pWv7GOvuf0) 10 | * [__main__] Lecture by J Schulman with crossentropy method explained (english) - [url](https://www.youtube.com/watch?v=aUrX-rP_ss4&list=PLCTc_C7itk-GaAMxmlChrkPnGKtjz8hv1) 11 | * Optional lecture by David Silver (english) - [video](https://www.youtube.com/watch?v=lfHX2hHRMVQ) 12 | 13 | 14 | ## More materials: 15 | * __[recommended]__ - awesome openai post about evolution strategies - [blog post](https://blog.openai.com/evolution-strategies/), [article](https://arxiv.org/abs/1703.03864) 16 | * Deep learning course (if you want to learn in parallel) - https://github.com/yandexdataschool/HSE_deeplearning 17 | * Video on genetic algorithms (english) - [video](https://www.youtube.com/watch?v=ejxfTy4lI6I) 18 | * Another guide to genetic algorithm (english) - [video](https://www.youtube.com/watch?v=zwYV11a__HQ) 19 | * About Differential evolution 
(english) - [pdf](http://jvanderw.une.edu.au/DE_1.pdf) 20 | * Video on Ant Colony Algorithm (english) - [video](https://www.youtube.com/watch?v=D58nLNLkb0I) 21 | * Longer video on Ant Colony Algorithm (english) - [video](https://www.youtube.com/watch?v=xpyKmjJuqhk) 22 | 23 | 24 | ## Homework description 25 | * Open `gym_interface.ipynb` and follow instructions from there 26 | * If you haven't installed everything yet, try [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/yandexdataschool/Practical_RL/master) 27 | * After you're done there, proceed to `crossentropy_method.ipynb` 28 | * You can find homework and bonus assignment descriptions at the end of that notebook. 29 | * Note: so far it's enough to say `pip install gym` on top of any data-science-stuffed python, but we'd appreciate if you gradually switch to [full installation](https://github.com/openai/gym#installing-everything). 30 | 31 | 32 | -------------------------------------------------------------------------------- /week1_intro/pong.py: -------------------------------------------------------------------------------- 1 | """Auxilary files for those who wanted to solve breakout with CEM or policy gradient""" 2 | import numpy as np 3 | import gym 4 | from scipy.misc import imresize 5 | from gym.core import Wrapper 6 | from gym.spaces.box import Box 7 | 8 | def make_pong(): 9 | """creates breakout env with all preprocessing done for you""" 10 | return PreprocessAtari(gym.make("PongDeterministic-v0")) 11 | 12 | class PreprocessAtari(Wrapper): 13 | def __init__(self,env,height=42,width=42, 14 | crop=lambda img: img[34:34+160],n_frames=4): 15 | """A gym wrapper that reshapes, crops and scales image into the desired shapes""" 16 | super(PreprocessAtari, self).__init__(env) 17 | self.img_size = (height,width) 18 | self.crop=crop 19 | self.observation_space = Box(0.0, 1.0, [n_frames,height,width]) 20 | self.framebuffer = np.zeros([n_frames,height,width]) 21 | def reset(self): 22 | """resets breakout, returns initial frames""" 23 | self.framebuffer = np.zeros_like(self.framebuffer) 24 | self.update_buffer(self.env.reset()) 25 | return self.framebuffer 26 | def step(self,action): 27 | """plays breakout for 1 step, returns 4-frame buffer""" 28 | new_img,r,done,info = self.env.step(action) 29 | self.update_buffer(new_img) 30 | return self.framebuffer,r,done,info 31 | 32 | ###image processing### 33 | 34 | def update_buffer(self,img): 35 | img = self.preproc_image(img) 36 | self.framebuffer = np.vstack([img[None], self.framebuffer[:-1]]) 37 | 38 | def preproc_image(self, img): 39 | """what happens to the observation""" 40 | img = self.crop(img) 41 | img = imresize(img, self.img_size).mean(-1) 42 | img = img.astype('float32')/255. 43 | return img 44 | -------------------------------------------------------------------------------- /week1_intro/project_starter_evolution_strategies.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Project :: Evolution Strategies\n", 8 | "\n", 9 | "![img](https://t4.ftcdn.net/jpg/00/17/46/81/240_F_17468143_wY3hsHyfNYoMdG9BlC56HI4JA7pNu63h.jpg)\n", 10 | "\n", 11 | "Remember the idea behind Evolution Strategies? Here's a neat [blog post](https://blog.openai.com/evolution-strategies/) about 'em.\n", 12 | "\n", 13 | "Can you reproduce their success? 
You will have to implement evolutionary strategies and see how they work.\n", 14 | "\n", 15 | "This project is optional; has several milestones each worth a number of points [and swag].\n", 16 | "\n", 17 | "__Milestones:__\n", 18 | "* [10pts] Basic prototype of evolutionary strategies that works in one thread on CartPole\n", 19 | "* [+5pts] Modify the code to make them work in parallel\n", 20 | "* [+5pts] if you can run ES distributedly on at least two PCs\n", 21 | "* [+10pts] Apply ES to play Atari Pong at least better than random\n", 22 | "* [++] Additional points for all kinds of cool stuff besides milestones\n", 23 | "\n", 24 | "__Rules:__\n", 25 | "\n", 26 | "* This is __not a mandatory assignment__, but it's a way to learn some cool things if you're getting bored with default assignments.\n", 27 | "* Once you decided to take on this project, please tell any of course staff members so that we can help ypu if you get stuck.\n", 28 | "* There's a default implementation of ES in this [openai repo](https://github.com/openai/evolution-strategies-starter). It's okay to look there if you get stuck or want to compare your solutions, but each copy-pasted chunk of code should be understood thoroughly. We'll test that with questions." 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "### Tips on implementation\n", 36 | "\n", 37 | "* It would be very convenient later if you implemented a function that takes policy weights, generates a session and returns policy changes -- so that you could then run a bunch of them in parallel.\n", 38 | "\n", 39 | "* The simplest way you can do multiprocessing is to use [joblib](https://www.google.com/search?client=ubuntu&channel=fs&q=joblib&ie=utf-8&oe=utf-8)\n", 40 | "\n", 41 | "* For joblib, make sure random variables are independent in each job. Simply add `np.random.seed()` at the beginning of your \"job\" function.\n", 42 | "\n", 43 | "Later once you got distributed, you may need a storage that gathers gradients from all workers. In such case we recommend [Redis](https://redis.io/) due to it's simplicity.\n", 44 | "\n", 45 | "Here's a speed-optimized saver/loader to store numpy arrays in Redis as strings.\n", 46 | "\n" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": { 53 | "collapsed": true 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "import joblib\n", 58 | "from six import BytesIO\n", 59 | "def dumps(data):\n", 60 | " \"\"\"converts whatever to string\"\"\"\n", 61 | " s = BytesIO()\n", 62 | " joblib.dump(data,s)\n", 63 | " return s.getvalue()\n", 64 | " \n", 65 | "def loads(self,string):\n", 66 | " \"\"\"converts string to whatever was dumps'ed in it\"\"\"\n", 67 | " return joblib.load(BytesIO(string))\n" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "### Tips on atari games\n", 75 | "* There's all the pre-processing and tuning done for you in the code below\n", 76 | " * Images rescaled to 42x42 to speed up computation\n", 77 | " * We use last 4 frames as observations to account for ball velocity\n", 78 | " * The code below requires ```pip install Image``` and ```pip install gym[atari]``` \n", 79 | " * You may also need some dependencies for gym[atari] - google \"gym install all\" dependencies or use our pre-built environment.\n", 80 | "* The recommended agent architecture is a convolutional neural network. Dense network will also do.\n", 81 | "\n", 82 | "\n", 83 | "May the force be with you!" 
84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": { 90 | "collapsed": false 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "from pong import make_pong\n", 95 | "import numpy as np\n", 96 | "\n", 97 | "env = make_pong()\n", 98 | "print(env.action_space)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": { 105 | "collapsed": false 106 | }, 107 | "outputs": [], 108 | "source": [ 109 | "#get the initial state\n", 110 | "s = env.reset()\n", 111 | "print (s.shape)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": { 118 | "collapsed": false 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "import matplotlib.pyplot as plt\n", 123 | "%matplotlib inline\n", 124 | "#plot first observation. Only one frame\n", 125 | "plt.imshow(s.swapaxes(1,2).reshape(-1,s.shape[-1]).T)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": { 132 | "collapsed": false 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "#next frame\n", 137 | "new_s,r,done, _ = env.step(env.action_space.sample())\n", 138 | "plt.imshow(new_s.swapaxes(1,2).reshape(-1,s.shape[-1]).T)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": { 145 | "collapsed": false 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "#after 10 frames\n", 150 | "for _ in range(10):\n", 151 | " new_s,r,done, _ = env.step(env.action_space.sample())\n", 152 | "\n", 153 | "plt.imshow(new_s.swapaxes(1,2).reshape(-1,s.shape[-1]).T,vmin=0)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": { 160 | "collapsed": true 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "< tons of your code here or elsewhere >" 165 | ] 166 | } 167 | ], 168 | "metadata": { 169 | "kernelspec": { 170 | "display_name": "Python 2", 171 | "language": "python", 172 | "name": "python2" 173 | }, 174 | "language_info": { 175 | "codemirror_mode": { 176 | "name": "ipython", 177 | "version": 2 178 | }, 179 | "file_extension": ".py", 180 | "mimetype": "text/x-python", 181 | "name": "python", 182 | "nbconvert_exporter": "python", 183 | "pygments_lexer": "ipython2", 184 | "version": "2.7.13" 185 | } 186 | }, 187 | "nbformat": 4, 188 | "nbformat_minor": 2 189 | } 190 | -------------------------------------------------------------------------------- /week2_value_based/README.md: -------------------------------------------------------------------------------- 1 | ## Materials 2 | * [__Lecture slides__](https://docs.google.com/presentation/d/1Tnt4w0DDCwgGIo8Dh9-004veILxHekXOlTddmz0O5Tc/edit?usp=sharing) 3 | * Our videos: [lecture](https://yadi.sk/i/PeaLQ3IG3SeZML) [seminar](https://yadi.sk/i/hrnHB9DK3SeZRC) (russian) 4 | * __[main]__ lecture by David Silver - [url](https://www.youtube.com/watch?v=Nd1-UUMVfz4) 5 | * Alternative lecture by Pieter Abbeel (english): [part 1](https://www.youtube.com/watch?v=i0o-ui1N35U), [part 2](https://www.youtube.com/watch?v=Csiiv6WGzKM) 6 | * Alternative lecture by John Schulmann (english): [video](https://www.youtube.com/watch?v=IL3gVyJMmhg) 7 | * Definitive guide in policy/value iteration from Sutton: start from page 81 [here](http://incompleteideas.net/sutton/book/bookdraft2017june19.pdf). 8 | 9 | 10 | ## Materials: planning 11 | * Planning by dynamic programming (D. 
Silver) - [video](https://www.youtube.com/watch?v=Nd1-UUMVfz4) 12 | * Planning via tree search [videos 2-6 from CS188](https://www.youtube.com/channel/UCHBzJsIcRIVuzzHVYabikTQ) 13 | * Our lecture: 14 | * Slides [part1](https://yadi.sk/i/3PM9zCP33J3ub3) (intro), [part2](https://yadi.sk/i/M03xvZ2y3JMQre) (pomdp) 15 | * [Lecture](https://yadi.sk/i/lOAUu7o13JBHFz) & [seminar](https://yadi.sk/i/bkmjEZrk3JBHGF) 16 | * Monte-carlo tree search 17 | * Udacity video on monte-carlo tree search (first part of a chain) - [video](https://www.youtube.com/watch?v=onBYsen2_eA) 18 | * Reminder: UCB-1 - [slides](https://www.cs.bham.ac.uk/internal/courses/robotics/lectures/ucb1.pdf) 19 | * Monte-carlo tree search step-by-step by J.Levine - [video](https://www.youtube.com/watch?v=UXW2yZndl7U) 20 | * Guide to MCTS (monte-carlo tree search) - [post](http://www.cameronius.com/research/mcts/about/index.html) 21 | * Another guide to MCTS - [url](https://jeffbradberry.com/posts/2015/09/intro-to-monte-carlo-tree-search/) 22 | * Integrating learning and planning (D. Silver) - [video](https://www.youtube.com/watch?v=ItMutbeOHtc) 23 | * Approximating the MCTS optimal actions - 5vision solution for deephack.RL, code by Mikhail Pavlov - [repo](https://github.com/5vision/uct_atari) 24 | 25 | 26 | 27 | ## Homework description: 28 | 29 | The main assignment is `seminar1_VI.ipynb` notebook in this week's folder. 30 | 31 | If you're interested in model-based RL at scale, go through __Materials: planning__ section and proceed with `seminar2_MCTS.ipynb` notebook. 32 | -------------------------------------------------------------------------------- /week3_model_free/README.md: -------------------------------------------------------------------------------- 1 | #### __Lecture slides__ - [here](https://yadi.sk/i/54qWKtDB3NDeuh) 2 | ### Materials 3 | * Russian materials: 4 | - Lecture - [video](https://yadi.sk/i/jcQ1Bg8n3SrhuQ) 5 | - Q-learning seminar - [video](https://yadi.sk/i/dQmolwOy3EtGNK) 6 | - Sarsa & stuff - [seminar2](https://yadi.sk/i/XbqNQmjm3ExNsq) 7 | * English materials: 8 | - Lecture by David Silver (english) - [video part I](https://www.youtube.com/watch?v=PnHCvfgC_ZA), [video part II](https://www.youtube.com/watch?v=0g4j2k_Ggc4&t=43s) 9 | - Alternative lecture by Pieter Abbeel (english) - [video](https://www.youtube.com/watch?v=ifma8G7LegE) 10 | - Alternative lecture by John Schulmann (english) - [video](https://www.youtube.com/watch?v=IL3gVyJMmhg) 11 | - Blog post on q-learning Vs SARSA - [url](https://studywolf.wordpress.com/2013/07/01/reinforcement-learning-sarsa-vs-q-learning/) 12 | 13 | ### More materials 14 | * N-step temporal difference from Sutton's book - [suttonbook](http://incompleteideas.net/book/bookdraft2018jan1.pdf) __chapter 7__ 15 | * Eligibility traces from Sutton's book - [suttonbook](http://incompleteideas.net/book/bookdraft2018jan1.pdf) __chapter 12__ 16 | * Blog post on eligibility traces - [url](http://pierrelucbacon.com/traces/) 17 | 18 | ### Assignments 19 | 20 | This week's practice will require you to pick __one of__ `./seminar_main` and `./seminar_alternative` as first part. 21 | 22 | Then `./homework` and follow instructions in `./homework/homework.ipynb` 23 | 24 | Below are some guidelines on what to do in seminar_main/_alternative. 25 | 26 | ### ./seminar_main 27 | _this assignment borrows code from awesome [cs188](http://ai.berkeley.edu/project_overview.html)_ 28 | This homework assignment works on __python2 only__. If you stick to py3, consider seminar_alternative. 
Or just install it for this homework alone and remove afterwards. 29 | 30 | This homework also requires some physical display (e.g. laptop monitor). It won't work on binder VM / headless server. Please run it on laptop or consider ./seminar_alternative 31 | 32 | 33 | * You need to implement **QLearining** algorithm. If you're running go to ```seminar_main/``` folder and open file ```qlearningAgent.py```. 34 | 35 | Once you're done, run use those commands: 36 | ``` 37 | python crawler.py # Crawler with qlearning 38 | python pacman.py -p -x -n -l 39 | python pacman.py -p PacmanQAgent -x 5000 -n 5010 -l smallGrid # example 40 | ``` 41 | * Make sure you can tune agent to beat ./run_crawler.sh 42 | * on windows, just run `python crawler.py` from cmd in the project directory 43 | * other ./run* files are mostly for your amusement. 44 | * ./run_pacman.sh will need more epochs to converge, see [comments](https://github.com/yandexdataschool/Practical_RL/blob/master/week3/seminar_main/run_pacman.sh) 45 | * on windows, just copy the type `python pacman.py -p PacmanQAgent -x 2000 -n 2010 -l smallGrid` in cmd from assignemnt dir 46 | (YSDA/HSE) Please submit only qlearningAgents.py file and include a brief text report as comments in it. 47 | 48 | ### ./seminar_alternative 49 | 50 | You'll have to implement qlearning.py just like in main seminar, but in ./seminar_alternative folder. After you're done with it, open the seminar notebook and follow instructions from there. 51 | 52 | -------------------------------------------------------------------------------- /week3_model_free/homework/expected_value_sarsa.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import random, math 3 | import numpy as np 4 | 5 | class EVSarsaAgent: 6 | def __init__(self, alpha, epsilon, discount, get_legal_actions): 7 | """ 8 | Expected Value SARSA Agent. 9 | 10 | The two main methods are 11 | - self.getAction(state) - returns agent's action in that state 12 | - self.update(state,action,nextState,reward) - returns agent's next action 13 | 14 | Instance variables you have access to 15 | - self.epsilon (exploration prob) 16 | - self.alpha (learning rate) 17 | - self.discount (discount rate aka gamma) 18 | 19 | """ 20 | 21 | self.get_legal_actions = get_legal_actions 22 | self._qvalues = defaultdict(lambda: defaultdict(lambda: 0)) 23 | self.alpha = alpha 24 | self.epsilon = epsilon 25 | self.discount = discount 26 | 27 | def get_qvalue(self, state, action): 28 | """ Returns Q(state,action) """ 29 | return self._qvalues[state][action] 30 | 31 | def set_qvalue(self,state,action,value): 32 | """ Sets the Qvalue for [state,action] to the given value """ 33 | self._qvalues[state][action] = value 34 | 35 | #---------------------START OF YOUR CODE---------------------# 36 | 37 | def get_value(self, state): 38 | """ 39 | Returns Vpi for current state under epsilon-greedy policy: 40 | V_{pi}(s) = sum _{over a_i} {pi(a_i | s) * Q(s, a_i)} 41 | 42 | Hint: all other methods from QLearningAgent are still accessible. 
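Under epsilon-greedy, the greedy action a* = argmax_a Q(s, a) has probability
(1 - epsilon) + epsilon / |A| and every other action has probability epsilon / |A|,
so the sum above simplifies to
    V_{pi}(s) = (1 - epsilon) * max_a Q(s, a) + epsilon * mean_a Q(s, a).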
43 | """ 44 | epsilon = self.epsilon 45 | possible_actions = self.get_legal_actions(state) 46 | 47 | #If there are no legal actions, return 0.0 48 | if len(possible_actions) == 0: 49 | return 0.0 50 | 51 | 52 | # 53 | possible_values = [self.get_qvalue(state,action) for action in possible_actions] 54 | index = np.argmax(possible_values) 55 | state_value = epsilon * possible_values[index] + (1 - epsilon)*(np.sum(possible_values))/len(possible_actions) 56 | 57 | return state_value 58 | 59 | def update(self, state, action, reward, next_state): 60 | """ 61 | You should do your Q-Value update here: 62 | Q(s,a) := (1 - alpha) * Q(s,a) + alpha * (r + gamma * V(s')) 63 | """ 64 | 65 | #agent parameters 66 | gamma = self.discount 67 | learning_rate = self.alpha 68 | 69 | # 70 | q_value = (1-learning_rate)*self.get_qvalue(state,action) + learning_rate*(reward + gamma*self.get_value(next_state)) 71 | 72 | self.set_qvalue(state, action, q_value) 73 | 74 | 75 | def get_best_action(self, state): 76 | """ 77 | Compute the best action to take in a state (using current q-values). 78 | """ 79 | possible_actions = self.get_legal_actions(state) 80 | 81 | #If there are no legal actions, return None 82 | if len(possible_actions) == 0: 83 | return None 84 | 85 | possible_q_values = [self.get_qvalue(state,action) for action in possible_actions] 86 | index = np.argmax(possible_q_values) 87 | best_action = possible_actions[index] 88 | 89 | return best_action 90 | 91 | def get_action(self, state): 92 | """ 93 | Compute the action to take in the current state, including exploration. 94 | With probability self.epsilon, we should take a random action. 95 | otherwise - the best policy action (self.getPolicy). 96 | 97 | Note: To pick randomly from a list, use random.choice(list). 98 | To pick True or False with a given probablity, generate uniform number in [0, 1] 99 | and compare it with your probability 100 | """ 101 | 102 | # Pick Action 103 | possible_actions = self.get_legal_actions(state) 104 | action = None 105 | 106 | #If there are no legal actions, return None 107 | if len(possible_actions) == 0: 108 | return None 109 | 110 | #agent parameters: 111 | epsilon = self.epsilon 112 | 113 | # 114 | choice = np.random.random() > epsilon 115 | 116 | if choice: 117 | chosen_action = self.get_best_action(state) 118 | else: 119 | chosen_action = random.choice(possible_actions) 120 | 121 | return chosen_action -------------------------------------------------------------------------------- /week3_model_free/homework/expected_value_sarsa_epsilon_annealing.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import random, math 3 | import numpy as np 4 | 5 | class EVSarsaAgent: 6 | def __init__(self, alpha, epsilon, discount, get_legal_actions): 7 | """ 8 | Expected Value SARSA Agent. 
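This variant additionally anneals exploration: each call to get_action multiplies
self.epsilon by 0.99, so the agent becomes greedier over time.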
9 | 10 | The two main methods are 11 | - self.getAction(state) - returns agent's action in that state 12 | - self.update(state,action,nextState,reward) - returns agent's next action 13 | 14 | Instance variables you have access to 15 | - self.epsilon (exploration prob) 16 | - self.alpha (learning rate) 17 | - self.discount (discount rate aka gamma) 18 | 19 | """ 20 | 21 | self.get_legal_actions = get_legal_actions 22 | self._qvalues = defaultdict(lambda: defaultdict(lambda: 0)) 23 | self.alpha = alpha 24 | self.epsilon = epsilon 25 | self.discount = discount 26 | 27 | def get_qvalue(self, state, action): 28 | """ Returns Q(state,action) """ 29 | return self._qvalues[state][action] 30 | 31 | def set_qvalue(self,state,action,value): 32 | """ Sets the Qvalue for [state,action] to the given value """ 33 | self._qvalues[state][action] = value 34 | 35 | #---------------------START OF YOUR CODE---------------------# 36 | 37 | def get_value(self, state): 38 | """ 39 | Returns Vpi for current state under epsilon-greedy policy: 40 | V_{pi}(s) = sum _{over a_i} {pi(a_i | s) * Q(s, a_i)} 41 | 42 | Hint: all other methods from QLearningAgent are still accessible. 43 | """ 44 | epsilon = self.epsilon 45 | possible_actions = self.get_legal_actions(state) 46 | 47 | #If there are no legal actions, return 0.0 48 | if len(possible_actions) == 0: 49 | return 0.0 50 | 51 | 52 | # 53 | possible_values = [self.get_qvalue(state,action) for action in possible_actions] 54 | index = np.argmax(possible_values) 55 | state_value = epsilon * possible_values[index] + (1 - epsilon)*(np.sum(possible_values))/len(possible_actions) 56 | 57 | return state_value 58 | 59 | def update(self, state, action, reward, next_state): 60 | """ 61 | You should do your Q-Value update here: 62 | Q(s,a) := (1 - alpha) * Q(s,a) + alpha * (r + gamma * V(s')) 63 | """ 64 | 65 | #agent parameters 66 | gamma = self.discount 67 | learning_rate = self.alpha 68 | 69 | # 70 | q_value = (1-learning_rate)*self.get_qvalue(state,action) + learning_rate*(reward + gamma*self.get_value(next_state)) 71 | 72 | self.set_qvalue(state, action, q_value) 73 | 74 | 75 | def get_best_action(self, state): 76 | """ 77 | Compute the best action to take in a state (using current q-values). 78 | """ 79 | possible_actions = self.get_legal_actions(state) 80 | 81 | #If there are no legal actions, return None 82 | if len(possible_actions) == 0: 83 | return None 84 | 85 | possible_q_values = [self.get_qvalue(state,action) for action in possible_actions] 86 | index = np.argmax(possible_q_values) 87 | best_action = possible_actions[index] 88 | 89 | return best_action 90 | 91 | def get_action(self, state): 92 | """ 93 | Compute the action to take in the current state, including exploration. 94 | With probability self.epsilon, we should take a random action. 95 | otherwise - the best policy action (self.getPolicy). 96 | 97 | Note: To pick randomly from a list, use random.choice(list). 
98 | To pick True or False with a given probablity, generate uniform number in [0, 1] 99 | and compare it with your probability 100 | """ 101 | 102 | # Pick Action 103 | possible_actions = self.get_legal_actions(state) 104 | action = None 105 | 106 | #If there are no legal actions, return None 107 | if len(possible_actions) == 0: 108 | return None 109 | 110 | #agent parameters: 111 | epsilon = self.epsilon 112 | self.epsilon = 0.99*epsilon 113 | 114 | # 115 | choice = np.random.random() > epsilon 116 | 117 | if choice: 118 | chosen_action = self.get_best_action(state) 119 | else: 120 | chosen_action = random.choice(possible_actions) 121 | 122 | return chosen_action -------------------------------------------------------------------------------- /week3_model_free/homework/q_learning_epsilon_annealing.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import random, math 3 | import numpy as np 4 | 5 | class QLearningAgent: 6 | def __init__(self, alpha, epsilon, discount, get_legal_actions): 7 | """ 8 | Q-Learning Agent 9 | based on http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 10 | Instance variables you have access to 11 | - self.epsilon (exploration prob) 12 | - self.alpha (learning rate) 13 | - self.discount (discount rate aka gamma) 14 | 15 | Functions you should use 16 | - self.get_legal_actions(state) {state, hashable -> list of actions, each is hashable} 17 | which returns legal actions for a state 18 | - self.get_qvalue(state,action) 19 | which returns Q(state,action) 20 | - self.set_qvalue(state,action,value) 21 | which sets Q(state,action) := value 22 | 23 | !!!Important!!! 24 | Note: please avoid using self._qValues directly. 25 | There's a special self.get_qvalue/set_qvalue for that. 26 | """ 27 | 28 | self.get_legal_actions = get_legal_actions 29 | self._qvalues = defaultdict(lambda: defaultdict(lambda: 0)) 30 | self.alpha = alpha 31 | self.epsilon = epsilon 32 | self.discount = discount 33 | 34 | def get_qvalue(self, state, action): 35 | """ Returns Q(state,action) """ 36 | return self._qvalues[state][action] 37 | 38 | def set_qvalue(self,state,action,value): 39 | """ Sets the Qvalue for [state,action] to the given value """ 40 | self._qvalues[state][action] = value 41 | 42 | #---------------------START OF YOUR CODE---------------------# 43 | 44 | def get_value(self, state): 45 | """ 46 | Compute your agent's estimate of V(s) using current q-values 47 | V(s) = max_over_action Q(state,action) over possible actions. 48 | Note: please take into account that q-values can be negative. 
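(For example, if every Q(s, a) is negative, V(s) is still the largest of those
negative values -- do not initialize the running maximum with 0.)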
49 | """ 50 | possible_actions = self.get_legal_actions(state) 51 | 52 | #If there are no legal actions, return 0.0 53 | if len(possible_actions) == 0: 54 | return 0.0 55 | 56 | 57 | # 58 | possible_values = [self.get_qvalue(state,action) for action in possible_actions] 59 | state_value = np.max(possible_values) 60 | 61 | return state_value 62 | 63 | def update(self, state, action, reward, next_state): 64 | """ 65 | You should do your Q-Value update here: 66 | Q(s,a) := (1 - alpha) * Q(s,a) + alpha * (r + gamma * V(s')) 67 | """ 68 | 69 | #agent parameters 70 | gamma = self.discount 71 | learning_rate = self.alpha 72 | 73 | # 74 | q_value = (1-learning_rate)*self.get_qvalue(state,action) + learning_rate*(reward + gamma*self.get_value(next_state)) 75 | 76 | self.set_qvalue(state, action, q_value) 77 | 78 | 79 | def get_best_action(self, state): 80 | """ 81 | Compute the best action to take in a state (using current q-values). 82 | """ 83 | possible_actions = self.get_legal_actions(state) 84 | 85 | #If there are no legal actions, return None 86 | if len(possible_actions) == 0: 87 | return None 88 | 89 | possible_q_values = [self.get_qvalue(state,action) for action in possible_actions] 90 | index = np.argmax(possible_q_values) 91 | best_action = possible_actions[index] 92 | 93 | return best_action 94 | 95 | def get_action(self, state): 96 | """ 97 | Compute the action to take in the current state, including exploration. 98 | With probability self.epsilon, we should take a random action. 99 | otherwise - the best policy action (self.getPolicy). 100 | 101 | Note: To pick randomly from a list, use random.choice(list). 102 | To pick True or False with a given probablity, generate uniform number in [0, 1] 103 | and compare it with your probability 104 | """ 105 | 106 | # Pick Action 107 | possible_actions = self.get_legal_actions(state) 108 | action = None 109 | 110 | #If there are no legal actions, return None 111 | if len(possible_actions) == 0: 112 | return None 113 | 114 | #agent parameters: 115 | epsilon = self.epsilon 116 | self.epsilon = 0.99 * epsilon 117 | 118 | # 119 | choice = np.random.random() > epsilon 120 | 121 | if choice: 122 | chosen_action = self.get_best_action(state) 123 | else: 124 | chosen_action = random.choice(possible_actions) 125 | 126 | return chosen_action -------------------------------------------------------------------------------- /week3_model_free/homework/qlearning.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import random, math 3 | import numpy as np 4 | 5 | class QLearningAgent: 6 | def __init__(self, alpha, epsilon, discount, get_legal_actions): 7 | """ 8 | Q-Learning Agent 9 | based on http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 10 | Instance variables you have access to 11 | - self.epsilon (exploration prob) 12 | - self.alpha (learning rate) 13 | - self.discount (discount rate aka gamma) 14 | 15 | Functions you should use 16 | - self.get_legal_actions(state) {state, hashable -> list of actions, each is hashable} 17 | which returns legal actions for a state 18 | - self.get_qvalue(state,action) 19 | which returns Q(state,action) 20 | - self.set_qvalue(state,action,value) 21 | which sets Q(state,action) := value 22 | 23 | !!!Important!!! 24 | Note: please avoid using self._qValues directly. 25 | There's a special self.get_qvalue/set_qvalue for that. 
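Typical training-loop usage, as a sketch assuming a gym-style environment:
    s = env.reset()
    a = agent.get_action(s)
    next_s, r, done, _ = env.step(a)
    agent.update(s, a, r, next_s)
    s = next_s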
26 | """ 27 | 28 | self.get_legal_actions = get_legal_actions 29 | self._qvalues = defaultdict(lambda: defaultdict(lambda: 0)) 30 | self.alpha = alpha 31 | self.epsilon = epsilon 32 | self.discount = discount 33 | 34 | def get_qvalue(self, state, action): 35 | """ Returns Q(state,action) """ 36 | return self._qvalues[state][action] 37 | 38 | def set_qvalue(self,state,action,value): 39 | """ Sets the Qvalue for [state,action] to the given value """ 40 | self._qvalues[state][action] = value 41 | 42 | #---------------------START OF YOUR CODE---------------------# 43 | 44 | def get_value(self, state): 45 | """ 46 | Compute your agent's estimate of V(s) using current q-values 47 | V(s) = max_over_action Q(state,action) over possible actions. 48 | Note: please take into account that q-values can be negative. 49 | """ 50 | possible_actions = self.get_legal_actions(state) 51 | 52 | #If there are no legal actions, return 0.0 53 | if len(possible_actions) == 0: 54 | return 0.0 55 | 56 | 57 | # 58 | possible_values = [self.get_qvalue(state,action) for action in possible_actions] 59 | state_value = np.max(possible_values) 60 | 61 | return state_value 62 | 63 | def update(self, state, action, reward, next_state): 64 | """ 65 | You should do your Q-Value update here: 66 | Q(s,a) := (1 - alpha) * Q(s,a) + alpha * (r + gamma * V(s')) 67 | """ 68 | 69 | #agent parameters 70 | gamma = self.discount 71 | learning_rate = self.alpha 72 | 73 | # 74 | q_value = (1-learning_rate)*self.get_qvalue(state,action) + learning_rate*(reward + gamma*self.get_value(next_state)) 75 | 76 | self.set_qvalue(state, action, q_value) 77 | 78 | 79 | def get_best_action(self, state): 80 | """ 81 | Compute the best action to take in a state (using current q-values). 82 | """ 83 | possible_actions = self.get_legal_actions(state) 84 | 85 | #If there are no legal actions, return None 86 | if len(possible_actions) == 0: 87 | return None 88 | 89 | possible_q_values = [self.get_qvalue(state,action) for action in possible_actions] 90 | index = np.argmax(possible_q_values) 91 | best_action = possible_actions[index] 92 | 93 | return best_action 94 | 95 | def get_action(self, state): 96 | """ 97 | Compute the action to take in the current state, including exploration. 98 | With probability self.epsilon, we should take a random action. 99 | otherwise - the best policy action (self.getPolicy). 100 | 101 | Note: To pick randomly from a list, use random.choice(list). 102 | To pick True or False with a given probablity, generate uniform number in [0, 1] 103 | and compare it with your probability 104 | """ 105 | 106 | # Pick Action 107 | possible_actions = self.get_legal_actions(state) 108 | action = None 109 | 110 | #If there are no legal actions, return None 111 | if len(possible_actions) == 0: 112 | return None 113 | 114 | #agent parameters: 115 | epsilon = self.epsilon 116 | 117 | # 118 | choice = np.random.random() > epsilon 119 | 120 | if choice: 121 | chosen_action = self.get_best_action(state) 122 | else: 123 | chosen_action = random.choice(possible_actions) 124 | 125 | return chosen_action -------------------------------------------------------------------------------- /week3_model_free/homework/sarsa.py: -------------------------------------------------------------------------------- 1 | """ 2 | Expected Value SARSA 3 | This file builds upon the same functions as Q-learning agent (qlearning.py). 4 | 5 | [assignment] 6 | The only thing you must implement is the getValue method. 
7 | - Recall that V(s) in SARSA is not the maximal but the expected Q-value. 8 | - The expectation should be done under agent's policy (e-greedy). 9 | 10 | 11 | Here's usage example: 12 | >>>from sarsa import SarsaAgent 13 | 14 | >>>agent = SarsaAgent(alpha=0.1,epsilon=0.25,discount=0.99, 15 | getLegalActions = lambda s: actions_from_that_state) 16 | >>>action = agent.getAction(state) 17 | >>>agent.update(state,action, next_state,reward) 18 | >>>agent.epsilon *= 0.99 19 | """ 20 | import random,math 21 | 22 | import numpy as np 23 | from collections import defaultdict 24 | 25 | class SarsaAgent(): 26 | """ 27 | Classical SARSA agent. 28 | 29 | The two main methods are 30 | - self.getAction(state) - returns agent's action in that state 31 | - self.update(state,action,reward,nextState,nextAction) - returns agent's next action 32 | 33 | Instance variables you have access to 34 | - self.epsilon (exploration prob) 35 | - self.alpha (learning rate) 36 | - self.discount (discount rate aka gamma) 37 | 38 | """ 39 | def __init__(self,alpha,epsilon,discount,getLegalActions): 40 | "We initialize agent and Q-values here." 41 | self.getLegalActions= getLegalActions 42 | self._qValues = defaultdict(lambda:defaultdict(lambda:0)) 43 | self.alpha = alpha 44 | self.epsilon = epsilon 45 | self.discount = discount 46 | 47 | def getQValue(self, state, action): 48 | """ 49 | Returns Q(state,action) 50 | """ 51 | return self._qValues[state][action] 52 | 53 | def setQValue(self,state,action,value): 54 | """ 55 | Sets the Qvalue for [state,action] to the given value 56 | """ 57 | self._qValues[state][action] = value 58 | 59 | #---------------------#start of your code#---------------------# 60 | 61 | def getPolicy(self, state): 62 | """ 63 | Compute the best action to take in a state. 64 | 65 | """ 66 | possibleActions = self.getLegalActions(state) 67 | 68 | #If there are no legal actions, return None 69 | if len(possibleActions) == 0: 70 | return None 71 | 72 | best_action = None 73 | 74 | "*** this code works exactly as Q-learning ***" 75 | best_action = possibleActions[np.argmax([self.getQValue(state, a) for a in possibleActions])] 76 | return best_action 77 | 78 | def getAction(self, state): 79 | """ 80 | Compute the action to take in the current state, including exploration. 81 | 82 | With probability self.epsilon, we should take a random action. 83 | otherwise - the best policy action (self.getPolicy). 
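        Written out, each legal action a is picked with probability
            epsilon / |A(s)|  +  (1 - epsilon) * [a is the greedy action]
        (a sketch; this is also the distribution the expected value is taken over).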
84 | 85 | HINT: You might want to use util.flipCoin(prob) 86 | HINT: To pick randomly from a list, use random.choice(list) 87 | 88 | """ 89 | 90 | # Pick Action 91 | possibleActions = self.getLegalActions(state) 92 | action = None 93 | 94 | #If there are no legal actions, return None 95 | if len(possibleActions) == 0: 96 | return None 97 | 98 | #agent parameters: 99 | epsilon = self.epsilon 100 | 101 | "*** Epsilon-greedy strategy exactly as Q-learning ***" 102 | if np.random.random()<=epsilon: 103 | return random.choice(possibleActions) 104 | else: 105 | action = self.getPolicy(state) 106 | return action 107 | 108 | def update(self, state, action, nextState,nextAction, reward): 109 | """ 110 | You should do your Q-Value update here 111 | 112 | NOTE: You should never call this function, 113 | it will be called on your behalf 114 | 115 | 116 | """ 117 | #agent parameters 118 | gamma = self.discount 119 | learning_rate = self.alpha 120 | 121 | "*** YOUR CODE HERE ***" 122 | reference_qvalue = 123 | 124 | updated_qvalue = (1-learning_rate) * self.getQValue(state,action) + learning_rate * reference_qvalue 125 | 126 | self.setQValue(state,action,updated_qvalue) 127 | 128 | 129 | #---------------------#end of your code#---------------------# 130 | 131 | 132 | -------------------------------------------------------------------------------- /week3_model_free/seminar_alternative/qlearning.py: -------------------------------------------------------------------------------- 1 | # qlearningAgents.py 2 | # ------------------ 3 | ## based on http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 4 | 5 | import random,math 6 | 7 | import numpy as np 8 | from collections import defaultdict 9 | 10 | class QLearningAgent(): 11 | """ 12 | Q-Learning Agent 13 | 14 | Instance variables you have access to 15 | - self.epsilon (exploration prob) 16 | - self.alpha (learning rate) 17 | - self.discount (discount rate aka gamma) 18 | 19 | Functions you should use 20 | - self.getLegalActions(state) 21 | which returns legal actions for a state 22 | - self.getQValue(state,action) 23 | which returns Q(state,action) 24 | - self.setQValue(state,action,value) 25 | which sets Q(state,action) := value 26 | 27 | !!!Important!!! 28 | NOTE: please avoid using self._qValues directly to make code cleaner 29 | """ 30 | def __init__(self,alpha,epsilon,discount,getLegalActions): 31 | "We initialize agent and Q-values here." 32 | self.getLegalActions= getLegalActions 33 | self._qValues = defaultdict(lambda:defaultdict(lambda:0)) 34 | self.alpha = alpha 35 | self.epsilon = epsilon 36 | self.discount = discount 37 | 38 | def getQValue(self, state, action): 39 | """ 40 | Returns Q(state,action) 41 | """ 42 | return self._qValues[state][action] 43 | 44 | def setQValue(self,state,action,value): 45 | """ 46 | Sets the Qvalue for [state,action] to the given value 47 | """ 48 | self._qValues[state][action] = value 49 | 50 | #---------------------#start of your code#---------------------# 51 | 52 | def getValue(self, state): 53 | """ 54 | Returns max_action Q(state,action) 55 | where the max is over legal actions. 56 | """ 57 | 58 | possibleActions = self.getLegalActions(state) 59 | #If there are no legal actions, return 0.0 60 | if len(possibleActions) == 0: 61 | return 0.0 62 | 63 | "*** YOUR CODE HERE ***" 64 | return 65 | 66 | def getPolicy(self, state): 67 | """ 68 | Compute the best action to take in a state. 
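        One possible one-liner for the blank below (a sketch):
            best_action = max(possibleActions, key=lambda a: self.getQValue(state, a))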
69 | 70 | """ 71 | possibleActions = self.getLegalActions(state) 72 | 73 | #If there are no legal actions, return None 74 | if len(possibleActions) == 0: 75 | return None 76 | 77 | best_action = None 78 | 79 | "*** YOUR CODE HERE ***" 80 | best_action = 81 | return best_action 82 | 83 | def getAction(self, state): 84 | """ 85 | Compute the action to take in the current state, including exploration. 86 | 87 | With probability self.epsilon, we should take a random action. 88 | otherwise - the best policy action (self.getPolicy). 89 | 90 | HINT: You might want to use util.flipCoin(prob) 91 | HINT: To pick randomly from a list, use random.choice(list) 92 | 93 | """ 94 | 95 | # Pick Action 96 | possibleActions = self.getLegalActions(state) 97 | action = None 98 | 99 | #If there are no legal actions, return None 100 | if len(possibleActions) == 0: 101 | return None 102 | 103 | #agent parameters: 104 | epsilon = self.epsilon 105 | 106 | "*** YOUR CODE HERE ***" 107 | 108 | return 109 | 110 | def update(self, state, action, nextState, reward): 111 | """ 112 | You should do your Q-Value update here 113 | 114 | NOTE: You should never call this function, 115 | it will be called on your behalf 116 | 117 | 118 | """ 119 | #agent parameters 120 | gamma = self.discount 121 | learning_rate = self.alpha 122 | 123 | "*** YOUR CODE HERE ***" 124 | reference_qvalue = 125 | 126 | updated_qvalue = (1-learning_rate) * self.getQValue(state,action) + learning_rate * reference_qvalue 127 | self.setQValue(state,action,updated_qvalue) 128 | 129 | 130 | #---------------------#end of your code#---------------------# 131 | 132 | 133 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/analysis.py: -------------------------------------------------------------------------------- 1 | # analysis.py 2 | # ----------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | ###################### 10 | # ANALYSIS QUESTIONS # 11 | ###################### 12 | 13 | # Change these default values to obtain the specified policies through 14 | # value iteration. 
15 | 16 | def question2a(): 17 | answerDiscount = 0.9 18 | answerNoise = 0.2 19 | answerLivingReward = 0.0 20 | return answerDiscount, answerNoise, answerLivingReward 21 | # If not possible, return 'NOT POSSIBLE' 22 | 23 | def question2b(): 24 | answerDiscount = 0.9 25 | answerNoise = 0.2 26 | answerLivingReward = 0.0 27 | return answerDiscount, answerNoise, answerLivingReward 28 | # If not possible, return 'NOT POSSIBLE' 29 | 30 | def question2c(): 31 | answerDiscount = 0.9 32 | answerNoise = 0.2 33 | answerLivingReward = 0.0 34 | return answerDiscount, answerNoise, answerLivingReward 35 | # If not possible, return 'NOT POSSIBLE' 36 | 37 | def question2d(): 38 | answerDiscount = 0.9 39 | answerNoise = 0.2 40 | answerLivingReward = 0.0 41 | return answerDiscount, answerNoise, answerLivingReward 42 | # If not possible, return 'NOT POSSIBLE' 43 | 44 | def question2e(): 45 | answerDiscount = 0.9 46 | answerNoise = 0.2 47 | answerLivingReward = 0.0 48 | return answerDiscount, answerNoise, answerLivingReward 49 | # If not possible, return 'NOT POSSIBLE' 50 | 51 | if __name__ == '__main__': 52 | print 'Answers to analysis questions:' 53 | import analysis 54 | for q in [q for q in dir(analysis) if q.startswith('question')]: 55 | response = getattr(analysis, q)() 56 | print ' Question %s:\t%s' % (q, str(response)) 57 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/environment.py: -------------------------------------------------------------------------------- 1 | # environment.py 2 | # -------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | #!/usr/bin/python 10 | 11 | class Environment: 12 | 13 | def getCurrentState(self): 14 | """ 15 | Returns the current state of enviornment 16 | """ 17 | abstract 18 | 19 | def getPossibleActions(self, state): 20 | """ 21 | Returns possible actions the agent 22 | can take in the given state. Can 23 | return the empty list if we are in 24 | a terminal state. 25 | """ 26 | abstract 27 | 28 | def doAction(self, action): 29 | """ 30 | Performs the given action in the current 31 | environment state and updates the enviornment. 32 | 33 | Returns a (reward, nextState) pair 34 | """ 35 | abstract 36 | 37 | def reset(self): 38 | """ 39 | Resets the current state to the start state 40 | """ 41 | abstract 42 | 43 | def isTerminal(self): 44 | """ 45 | Has the enviornment entered a terminal 46 | state? This means there are no successors 47 | """ 48 | state = self.getCurrentState() 49 | actions = self.getPossibleActions(state) 50 | return len(actions) == 0 51 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/featureExtractors.py: -------------------------------------------------------------------------------- 1 | # featureExtractors.py 2 | # -------------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 
7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | "Feature extractors for Pacman game states" 10 | 11 | from game import Directions, Actions 12 | import util 13 | 14 | class FeatureExtractor: 15 | def getFeatures(self, state, action): 16 | """ 17 | Returns a dict from features to counts 18 | Usually, the count will just be 1.0 for 19 | indicator functions. 20 | """ 21 | util.raiseNotDefined() 22 | 23 | class IdentityExtractor(FeatureExtractor): 24 | def getFeatures(self, state, action): 25 | feats = util.Counter() 26 | feats[(state,action)] = 1.0 27 | return feats 28 | 29 | def closestFood(pos, food, walls): 30 | """ 31 | closestFood -- this is similar to the function that we have 32 | worked on in the search project; here its all in one place 33 | """ 34 | fringe = [(pos[0], pos[1], 0)] 35 | expanded = set() 36 | while fringe: 37 | pos_x, pos_y, dist = fringe.pop(0) 38 | if (pos_x, pos_y) in expanded: 39 | continue 40 | expanded.add((pos_x, pos_y)) 41 | # if we find a food at this location then exit 42 | if food[pos_x][pos_y]: 43 | return dist 44 | # otherwise spread out from the location to its neighbours 45 | nbrs = Actions.getLegalNeighbors((pos_x, pos_y), walls) 46 | for nbr_x, nbr_y in nbrs: 47 | fringe.append((nbr_x, nbr_y, dist+1)) 48 | # no food found 49 | return None 50 | 51 | class SimpleExtractor(FeatureExtractor): 52 | """ 53 | Returns simple features for a basic reflex Pacman: 54 | - whether food will be eaten 55 | - how far away the next food is 56 | - whether a ghost collision is imminent 57 | - whether a ghost is one step away 58 | """ 59 | 60 | def getFeatures(self, state, action): 61 | # extract the grid of food and wall locations and get the ghost locations 62 | food = state.getFood() 63 | walls = state.getWalls() 64 | ghosts = state.getGhostPositions() 65 | 66 | features = util.Counter() 67 | 68 | features["bias"] = 1.0 69 | 70 | # compute the location of pacman after he takes the action 71 | x, y = state.getPacmanPosition() 72 | dx, dy = Actions.directionToVector(action) 73 | next_x, next_y = int(x + dx), int(y + dy) 74 | 75 | # count the number of ghosts 1-step away 76 | features["#-of-ghosts-1-step-away"] = sum((next_x, next_y) in Actions.getLegalNeighbors(g, walls) for g in ghosts) 77 | 78 | # if there is no danger of ghosts then add the food feature 79 | if not features["#-of-ghosts-1-step-away"] and food[next_x][next_y]: 80 | features["eats-food"] = 1.0 81 | 82 | dist = closestFood((next_x, next_y), food, walls) 83 | if dist is not None: 84 | # make the distance a number less than one otherwise the update 85 | # will diverge wildly 86 | features["closest-food"] = float(dist) / (walls.width * walls.height) 87 | features.divideAll(10.0) 88 | return features -------------------------------------------------------------------------------- /week3_model_free/seminar_main/ghostAgents.py: -------------------------------------------------------------------------------- 1 | # ghostAgents.py 2 | # -------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 
7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | from game import Agent 10 | from game import Actions 11 | from game import Directions 12 | import random 13 | from util import manhattanDistance 14 | import util 15 | 16 | class GhostAgent( Agent ): 17 | def __init__( self, index ): 18 | self.index = index 19 | 20 | def getAction( self, state ): 21 | dist = self.getDistribution(state) 22 | if len(dist) == 0: 23 | return Directions.STOP 24 | else: 25 | return util.chooseFromDistribution( dist ) 26 | 27 | def getDistribution(self, state): 28 | "Returns a Counter encoding a distribution over actions from the provided state." 29 | util.raiseNotDefined() 30 | 31 | class RandomGhost( GhostAgent ): 32 | "A ghost that chooses a legal action uniformly at random." 33 | def getDistribution( self, state ): 34 | dist = util.Counter() 35 | for a in state.getLegalActions( self.index ): dist[a] = 1.0 36 | dist.normalize() 37 | return dist 38 | 39 | class DirectionalGhost( GhostAgent ): 40 | "A ghost that prefers to rush Pacman, or flee when scared." 41 | def __init__( self, index, prob_attack=0.8, prob_scaredFlee=0.8 ): 42 | self.index = index 43 | self.prob_attack = prob_attack 44 | self.prob_scaredFlee = prob_scaredFlee 45 | 46 | def getDistribution( self, state ): 47 | # Read variables from state 48 | ghostState = state.getGhostState( self.index ) 49 | legalActions = state.getLegalActions( self.index ) 50 | pos = state.getGhostPosition( self.index ) 51 | isScared = ghostState.scaredTimer > 0 52 | 53 | speed = 1 54 | if isScared: speed = 0.5 55 | 56 | actionVectors = [Actions.directionToVector( a, speed ) for a in legalActions] 57 | newPositions = [( pos[0]+a[0], pos[1]+a[1] ) for a in actionVectors] 58 | pacmanPosition = state.getPacmanPosition() 59 | 60 | # Select best actions given the state 61 | distancesToPacman = [manhattanDistance( pos, pacmanPosition ) for pos in newPositions] 62 | if isScared: 63 | bestScore = max( distancesToPacman ) 64 | bestProb = self.prob_scaredFlee 65 | else: 66 | bestScore = min( distancesToPacman ) 67 | bestProb = self.prob_attack 68 | bestActions = [action for action, distance in zip( legalActions, distancesToPacman ) if distance == bestScore] 69 | 70 | # Construct distribution 71 | dist = util.Counter() 72 | for a in bestActions: dist[a] = bestProb / len(bestActions) 73 | for a in legalActions: dist[a] += ( 1-bestProb ) / len(legalActions) 74 | dist.normalize() 75 | return dist 76 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/how2run: -------------------------------------------------------------------------------- 1 | python pacman.py -p PacmanQAgent -x 5000 -n 5010 -l smallGrid 2 | python pacman.py -p PacmanQAgent -x 10000 -n 10010 -l mediumGrid 3 | python pacman.py -p PacmanQAgent -x 100 -n 110 -l mediumClassic 4 | python gridworld.py -a q -k 50 -n 0 -g BridgeGrid -e 1 5 | python crawler.py 6 | 7 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/keyboardAgents.py: -------------------------------------------------------------------------------- 1 | # keyboardAgents.py 2 | # ----------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. 
The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | from game import Agent 10 | from game import Directions 11 | import random 12 | 13 | class KeyboardAgent(Agent): 14 | """ 15 | An agent controlled by the keyboard. 16 | """ 17 | # NOTE: Arrow keys also work. 18 | WEST_KEY = 'a' 19 | EAST_KEY = 'd' 20 | NORTH_KEY = 'w' 21 | SOUTH_KEY = 's' 22 | STOP_KEY = 'q' 23 | 24 | def __init__( self, index = 0 ): 25 | 26 | self.lastMove = Directions.STOP 27 | self.index = index 28 | self.keys = [] 29 | 30 | def getAction( self, state): 31 | from graphicsUtils import keys_waiting 32 | from graphicsUtils import keys_pressed 33 | keys = keys_waiting() + keys_pressed() 34 | if keys != []: 35 | self.keys = keys 36 | 37 | legal = state.getLegalActions(self.index) 38 | move = self.getMove(legal) 39 | 40 | if move == Directions.STOP: 41 | # Try to move in the same direction as before 42 | if self.lastMove in legal: 43 | move = self.lastMove 44 | 45 | if (self.STOP_KEY in self.keys) and Directions.STOP in legal: move = Directions.STOP 46 | 47 | if move not in legal: 48 | move = random.choice(legal) 49 | 50 | self.lastMove = move 51 | return move 52 | 53 | def getMove(self, legal): 54 | move = Directions.STOP 55 | if (self.WEST_KEY in self.keys or 'Left' in self.keys) and Directions.WEST in legal: move = Directions.WEST 56 | if (self.EAST_KEY in self.keys or 'Right' in self.keys) and Directions.EAST in legal: move = Directions.EAST 57 | if (self.NORTH_KEY in self.keys or 'Up' in self.keys) and Directions.NORTH in legal: move = Directions.NORTH 58 | if (self.SOUTH_KEY in self.keys or 'Down' in self.keys) and Directions.SOUTH in legal: move = Directions.SOUTH 59 | return move 60 | 61 | class KeyboardAgent2(KeyboardAgent): 62 | """ 63 | A second agent controlled by the keyboard. 64 | """ 65 | # NOTE: Arrow keys also work. 66 | WEST_KEY = 'j' 67 | EAST_KEY = "l" 68 | NORTH_KEY = 'i' 69 | SOUTH_KEY = 'k' 70 | STOP_KEY = 'u' 71 | 72 | def getMove(self, legal): 73 | move = Directions.STOP 74 | if (self.WEST_KEY in self.keys) and Directions.WEST in legal: move = Directions.WEST 75 | if (self.EAST_KEY in self.keys) and Directions.EAST in legal: move = Directions.EAST 76 | if (self.NORTH_KEY in self.keys) and Directions.NORTH in legal: move = Directions.NORTH 77 | if (self.SOUTH_KEY in self.keys) and Directions.SOUTH in legal: move = Directions.SOUTH 78 | return move 79 | 80 | 81 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/layout.py: -------------------------------------------------------------------------------- 1 | # layout.py 2 | # --------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | from util import manhattanDistance 10 | from game import Grid 11 | import os 12 | import random 13 | 14 | VISIBILITY_MATRIX_CACHE = {} 15 | 16 | class Layout: 17 | """ 18 | A Layout manages the static information about the game board. 
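    Typical access pattern (a sketch; mediumClassic is one of the bundled layouts):
        lay = getLayout('mediumClassic')
        print lay.width, lay.height, lay.getNumGhosts()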
19 | """ 20 | 21 | def __init__(self, layoutText): 22 | self.width = len(layoutText[0]) 23 | self.height= len(layoutText) 24 | self.walls = Grid(self.width, self.height, False) 25 | self.food = Grid(self.width, self.height, False) 26 | self.capsules = [] 27 | self.agentPositions = [] 28 | self.numGhosts = 0 29 | self.processLayoutText(layoutText) 30 | self.layoutText = layoutText 31 | # self.initializeVisibilityMatrix() 32 | 33 | def getNumGhosts(self): 34 | return self.numGhosts 35 | 36 | def initializeVisibilityMatrix(self): 37 | global VISIBILITY_MATRIX_CACHE 38 | if reduce(str.__add__, self.layoutText) not in VISIBILITY_MATRIX_CACHE: 39 | from game import Directions 40 | vecs = [(-0.5,0), (0.5,0),(0,-0.5),(0,0.5)] 41 | dirs = [Directions.NORTH, Directions.SOUTH, Directions.WEST, Directions.EAST] 42 | vis = Grid(self.width, self.height, {Directions.NORTH:set(), Directions.SOUTH:set(), Directions.EAST:set(), Directions.WEST:set(), Directions.STOP:set()}) 43 | for x in range(self.width): 44 | for y in range(self.height): 45 | if self.walls[x][y] == False: 46 | for vec, direction in zip(vecs, dirs): 47 | dx, dy = vec 48 | nextx, nexty = x + dx, y + dy 49 | while (nextx + nexty) != int(nextx) + int(nexty) or not self.walls[int(nextx)][int(nexty)] : 50 | vis[x][y][direction].add((nextx, nexty)) 51 | nextx, nexty = x + dx, y + dy 52 | self.visibility = vis 53 | VISIBILITY_MATRIX_CACHE[reduce(str.__add__, self.layoutText)] = vis 54 | else: 55 | self.visibility = VISIBILITY_MATRIX_CACHE[reduce(str.__add__, self.layoutText)] 56 | 57 | def isWall(self, pos): 58 | x, col = pos 59 | return self.walls[x][col] 60 | 61 | def getRandomLegalPosition(self): 62 | x = random.choice(range(self.width)) 63 | y = random.choice(range(self.height)) 64 | while self.isWall( (x, y) ): 65 | x = random.choice(range(self.width)) 66 | y = random.choice(range(self.height)) 67 | return (x,y) 68 | 69 | def getRandomCorner(self): 70 | poses = [(1,1), (1, self.height - 2), (self.width - 2, 1), (self.width - 2, self.height - 2)] 71 | return random.choice(poses) 72 | 73 | def getFurthestCorner(self, pacPos): 74 | poses = [(1,1), (1, self.height - 2), (self.width - 2, 1), (self.width - 2, self.height - 2)] 75 | dist, pos = max([(manhattanDistance(p, pacPos), p) for p in poses]) 76 | return pos 77 | 78 | def isVisibleFrom(self, ghostPos, pacPos, pacDirection): 79 | row, col = [int(x) for x in pacPos] 80 | return ghostPos in self.visibility[row][col][pacDirection] 81 | 82 | def __str__(self): 83 | return "\n".join(self.layoutText) 84 | 85 | def deepCopy(self): 86 | return Layout(self.layoutText[:]) 87 | 88 | def processLayoutText(self, layoutText): 89 | """ 90 | Coordinates are flipped from the input format to the (x,y) convention here 91 | 92 | The shape of the maze. Each character 93 | represents a different type of object. 94 | % - Wall 95 | . - Food 96 | o - Capsule 97 | G - Ghost 98 | P - Pacman 99 | Other characters are ignored. 
100 | """ 101 | maxY = self.height - 1 102 | for y in range(self.height): 103 | for x in range(self.width): 104 | layoutChar = layoutText[maxY - y][x] 105 | self.processLayoutChar(x, y, layoutChar) 106 | self.agentPositions.sort() 107 | self.agentPositions = [ ( i == 0, pos) for i, pos in self.agentPositions] 108 | 109 | def processLayoutChar(self, x, y, layoutChar): 110 | if layoutChar == '%': 111 | self.walls[x][y] = True 112 | elif layoutChar == '.': 113 | self.food[x][y] = True 114 | elif layoutChar == 'o': 115 | self.capsules.append((x, y)) 116 | elif layoutChar == 'P': 117 | self.agentPositions.append( (0, (x, y) ) ) 118 | elif layoutChar in ['G']: 119 | self.agentPositions.append( (1, (x, y) ) ) 120 | self.numGhosts += 1 121 | elif layoutChar in ['1', '2', '3', '4']: 122 | self.agentPositions.append( (int(layoutChar), (x,y))) 123 | self.numGhosts += 1 124 | def getLayout(name, back = 2): 125 | if name.endswith('.lay'): 126 | layout = tryToLoad('layouts/' + name) 127 | if layout == None: layout = tryToLoad(name) 128 | else: 129 | layout = tryToLoad('layouts/' + name + '.lay') 130 | if layout == None: layout = tryToLoad(name + '.lay') 131 | if layout == None and back >= 0: 132 | curdir = os.path.abspath('.') 133 | os.chdir('..') 134 | layout = getLayout(name, back -1) 135 | os.chdir(curdir) 136 | return layout 137 | 138 | def tryToLoad(fullname): 139 | if(not os.path.exists(fullname)): return None 140 | f = open(fullname) 141 | try: return Layout([line.strip() for line in f]) 142 | finally: f.close() -------------------------------------------------------------------------------- /week3_model_free/seminar_main/layouts/capsuleClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%% 2 | %G. G ....% 3 | %.% % %%%%%% %.%%.% 4 | %.%o% % o% %.o%.% 5 | %.%%%.% %%% %..%.% 6 | %..... P %..%G% 7 | %%%%%%%%%%%%%%%%%%%% 8 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/layouts/contestClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%% 2 | %o...%........%...o% 3 | %.%%.%.%%..%%.%.%%.% 4 | %...... G GG%......% 5 | %.%.%%.%% %%%.%%.%.% 6 | %.%....% ooo%.%..%.% 7 | %.%.%%.% %% %.%.%%.% 8 | %o%......P....%....% 9 | %%%%%%%%%%%%%%%%%%%% 10 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/layouts/mediumClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%% 2 | %o...%........%....% 3 | %.%%.%.%%%%%%.%.%%.% 4 | %.%..............%.% 5 | %.%.%%.%% %%.%%.%.% 6 | %......%G G%......% 7 | %.%.%%.%%%%%%.%%.%.% 8 | %.%..............%.% 9 | %.%%.%.%%%%%%.%.%%.% 10 | %....%...P....%...o% 11 | %%%%%%%%%%%%%%%%%%%% 12 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/layouts/mediumGrid.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%% 2 | %P % 3 | % .% . % 4 | % % % 5 | % .% . 
% 6 | % G% 7 | %%%%%%%% 8 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/layouts/minimaxClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%% 2 | %.P G% 3 | % %.%G%%% 4 | %G %%% 5 | %%%%%%%%% 6 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/layouts/openClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%%%%%%% 2 | %.. P .... .... % 3 | %.. ... ... ... ... % 4 | %.. ... ... ... ... % 5 | %.. .... .... G % 6 | %.. ... ... ... ... % 7 | %.. ... ... ... ... % 8 | %.. .... .... o% 9 | %%%%%%%%%%%%%%%%%%%%%%%%% 10 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/layouts/originalClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | %............%%............% 3 | %.%%%%.%%%%%.%%.%%%%%.%%%%.% 4 | %o%%%%.%%%%%.%%.%%%%%.%%%%o% 5 | %.%%%%.%%%%%.%%.%%%%%.%%%%.% 6 | %..........................% 7 | %.%%%%.%%.%%%%%%%%.%%.%%%%.% 8 | %.%%%%.%%.%%%%%%%%.%%.%%%%.% 9 | %......%%....%%....%%......% 10 | %%%%%%.%%%%% %% %%%%%.%%%%%% 11 | %%%%%%.%%%%% %% %%%%%.%%%%%% 12 | %%%%%%.% %.%%%%%% 13 | %%%%%%.% %%%% %%%% %.%%%%%% 14 | % . %G GG G% . % 15 | %%%%%%.% %%%%%%%%%% %.%%%%%% 16 | %%%%%%.% %.%%%%%% 17 | %%%%%%.% %%%%%%%%%% %.%%%%%% 18 | %............%%............% 19 | %.%%%%.%%%%%.%%.%%%%%.%%%%.% 20 | %.%%%%.%%%%%.%%.%%%%%.%%%%.% 21 | %o..%%....... .......%%..o% 22 | %%%.%%.%%.%%%%%%%%.%%.%%.%%% 23 | %%%.%%.%%.%%%%%%%%.%%.%%.%%% 24 | %......%%....%%....%%......% 25 | %.%%%%%%%%%%.%%.%%%%%%%%%%.% 26 | %.............P............% 27 | %%%%%%%%%%%%%%%%%%%%%%%%%%%% 28 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/layouts/smallClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%% 2 | %......%G G%......% 3 | %.%%...%% %%...%%.% 4 | %.%o.%........%.o%.% 5 | %.%%.%.%%%%%%.%.%%.% 6 | %........P.........% 7 | %%%%%%%%%%%%%%%%%%%% 8 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/layouts/smallGrid.lay: -------------------------------------------------------------------------------- 1 | %%%%%%% 2 | % P % 3 | % %%% % 4 | % %. % 5 | % %%% % 6 | %. G % 7 | %%%%%%% 8 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/layouts/testClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%% 2 | % . % 3 | %.G.% 4 | % . % 5 | %. .% 6 | % % 7 | % .% 8 | % % 9 | %P .% 10 | %%%%% 11 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/layouts/trappedClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%% 2 | % P G% 3 | %G%%%%%% 4 | %.... % 5 | %%%%%%%% 6 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/layouts/trickyClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%% 2 | %o...%........%...o% 3 | %.%%.%.%%..%%.%.%%.% 4 | %.%.....%..%.....%.% 5 | %.%.%%.%% %%.%%.%.% 6 | %...... 
GGGG%.%....% 7 | %.%....%%%%%%.%..%.% 8 | %.%....% oo%.%..%.% 9 | %.%....% %%%%.%..%.% 10 | %.%...........%..%.% 11 | %.%%.%.%%%%%%.%.%%.% 12 | %o...%...P....%...o% 13 | %%%%%%%%%%%%%%%%%%%% 14 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/learningAgents.py: -------------------------------------------------------------------------------- 1 | # learningAgents.py 2 | # ----------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | from game import Directions, Agent, Actions 10 | 11 | import random,util,time 12 | 13 | class ValueEstimationAgent(Agent): 14 | """ 15 | Abstract agent which assigns values to (state,action) 16 | Q-Values for an environment. As well as a value to a 17 | state and a policy given respectively by, 18 | 19 | V(s) = max_{a in actions} Q(s,a) 20 | policy(s) = arg_max_{a in actions} Q(s,a) 21 | 22 | Both ValueIterationAgent and QLearningAgent inherit 23 | from this agent. While a ValueIterationAgent has 24 | a model of the environment via a MarkovDecisionProcess 25 | (see mdp.py) that is used to estimate Q-Values before 26 | ever actually acting, the QLearningAgent estimates 27 | Q-Values while acting in the environment. 28 | """ 29 | 30 | def __init__(self, alpha=1.0, epsilon=0.05, gamma=0.8, numTraining = 10): 31 | """ 32 | Sets options, which can be passed in via the Pacman command line using -a alpha=0.5,... 33 | alpha - learning rate 34 | epsilon - exploration rate 35 | gamma - discount factor 36 | numTraining - number of training episodes, i.e. no learning after these many episodes 37 | """ 38 | self.alpha = float(alpha) 39 | self.epsilon = float(epsilon) 40 | self.discount = float(gamma) 41 | self.numTraining = int(numTraining) 42 | 43 | #################################### 44 | # Override These Functions # 45 | #################################### 46 | def getQValue(self, state, action): 47 | """ 48 | Should return Q(state,action) 49 | """ 50 | util.raiseNotDefined() 51 | 52 | def getValue(self, state): 53 | """ 54 | What is the value of this state under the best action? 55 | Concretely, this is given by 56 | 57 | V(s) = max_{a in actions} Q(s,a) 58 | """ 59 | util.raiseNotDefined() 60 | 61 | def getPolicy(self, state): 62 | """ 63 | What is the best action to take in the state. Note that because 64 | we might want to explore, this might not coincide with getAction 65 | Concretely, this is given by 66 | 67 | policy(s) = arg_max_{a in actions} Q(s,a) 68 | 69 | If many actions achieve the maximal Q-value, 70 | it doesn't matter which is selected. 71 | """ 72 | util.raiseNotDefined() 73 | 74 | def getAction(self, state): 75 | """ 76 | state: can call state.getLegalActions() 77 | Choose an action and return it. 
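        A typical concrete override (a sketch, mirroring the Q-learning agents in
        this project) explores with some probability and is greedy otherwise:
            if util.flipCoin(self.epsilon):
                return random.choice(state.getLegalActions())
            return self.getPolicy(state)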
78 | """ 79 | util.raiseNotDefined() 80 | 81 | class ReinforcementAgent(ValueEstimationAgent): 82 | """ 83 | Abstract Reinforcemnt Agent: A ValueEstimationAgent 84 | which estimates Q-Values (as well as policies) from experience 85 | rather than a model 86 | 87 | What you need to know: 88 | - The environment will call 89 | observeTransition(state,action,nextState,deltaReward), 90 | which will call update(state, action, nextState, deltaReward) 91 | which you should override. 92 | - Use self.getLegalActions(state) to know which actions 93 | are available in a state 94 | """ 95 | #################################### 96 | # Override These Functions # 97 | #################################### 98 | 99 | def update(self, state, action, nextState, reward): 100 | """ 101 | This class will call this function, which you write, after 102 | observing a transition and reward 103 | """ 104 | util.raiseNotDefined() 105 | 106 | #################################### 107 | # Read These Functions # 108 | #################################### 109 | 110 | def getLegalActions(self,state): 111 | """ 112 | Get the actions available for a given 113 | state. This is what you should use to 114 | obtain legal actions for a state 115 | """ 116 | return self.actionFn(state) 117 | 118 | def observeTransition(self, state,action,nextState,deltaReward): 119 | """ 120 | Called by environment to inform agent that a transition has 121 | been observed. This will result in a call to self.update 122 | on the same arguments 123 | 124 | NOTE: Do *not* override or call this function 125 | """ 126 | self.episodeRewards += deltaReward 127 | self.update(state,action,nextState,deltaReward) 128 | 129 | def startEpisode(self): 130 | """ 131 | Called by environment when new episode is starting 132 | """ 133 | self.lastState = None 134 | self.lastAction = None 135 | self.episodeRewards = 0.0 136 | 137 | def stopEpisode(self): 138 | """ 139 | Called by environment when episode is done 140 | """ 141 | if self.episodesSoFar < self.numTraining: 142 | self.accumTrainRewards += self.episodeRewards 143 | else: 144 | self.accumTestRewards += self.episodeRewards 145 | self.episodesSoFar += 1 146 | if self.episodesSoFar >= self.numTraining: 147 | # Take off the training wheels 148 | self.epsilon = 0.0 # no exploration 149 | self.alpha = 0.0 # no learning 150 | 151 | def isInTraining(self): 152 | return self.episodesSoFar < self.numTraining 153 | 154 | def isInTesting(self): 155 | return not self.isInTraining() 156 | 157 | def __init__(self, actionFn = None, numTraining=100, epsilon=0.5, alpha=0.5, gamma=1): 158 | """ 159 | actionFn: Function which takes a state and returns the list of legal actions 160 | 161 | alpha - learning rate 162 | epsilon - exploration rate 163 | gamma - discount factor 164 | numTraining - number of training episodes, i.e. 
no learning after these many episodes 165 | """ 166 | if actionFn == None: 167 | actionFn = lambda state: state.getLegalActions() 168 | self.actionFn = actionFn 169 | self.episodesSoFar = 0 170 | self.accumTrainRewards = 0.0 171 | self.accumTestRewards = 0.0 172 | self.numTraining = int(numTraining) 173 | self.epsilon = float(epsilon) 174 | self.alpha = float(alpha) 175 | self.discount = float(gamma) 176 | 177 | ################################ 178 | # Controls needed for Crawler # 179 | ################################ 180 | def setEpsilon(self, epsilon): 181 | self.epsilon = epsilon 182 | 183 | def setLearningRate(self, alpha): 184 | self.alpha = alpha 185 | 186 | def setDiscount(self, discount): 187 | self.discount = discount 188 | 189 | def doAction(self,state,action): 190 | """ 191 | Called by inherited class when 192 | an action is taken in a state 193 | """ 194 | self.lastState = state 195 | self.lastAction = action 196 | 197 | ################### 198 | # Pacman Specific # 199 | ################### 200 | def observationFunction(self, state): 201 | """ 202 | This is where we ended up after our last action. 203 | The simulation should somehow ensure this is called 204 | """ 205 | if not self.lastState is None: 206 | reward = state.getScore() - self.lastState.getScore() 207 | self.observeTransition(self.lastState, self.lastAction, state, reward) 208 | return state 209 | 210 | def registerInitialState(self, state): 211 | self.startEpisode() 212 | if self.episodesSoFar == 0: 213 | print 'Beginning %d episodes of Training' % (self.numTraining) 214 | 215 | def final(self, state): 216 | """ 217 | Called by Pacman game at the terminal state 218 | """ 219 | deltaReward = state.getScore() - self.lastState.getScore() 220 | self.observeTransition(self.lastState, self.lastAction, state, deltaReward) 221 | self.stopEpisode() 222 | 223 | # Make sure we have this var 224 | if not 'episodeStartTime' in self.__dict__: 225 | self.episodeStartTime = time.time() 226 | if not 'lastWindowAccumRewards' in self.__dict__: 227 | self.lastWindowAccumRewards = 0.0 228 | self.lastWindowAccumRewards += state.getScore() 229 | 230 | NUM_EPS_UPDATE = 100 231 | if self.episodesSoFar % NUM_EPS_UPDATE == 0: 232 | print 'Reinforcement Learning Status:' 233 | windowAvg = self.lastWindowAccumRewards / float(NUM_EPS_UPDATE) 234 | if self.episodesSoFar <= self.numTraining: 235 | trainAvg = self.accumTrainRewards / float(self.episodesSoFar) 236 | print '\tCompleted %d out of %d training episodes' % ( 237 | self.episodesSoFar,self.numTraining) 238 | print '\tAverage Rewards over all training: %.2f' % ( 239 | trainAvg) 240 | else: 241 | testAvg = float(self.accumTestRewards) / (self.episodesSoFar - self.numTraining) 242 | print '\tCompleted %d test episodes' % (self.episodesSoFar - self.numTraining) 243 | print '\tAverage Rewards over testing: %.2f' % testAvg 244 | print '\tAverage Rewards for last %d episodes: %.2f' % ( 245 | NUM_EPS_UPDATE,windowAvg) 246 | print '\tEpisode took %.2f seconds' % (time.time() - self.episodeStartTime) 247 | self.lastWindowAccumRewards = 0.0 248 | self.episodeStartTime = time.time() 249 | 250 | if self.episodesSoFar == self.numTraining: 251 | msg = 'Training Done (turning off epsilon and alpha)' 252 | print '%s\n%s' % (msg,'-' * len(msg)) 253 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/mdp.py: -------------------------------------------------------------------------------- 1 | # mdp.py 2 | # ------ 3 | # Licensing Information: 
Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | import random 10 | 11 | class MarkovDecisionProcess: 12 | 13 | def getStates(self): 14 | """ 15 | Return a list of all states in the MDP. 16 | Not generally possible for large MDPs. 17 | """ 18 | abstract 19 | 20 | def getStartState(self): 21 | """ 22 | Return the start state of the MDP. 23 | """ 24 | abstract 25 | 26 | def getPossibleActions(self, state): 27 | """ 28 | Return list of possible actions from 'state'. 29 | """ 30 | abstract 31 | 32 | def getTransitionStatesAndProbs(self, state, action): 33 | """ 34 | Returns list of (nextState, prob) pairs 35 | representing the states reachable 36 | from 'state' by taking 'action' along 37 | with their transition probabilities. 38 | 39 | Note that in Q-Learning and reinforcment 40 | learning in general, we do not know these 41 | probabilities nor do we directly model them. 42 | """ 43 | abstract 44 | 45 | def getReward(self, state, action, nextState): 46 | """ 47 | Get the reward for the state, action, nextState transition. 48 | 49 | Not available in reinforcement learning. 50 | """ 51 | abstract 52 | 53 | def isTerminal(self, state): 54 | """ 55 | Returns true if the current state is a terminal state. By convention, 56 | a terminal state has zero future rewards. Sometimes the terminal state(s) 57 | may have no possible actions. It is also common to think of the terminal 58 | state as having a self-loop action 'pass' with zero reward; the formulations 59 | are equivalent. 60 | """ 61 | abstract 62 | 63 | 64 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/pacmanAgents.py: -------------------------------------------------------------------------------- 1 | # pacmanAgents.py 2 | # --------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 
7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | from pacman import Directions 10 | from game import Agent 11 | import random 12 | import game 13 | import util 14 | 15 | class LeftTurnAgent(game.Agent): 16 | "An agent that turns left at every opportunity" 17 | 18 | def getAction(self, state): 19 | legal = state.getLegalPacmanActions() 20 | current = state.getPacmanState().configuration.direction 21 | if current == Directions.STOP: current = Directions.NORTH 22 | left = Directions.LEFT[current] 23 | if left in legal: return left 24 | if current in legal: return current 25 | if Directions.RIGHT[current] in legal: return Directions.RIGHT[current] 26 | if Directions.LEFT[left] in legal: return Directions.LEFT[left] 27 | return Directions.STOP 28 | 29 | class GreedyAgent(Agent): 30 | def __init__(self, evalFn="scoreEvaluation"): 31 | self.evaluationFunction = util.lookup(evalFn, globals()) 32 | assert self.evaluationFunction != None 33 | 34 | def getAction(self, state): 35 | # Generate candidate actions 36 | legal = state.getLegalPacmanActions() 37 | if Directions.STOP in legal: legal.remove(Directions.STOP) 38 | 39 | successors = [(state.generateSuccessor(0, action), action) for action in legal] 40 | scored = [(self.evaluationFunction(state), action) for state, action in successors] 41 | bestScore = max(scored)[0] 42 | bestActions = [pair[1] for pair in scored if pair[0] == bestScore] 43 | return random.choice(bestActions) 44 | 45 | def scoreEvaluation(state): 46 | return state.getScore() -------------------------------------------------------------------------------- /week3_model_free/seminar_main/qlearningAgents.py: -------------------------------------------------------------------------------- 1 | # qlearningAgents.py 2 | # ------------------ 3 | ## based on http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 4 | 5 | from game import * 6 | from learningAgents import ReinforcementAgent 7 | from featureExtractors import * 8 | 9 | import random,util,math 10 | from collections import defaultdict 11 | 12 | class QLearningAgent(ReinforcementAgent): 13 | """ 14 | Q-Learning Agent 15 | 16 | Instance variables you have access to 17 | - self.epsilon (exploration prob) 18 | - self.alpha (learning rate) 19 | - self.discount (discount rate aka gamma) 20 | 21 | Functions you should use 22 | - self.getLegalActions(state) 23 | which returns legal actions for a state 24 | - self.getQValue(state,action) 25 | which returns Q(state,action) 26 | - self.setQValue(state,action,value) 27 | which sets Q(state,action) := value 28 | 29 | !!!Important!!! 30 | NOTE: please avoid using self._qValues directly to make code cleaner 31 | """ 32 | def __init__(self, **args): 33 | "We initialize agent and Q-values here." 34 | ReinforcementAgent.__init__(self, **args) 35 | self._qValues = defaultdict(lambda:defaultdict(lambda:0)) 36 | 37 | 38 | def getQValue(self, state, action): 39 | """ 40 | Returns Q(state,action) 41 | """ 42 | return self._qValues[state][action] 43 | 44 | def setQValue(self,state,action,value): 45 | """ 46 | Sets the Qvalue for [state,action] to the given value 47 | """ 48 | self._qValues[state][action] = value 49 | 50 | #---------------------#start of your code#---------------------# 51 | 52 | def getValue(self, state): 53 | """ 54 | Returns max_action Q(state,action) 55 | where the max is over legal actions. 
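        A minimal sketch of the blank below (after the empty-action check):
            return max(self.getQValue(state, a) for a in possibleActions)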
56 | """ 57 | 58 | possibleActions = self.getLegalActions(state) 59 | #If there are no legal actions, return 0.0 60 | if len(possibleActions) == 0: 61 | return 0.0 62 | 63 | "*** YOUR CODE HERE ***" 64 | raise NotImplementedError 65 | 66 | return 0. 67 | 68 | def getPolicy(self, state): 69 | """ 70 | Compute the best action to take in a state. 71 | 72 | """ 73 | possibleActions = self.getLegalActions(state) 74 | 75 | #If there are no legal actions, return None 76 | if len(possibleActions) == 0: 77 | return None 78 | 79 | best_action = None 80 | 81 | "*** YOUR CODE HERE ***" 82 | raise NotImplementedError 83 | 84 | return best_action 85 | 86 | def getAction(self, state): 87 | """ 88 | Compute the action to take in the current state, including exploration. 89 | 90 | With probability self.epsilon, we should take a random action. 91 | otherwise - the best policy action (self.getPolicy). 92 | 93 | HINT: You might want to use util.flipCoin(prob) 94 | HINT: To pick randomly from a list, use random.choice(list) 95 | 96 | """ 97 | 98 | # Pick Action 99 | possibleActions = self.getLegalActions(state) 100 | action = None 101 | 102 | #If there are no legal actions, return None 103 | if len(possibleActions) == 0: 104 | return None 105 | 106 | #agent parameters: 107 | epsilon = self.epsilon 108 | 109 | "*** YOUR CODE HERE ***" 110 | raise NotImplementedError 111 | 112 | return action 113 | 114 | def update(self, state, action, nextState, reward): 115 | """ 116 | You should do your Q-Value update here 117 | 118 | NOTE: You should never call this function, 119 | it will be called on your behalf 120 | 121 | 122 | """ 123 | #agent parameters 124 | gamma = self.discount 125 | learning_rate = self.alpha 126 | 127 | "*** YOUR CODE HERE ***" 128 | raise NotImplementedError 129 | 130 | reference_qvalue = PleaseImplementMe 131 | updated_qvalue = PleaseImplementMe 132 | 133 | self.setQValue(PleaseImplementMe,PleaseImplementMe,updated_qvalue) 134 | 135 | 136 | #---------------------#end of your code#---------------------# 137 | 138 | 139 | 140 | class PacmanQAgent(QLearningAgent): 141 | "Exactly the same as QLearningAgent, but with different default parameters" 142 | 143 | def __init__(self, epsilon=0.05,gamma=0.8,alpha=0.2, numTraining=0, **args): 144 | """ 145 | These default parameters can be changed from the pacman.py command line. 146 | For example, to change the exploration rate, try: 147 | python pacman.py -p PacmanQLearningAgent -a epsilon=0.1 148 | 149 | alpha - learning rate 150 | epsilon - exploration rate 151 | gamma - discount factor 152 | numTraining - number of training episodes, i.e. no learning after these many episodes 153 | """ 154 | args['epsilon'] = epsilon 155 | args['gamma'] = gamma 156 | args['alpha'] = alpha 157 | args['numTraining'] = numTraining 158 | self.index = 0 # This is always Pacman 159 | QLearningAgent.__init__(self, **args) 160 | 161 | def getAction(self, state): 162 | """ 163 | Simply calls the getAction method of QLearningAgent and then 164 | informs parent of action for Pacman. Do not change or remove this 165 | method. 
166 | """ 167 | action = QLearningAgent.getAction(self,state) 168 | self.doAction(state,action) 169 | return action 170 | 171 | 172 | 173 | class ApproximateQAgent(PacmanQAgent): 174 | pass 175 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/run_crawler.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python crawler.py 3 | 4 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/run_grid.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python gridworld.py -a q -k 100 -n 0 -g BookGrid -e 0.5 3 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/run_pacman.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python pacman.py -p PacmanQAgent -x 1000 -n 1010 -l smallGrid 3 | 4 | # -x is the amount of training epochs, -n is the total amount of epochs. 5 | # hence, if you want to spend 1337 epochs training and then play 42 for evalution, you will need 6 | # python pacman.py -p PacmanQAgent -x 1337 -n 1379 -l smallGrid 7 | -------------------------------------------------------------------------------- /week3_model_free/seminar_main/textDisplay.py: -------------------------------------------------------------------------------- 1 | # textDisplay.py 2 | # -------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 
7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | import pacman, time 10 | 11 | DRAW_EVERY = 1 12 | SLEEP_TIME = 0 # This can be overwritten by __init__ 13 | DISPLAY_MOVES = False 14 | QUIET = False # Supresses output 15 | 16 | class NullGraphics: 17 | def initialize(self, state, isBlue = False): 18 | pass 19 | 20 | def update(self, state): 21 | pass 22 | 23 | def pause(self): 24 | time.sleep(SLEEP_TIME) 25 | 26 | def draw(self, state): 27 | print state 28 | 29 | def finish(self): 30 | pass 31 | 32 | class PacmanGraphics: 33 | def __init__(self, speed=None): 34 | if speed != None: 35 | global SLEEP_TIME 36 | SLEEP_TIME = speed 37 | 38 | def initialize(self, state, isBlue = False): 39 | self.draw(state) 40 | self.pause() 41 | self.turn = 0 42 | self.agentCounter = 0 43 | 44 | def update(self, state): 45 | numAgents = len(state.agentStates) 46 | self.agentCounter = (self.agentCounter + 1) % numAgents 47 | if self.agentCounter == 0: 48 | self.turn += 1 49 | if DISPLAY_MOVES: 50 | ghosts = [pacman.nearestPoint(state.getGhostPosition(i)) for i in range(1, numAgents)] 51 | print "%4d) P: %-8s" % (self.turn, str(pacman.nearestPoint(state.getPacmanPosition()))),'| Score: %-5d' % state.score,'| Ghosts:', ghosts 52 | if self.turn % DRAW_EVERY == 0: 53 | self.draw(state) 54 | self.pause() 55 | if state._win or state._lose: 56 | self.draw(state) 57 | 58 | def pause(self): 59 | time.sleep(SLEEP_TIME) 60 | 61 | def draw(self, state): 62 | print state 63 | 64 | def finish(self): 65 | pass 66 | -------------------------------------------------------------------------------- /week4_[recap]_deep_learning/README.md: -------------------------------------------------------------------------------- 1 | __Note:__ This week's materials cover the basics of neural nets and deep learning and teach you how to use auto-diff frameworks. If you're already fluent in tensorflow OR pytorch OR theano - feel free to skip this week entirely.. 2 | 3 | ## Materials 4 | * [__Lecture slides__](https://yadi.sk/i/yAO2AJ3M3EKP8g) 5 | 6 | - __In russian:__ 7 | * Basic lecture on deep learning - [video](https://yadi.sk/i/yyHZub6R3Ej5dV) 8 | * Deep learning frameworks - [video](https://yadi.sk/i/hDIkaR4H3EtnXM) 9 | * [Pytorch tutorial](https://yadi.sk/i/O3mQ76u43So3h9) __recommended__ 10 | * [Tensorflow tutorial](https://www.youtube.com/watch?v=FQ660T4uu7k) (english only for now. 
Links are welcome) 11 | * [Theano tutorial](https://yadi.sk/i/54STsEBVpubkn) 12 | 13 | - __In english:__ 14 | * Intro to neural nets and backprop (english) - [video](https://www.youtube.com/watch?v=uXt8qF2Zzfo) 15 | * Intro to convnets - [video](https://www.youtube.com/watch?v=FmpDIaiMIeA) 16 | * Deep learning frameworks - [video](https://www.youtube.com/watch?v=Vf_-OkqbwPo) 17 | * [Tensorflow tutorial](https://www.youtube.com/watch?v=FQ660T4uu7k) 18 | * [Theano tutorial](https://www.youtube.com/watch?v=OU8I1oJ9HhI) 19 | * [Pytorch tutorial](https://www.youtube.com/watch?v=VMcRWYEKmhw) 20 | 21 | ## Bonus materials 22 | * Karpathy's course on deep learning (english) - http://cs231n.github.io/ 23 | * A neat little play-ground where you can train small NNs and see what they actually learn - [playground](http://playground.tensorflow.org/) 24 | * Nuts and Bolts of deep learning by Andrew Ng (english) - [video](https://www.youtube.com/watch?v=F1ka6a13S9I) 25 | * Deep learning philosophy: [our humble take](https://www.youtube.com/watch?v=9qyE1Ev1Xdw) (english) 26 | * Deep learning demystified - [video](https://www.youtube.com/watch?v=Q9Z20HCPnww) 27 | * Karpathy's lecture on deep learning for computer vision - https://www.youtube.com/watch?v=u6aEYuemt0M 28 | * Our humble DL course: [HSE'fall17](https://github.com/yandexdataschool/HSE_deeplearning), [Skoltech/YSDA'spring16](https://github.com/ddtm/dl-course/) courses on deep learning (english). 29 | * Srsly, just google `"deep learning %s"%s for s in what_you_want_to_know`. 30 | 31 | 32 | ### Practice 33 | From now on, we'll have two tracks: theano and tensorflow. We'll also add pytorch seminars as soon as they're ready. 34 | 35 | Please pick seminar_theano.ipynb, seminar_tensorflow.ipynb or seminar_pytorch.ipynb. 36 | 37 | __Note:__ in this and all following weeks you're only required to get through practice in _one_ of the frameworks. Looking into other alternatives is great for self-education but never mandatory. 38 | 39 | #### What to choose? 40 | * The simplest choice is PyTorch: it's basically ye olde numpy with automatic gradients and a lot of pre-implemented DL stuff... except all the functions have different names. 41 | * If you want to be familiar with production-related stuff from day 1, choose TensorFlow. It's much more convenient to deploy (to non-python or to mobiles). The catch is that all those conveniences become inconveniences once you want to write something simple in jupyter. 42 | * Theano works like tensorflow but it offers a numpy-compatible interface and comes with built-in graph optimization. The payoff is that theano is not as popular as the first two. It is also not meant as a producton framework so deploying to mobiles may be a problem. 43 | 44 | * It's not like choosing house at Hogwarts, you'll be able to switch between frameworks easily once you master the underlying principles. 
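To make the "numpy with automatic gradients" point concrete, here is a tiny sketch (it assumes `numpy` and `torch` are installed and is not tied to any of the seminar notebooks):

```python
import numpy as np
import torch

# numpy: gradients are derived by hand
x_np = np.array([1.0, 2.0, 3.0])
loss_np = (x_np ** 2).sum()
grad_np = 2 * x_np                 # d(sum x^2)/dx, computed manually

# pytorch: same math, autograd fills in the gradient
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
loss = (x ** 2).sum()
loss.backward()                    # populates x.grad with d(loss)/dx
print(grad_np)                     # [2. 4. 6.]
print(x.grad.numpy())              # [2. 4. 6.]
```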
45 | 46 | -------------------------------------------------------------------------------- /week4_[recap]_deep_learning/fix_my_nn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from lasagne.layers import *\n", 12 | "from lasagne.nonlinearities import *\n", 13 | "from lasagne import init" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": { 20 | "collapsed": false 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "nn = InputLayer([None,3,100,100])\n", 25 | "\n", 26 | "nn = Conv2DLayer(nn,num_filters=512, filter_size=(3,3),\n", 27 | " W = init.Constant(0))\n", 28 | "\n", 29 | "nn = Conv2DLayer(nn,num_filters=128,filter_size=(3,3),\n", 30 | " W = init.Constant(0))\n", 31 | "\n", 32 | "nn = Conv2DLayer(nn,num_filters=32,filter_size=(3,3),\n", 33 | " W = init.Constant(0))\n", 34 | "\n", 35 | "nn = Pool2DLayer(nn,pool_size=(6,6),mode='max')\n", 36 | "\n", 37 | "nn = Conv2DLayer(nn,num_filters=8,filter_size=(10,10),\n", 38 | " W = init.Normal(std=0.01))\n", 39 | "\n", 40 | "nn = Conv2DLayer(nn,num_filters=8,filter_size=(10,10),\n", 41 | " W = init.Normal(std=0.01))\n", 42 | "\n", 43 | "nn = Pool2DLayer(nn,pool_size=(3,3),mode='max')\n", 44 | "\n", 45 | "nn = DenseLayer(nn,512,nonlinearity=softmax)\n", 46 | "\n", 47 | "nn = DropoutLayer(nn,p=0.5)\n", 48 | "\n", 49 | "nn = DenseLayer(nn,512,nonlinearity=softmax)\n", 50 | "\n", 51 | "nn = DenseLayer(nn,10,nonlinearity=sigmoid)\n", 52 | "\n", 53 | "nn = DropoutLayer(nn,p=0.5)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "```\n", 61 | "\n", 62 | "```\n", 63 | "\n", 64 | "```\n", 65 | "\n", 66 | "```\n", 67 | "\n", 68 | "```\n", 69 | "\n", 70 | "```\n", 71 | "\n", 72 | "```\n", 73 | "\n", 74 | "```\n", 75 | "\n", 76 | "```\n", 77 | "\n", 78 | "```\n", 79 | "\n", 80 | "```\n", 81 | "\n", 82 | "```\n", 83 | "\n", 84 | "```\n", 85 | "\n", 86 | "```\n", 87 | "\n", 88 | "```\n", 89 | "\n", 90 | "```\n", 91 | "\n", 92 | "```\n", 93 | "\n", 94 | "```\n", 95 | "\n", 96 | "```\n", 97 | "\n", 98 | "```\n", 99 | "\n", 100 | "```\n", 101 | "\n", 102 | "```\n", 103 | "\n", 104 | "```\n", 105 | "\n", 106 | "```\n", 107 | "\n", 108 | "```\n", 109 | "\n", 110 | "```\n", 111 | "\n", 112 | "```\n", 113 | "\n", 114 | "```\n", 115 | "\n", 116 | "```\n", 117 | "\n", 118 | "```\n", 119 | "\n", 120 | "\n", 121 | "# Book of grudges\n", 122 | "* zero init for weights will cause symmetry effect\n", 123 | "* Too many filters for first 3x3 convolution - will lead to enormous matrix while there's just not enough relevant combinations of 3x3 images (overkill).\n", 124 | "* Usually the further you go, the more filters you need.\n", 125 | "* large filters (10x10 is generally a bad pactice, and you definitely need more than 10 of them\n", 126 | "* the second of 10x10 convolution gets 8x6x6 image as input, so it's technically unable to perform such convolution.\n", 127 | "* Softmax nonlinearity effectively makes only 1 or a few neurons from the entire layer to \"fire\", rendering 512-neuron layer almost useless. Softmax at the output layer is okay though\n", 128 | "* Dropout after probability prediciton is just lame. A few random classes get probability of 0, so your probabilities no longer sum to 1 and crossentropy goes -inf." 
129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "collapsed": true 136 | }, 137 | "outputs": [], 138 | "source": [] 139 | } 140 | ], 141 | "metadata": { 142 | "kernelspec": { 143 | "display_name": "Python [Root]", 144 | "language": "python", 145 | "name": "Python [Root]" 146 | }, 147 | "language_info": { 148 | "codemirror_mode": { 149 | "name": "ipython", 150 | "version": 2 151 | }, 152 | "file_extension": ".py", 153 | "mimetype": "text/x-python", 154 | "name": "python", 155 | "nbconvert_exporter": "python", 156 | "pygments_lexer": "ipython2", 157 | "version": "2.7.12" 158 | } 159 | }, 160 | "nbformat": 4, 161 | "nbformat_minor": 0 162 | } 163 | -------------------------------------------------------------------------------- /week4_[recap]_deep_learning/mnist.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import time 4 | 5 | import numpy as np 6 | 7 | __doc__="""taken from https://github.com/Lasagne/Lasagne/blob/master/examples/mnist.py""" 8 | 9 | def load_dataset(): 10 | # We first define a download function, supporting both Python 2 and 3. 11 | if sys.version_info[0] == 2: 12 | from urllib import urlretrieve 13 | else: 14 | from urllib.request import urlretrieve 15 | 16 | def download(filename, source='http://yann.lecun.com/exdb/mnist/'): 17 | print("Downloading %s" % filename) 18 | urlretrieve(source + filename, filename) 19 | 20 | # We then define functions for loading MNIST images and labels. 21 | # For convenience, they also download the requested files if needed. 22 | import gzip 23 | 24 | def load_mnist_images(filename): 25 | if not os.path.exists(filename): 26 | download(filename) 27 | # Read the inputs in Yann LeCun's binary format. 28 | with gzip.open(filename, 'rb') as f: 29 | data = np.frombuffer(f.read(), np.uint8, offset=16) 30 | # The inputs are vectors now, we reshape them to monochrome 2D images, 31 | # following the shape convention: (examples, channels, rows, columns) 32 | data = data.reshape(-1, 1, 28, 28) 33 | # The inputs come as bytes, we convert them to float32 in range [0,1]. 34 | # (Actually to range [0, 255/256], for compatibility to the version 35 | # provided at http://deeplearning.net/data/mnist/mnist.pkl.gz.) 36 | return data / np.float32(256) 37 | 38 | def load_mnist_labels(filename): 39 | if not os.path.exists(filename): 40 | download(filename) 41 | # Read the labels in Yann LeCun's binary format. 42 | with gzip.open(filename, 'rb') as f: 43 | data = np.frombuffer(f.read(), np.uint8, offset=8) 44 | # The labels are vectors of integers now, that's exactly what we want. 45 | return data 46 | 47 | # We can now download and read the training and test set images and labels. 48 | X_train = load_mnist_images('train-images-idx3-ubyte.gz') 49 | y_train = load_mnist_labels('train-labels-idx1-ubyte.gz') 50 | X_test = load_mnist_images('t10k-images-idx3-ubyte.gz') 51 | y_test = load_mnist_labels('t10k-labels-idx1-ubyte.gz') 52 | 53 | # We reserve the last 10000 training examples for validation. 54 | X_train, X_val = X_train[:-10000], X_train[-10000:] 55 | y_train, y_val = y_train[:-10000], y_train[-10000:] 56 | 57 | # We just return all the arrays in order, as expected in main(). 58 | # (It doesn't matter how we do this as long as we can read them again.) 
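    # Returned arrays: X_* are float32 of shape (n_examples, 1, 28, 28) with values in [0, 1),
    # y_* are integer class labels 0..9 of shape (n_examples,);
    # the split is 50000 train / 10000 validation / 10000 test examples.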
59 | return X_train, y_train, X_val, y_val, X_test, y_test 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /week4_[recap]_deep_learning/notmnist.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from scipy.misc import imread,imresize 4 | from sklearn.model_selection import train_test_split 5 | from glob import glob 6 | 7 | def load_notmnist(path='./notMNIST_small',letters='ABCDEFGHIJ', 8 | img_shape=(28,28),test_size=0.25,one_hot=False): 9 | 10 | # download data if it's missing. If you have any problems, go to the urls and load it manually. 11 | if not os.path.exists(path): 12 | print("Downloading data...") 13 | assert os.system('curl http://yaroslavvb.com/upload/notMNIST/notMNIST_small.tar.gz > notMNIST_small.tar.gz') == 0 14 | print("Extracting ...") 15 | assert os.system('tar -zxvf notMNIST_small.tar.gz > untar_notmnist.log') == 0 16 | 17 | data,labels = [],[] 18 | print("Parsing...") 19 | for img_path in glob(os.path.join(path,'*/*')): 20 | class_i = img_path.split(os.sep)[-2] 21 | if class_i not in letters: continue 22 | try: 23 | data.append(imresize(imread(img_path), img_shape)) 24 | labels.append(class_i,) 25 | except: 26 | print("found broken img: %s [it's ok if <10 images are broken]" % img_path) 27 | 28 | data = np.stack(data)[:,None].astype('float32') 29 | data = (data - np.mean(data)) / np.std(data) 30 | 31 | #convert classes to ints 32 | letter_to_i = {l:i for i,l in enumerate(letters)} 33 | labels = np.array(list(map(letter_to_i.get, labels))) 34 | 35 | if one_hot: 36 | labels = (np.arange(np.max(labels) + 1)[None,:] == labels[:, None]).astype('float32') 37 | 38 | #split into train/test 39 | X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=test_size, random_state=42) 40 | 41 | print("Done") 42 | return X_train, y_train, X_test, y_test 43 | 44 | -------------------------------------------------------------------------------- /week4_approx_rl/README.md: -------------------------------------------------------------------------------- 1 | ## Materials 2 | * [__lecture slides I__](https://yadi.sk/i/kGPiXpse3NR3n8), [__slides II__](https://yadi.sk/i/H07O_XEh3NR3oV) 3 | * Our [lecture](https://yadi.sk/i/AHDU2p_j3FT3nr), [second lecture](https://yadi.sk/i/yBO0q4mI3GAxYd), [seminar](https://yadi.sk/i/EeUeheri3FT3ra) (russian) 4 | 5 | 6 | * David Silver lecture - [video](https://www.youtube.com/watch?v=UoPei5o4fps) 7 | * More practical and less theoretical lecture from MIT 6.S191 - [video](https://www.youtube.com/watch?v=xWe58WGWmlk) 8 | * Understanding approximate q-learning - [url](https://danieltakeshi.github.io/2016/10/31/going-deeper-into-reinforcement-learning-understanding-q-learning-and-linear-function-approximation/) 9 | * Karpathy's post on approximate RL - [url](http://karpathy.github.io/2016/05/31/rl/) 10 | 11 | ## More materials 12 | * __[recommended]__ How to _actually_ do deep reinforcement learning by J. Schulman - [pdf](http://rll.berkeley.edu/deeprlcourse/docs/nuts-and-bolts.pdf) 13 | * __[recommended]__ An overview of deep reinforcement learning - [arxiv](https://arxiv.org/pdf/1701.07274v1.pdf) 14 | * DQN and modiffications - lecture by J. 
Schulman - [video](https://www.youtube.com/watch?v=h1-pj4Y9-kM) 15 | * * interactive demos in your browser: [demo1](http://cs.stanford.edu/people/karpathy/convnetjs/demo/rldemo.html)(karpathy), [demo2](http://janhuenermann.com/projects/learning-to-drive)(Hünermann) 16 | * Reinforcement learning architectures list - [repo](https://github.com/5vision/deep-reinforcement-learning-networks) 17 | * Article on dueling DQN - [arxiv](https://arxiv.org/pdf/1511.06581.pdf) 18 | * Article on double DQN - [arxiv](https://arxiv.org/abs/1509.06461) 19 | * Article on prioritized experience replay - [arxiv](https://arxiv.org/abs/1511.05952) 20 | * Article on bootstrap DQN - [pdf](https://papers.nips.cc/paper/6501-deep-exploration-via-bootstrapped-dqn.pdf), [summary](http://pemami4911.github.io/paper-summaries/2016/08/16/Deep-exploration.html) 21 | * Article on asynchronuous methods in deep RL - [arxiv](https://arxiv.org/abs/1602.01783) 22 | * Successor representations for reinforcement learning - [article](https://arxiv.org/abs/1606.02396), [video](https://www.youtube.com/watch?v=kNqXCn7K-BM&feature=youtu.be) 23 | * Video on asynchronuous methods (Mnih) - [video](https://www.youtube.com/watch?v=9sx1_u2qVhQ) 24 | 25 | ## DQN tutorials 26 | * [in pytorch] A great series starting from simple DQN to all the cool new stuff - [url](https://github.com/higgsfield/RL-Adventure) 27 | * A guide to deep RL from ~scratch (nervana blog) - [url](https://www.nervanasys.com/demystifying-deep-reinforcement-learning/) 28 | * Building deep q-network from ~scratch (blog) - [url](https://jaromiru.com/2016/09/27/lets-make-a-dqn-theory/) 29 | * Another guide guide to DQN from ~scratch (blog) - [url](https://rubenfiszel.github.io/posts/rl4j/2016-08-24-Reinforcement-Learning-and-DQN.html) 30 | 31 | 32 | ## Practice 33 | 34 | From now on, we have two tracks, theano and tensorflow. We'll also add pytorch support soon. 35 | 36 | You can choose whichever track you want, but unless you're expertly familiar with your framework, we recommend you to start by completing the task in lasagne and only then reproduce your solution in your chosen framework. 37 | 38 | Begin with `seminar_.ipynb` and then proceed with `homework_.ipynb`. 39 | 40 | __Note: you're not required to submit assignments in all three frameworks. Pick one and go with it. Maybe switch it occasionally if you want more challenge. 
__ 41 | -------------------------------------------------------------------------------- /week4_approx_rl/framebuffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym.spaces.box import Box 3 | from gym.core import Wrapper 4 | class FrameBuffer(Wrapper): 5 | def __init__(self, env, n_frames=4, dim_order='tensorflow'): 6 | """A gym wrapper that reshapes, crops and scales image into the desired shapes""" 7 | super(FrameBuffer, self).__init__(env) 8 | self.dim_order = dim_order 9 | if dim_order == 'tensorflow': 10 | height, width, n_channels = env.observation_space.shape 11 | obs_shape = [height, width, n_channels * n_frames] 12 | elif dim_order == 'pytorch': 13 | n_channels, height, width = env.observation_space.shape 14 | obs_shape = [n_channels * n_frames, height, width] 15 | else: 16 | raise ValueError('dim_order should be "tensorflow" or "pytorch", got {}'.format(dim_order)) 17 | self.observation_space = Box(0.0, 1.0, obs_shape) 18 | self.framebuffer = np.zeros(obs_shape, 'float32') 19 | 20 | def reset(self): 21 | """resets breakout, returns initial frames""" 22 | self.framebuffer = np.zeros_like(self.framebuffer) 23 | self.update_buffer(self.env.reset()) 24 | return self.framebuffer 25 | 26 | def step(self, action): 27 | """plays breakout for 1 step, returns frame buffer""" 28 | new_img, reward, done, info = self.env.step(action) 29 | self.update_buffer(new_img) 30 | return self.framebuffer, reward, done, info 31 | 32 | def update_buffer(self, img): 33 | if self.dim_order == 'tensorflow': 34 | offset = self.env.observation_space.shape[-1] 35 | axis = -1 36 | cropped_framebuffer = self.framebuffer[:,:,:-offset] 37 | elif self.dim_order == 'pytorch': 38 | offset = self.env.observation_space.shape[0] 39 | axis = 0 40 | cropped_framebuffer = self.framebuffer[:-offset] 41 | self.framebuffer = np.concatenate([img, cropped_framebuffer], axis = axis) 42 | -------------------------------------------------------------------------------- /week4_approx_rl/replay_buffer.py: -------------------------------------------------------------------------------- 1 | # This code is shamelessly stolen from https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py 2 | import numpy as np 3 | import random 4 | 5 | class ReplayBuffer(object): 6 | def __init__(self, size): 7 | """Create Replay buffer. 8 | Parameters 9 | ---------- 10 | size: int 11 | Max number of transitions to store in the buffer. When the buffer 12 | overflows the old memories are dropped. 
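        Example (illustrative sketch)
        -----------------------------
        buf = ReplayBuffer(10 ** 4)
        buf.add(obs_t, action, reward, obs_tp1, done)
        obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = buf.sample(32)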
13 | """ 14 | self._storage = [] 15 | self._maxsize = size 16 | self._next_idx = 0 17 | 18 | def __len__(self): 19 | return len(self._storage) 20 | 21 | def add(self, obs_t, action, reward, obs_tp1, done): 22 | data = (obs_t, action, reward, obs_tp1, done) 23 | 24 | if self._next_idx >= len(self._storage): 25 | self._storage.append(data) 26 | else: 27 | self._storage[self._next_idx] = data 28 | self._next_idx = (self._next_idx + 1) % self._maxsize 29 | 30 | def _encode_sample(self, idxes): 31 | obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], [] 32 | for i in idxes: 33 | data = self._storage[i] 34 | obs_t, action, reward, obs_tp1, done = data 35 | obses_t.append(np.array(obs_t, copy=False)) 36 | actions.append(np.array(action, copy=False)) 37 | rewards.append(reward) 38 | obses_tp1.append(np.array(obs_tp1, copy=False)) 39 | dones.append(done) 40 | return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones) 41 | 42 | def sample(self, batch_size): 43 | """Sample a batch of experiences. 44 | Parameters 45 | ---------- 46 | batch_size: int 47 | How many transitions to sample. 48 | Returns 49 | ------- 50 | obs_batch: np.array 51 | batch of observations 52 | act_batch: np.array 53 | batch of actions executed given obs_batch 54 | rew_batch: np.array 55 | rewards received as results of executing act_batch 56 | next_obs_batch: np.array 57 | next set of observations seen after executing act_batch 58 | done_mask: np.array 59 | done_mask[i] = 1 if executing act_batch[i] resulted in 60 | the end of an episode and 0 otherwise. 61 | """ 62 | idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)] 63 | return self._encode_sample(idxes) 64 | -------------------------------------------------------------------------------- /week4_approx_rl/seminar_lasagne.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Approximate q-learning\n", 8 | "\n", 9 | "In this notebook you will teach a lasagne neural network to do Q-learning." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "__Frameworks__ - we'll accept this homework in any deep learning framework. For example, it translates to TensorFlow almost line-to-line. However, we recommend you to stick to theano/lasagne unless you're certain about your skills in the framework of your choice." 
17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": { 23 | "collapsed": true 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "%env THEANO_FLAGS='floatX=float32'\n", 28 | "import os\n", 29 | "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\"))==0:\n", 30 | " !bash ../xvfb start\n", 31 | " %env DISPLAY=:1" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": { 38 | "collapsed": true 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "import gym\n", 43 | "import numpy as np, pandas as pd\n", 44 | "import matplotlib.pyplot as plt\n", 45 | "%matplotlib inline" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": { 52 | "collapsed": true, 53 | "scrolled": false 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "env = gym.make(\"CartPole-v0\").env\n", 58 | "env.reset()\n", 59 | "n_actions = env.action_space.n\n", 60 | "state_dim = env.observation_space.shape\n", 61 | "\n", 62 | "plt.imshow(env.render(\"rgb_array\"))" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "# Approximate (deep) Q-learning: building the network\n", 70 | "\n", 71 | "In this section we will build and train naive Q-learning with theano/lasagne" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "First step is initializing input variables" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": { 85 | "collapsed": true 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "import theano\n", 90 | "import theano.tensor as T\n", 91 | "\n", 92 | "#create input variables. We'll support multiple states at once\n", 93 | "\n", 94 | "\n", 95 | "current_states = T.matrix(\"states[batch,units]\")\n", 96 | "actions = T.ivector(\"action_ids[batch]\")\n", 97 | "rewards = T.vector(\"rewards[batch]\")\n", 98 | "next_states = T.matrix(\"next states[batch,units]\")\n", 99 | "is_end = T.ivector(\"vector[batch] where 1 means that session just ended\")" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": { 106 | "collapsed": true 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "import lasagne\n", 111 | "from lasagne.layers import *\n", 112 | "\n", 113 | "#input layer\n", 114 | "l_states = InputLayer((None,)+state_dim)\n", 115 | "\n", 116 | "\n", 117 | "\n", 118 | "\n", 119 | "\n", 120 | "#output layer\n", 121 | "l_qvalues = DenseLayer(,num_units=n_actions,nonlinearity=None)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "#### Predicting Q-values for `current_states`" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "collapsed": true 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "#get q-values for ALL actions in current_states\n", 140 | "predicted_qvalues = get_output(l_qvalues,{l_states:current_states})" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": { 147 | "collapsed": true 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "#compiling agent's \"GetQValues\" function\n", 152 | "get_qvalues = " 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": { 159 | "collapsed": true 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "#select q-values for chosen actions\n", 164 | 
"predicted_qvalues_for_actions = predicted_qvalues[T.arange(actions.shape[0]),actions]" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "#### Loss function and `update`\n", 172 | "Here we write a function similar to `agent.update`." 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": { 179 | "collapsed": true 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "#predict q-values for next states\n", 184 | "predicted_next_qvalues = get_output(l_qvalues,{l_states:})\n", 185 | "\n", 186 | "\n", 187 | "#Computing target q-values under \n", 188 | "gamma = 0.99\n", 189 | "target_qvalues_for_actions = \n", 190 | "\n", 191 | "#zero-out q-values at the end\n", 192 | "target_qvalues_for_actions = (1-is_end)*target_qvalues_for_actions\n", 193 | "\n", 194 | "#don't compute gradient over target q-values (consider constant)\n", 195 | "target_qvalues_for_actions = theano.gradient.disconnected_grad(target_qvalues_for_actions)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": { 202 | "collapsed": true 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "\n", 207 | "#mean squared error loss function\n", 208 | "loss = \n" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": { 215 | "collapsed": true 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "#all network weights\n", 220 | "all_weights = get_all_params(l_qvalues,trainable=True)\n", 221 | "\n", 222 | "#network updates. Note the small learning rate (for stability)\n", 223 | "updates = lasagne.updates.sgd(loss,all_weights,learning_rate=1e-4)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": { 230 | "collapsed": true 231 | }, 232 | "outputs": [], 233 | "source": [ 234 | "#Training function that resembles agent.update(state,action,reward,next_state) \n", 235 | "#with 1 more argument meaning is_end\n", 236 | "train_step = theano.function([current_states,actions,rewards,next_states,is_end],\n", 237 | " updates=updates)" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "### Playing the game" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": { 251 | "collapsed": true 252 | }, 253 | "outputs": [], 254 | "source": [ 255 | "epsilon = 0.25 #initial epsilon\n", 256 | "\n", 257 | "def generate_session(t_max=1000):\n", 258 | " \"\"\"play env with approximate q-learning agent and train it at the same time\"\"\"\n", 259 | " \n", 260 | " total_reward = 0\n", 261 | " s = env.reset()\n", 262 | " \n", 263 | " for t in range(t_max):\n", 264 | " \n", 265 | " #get action q-values from the network\n", 266 | " q_values = get_qvalues([s])[0] \n", 267 | " \n", 268 | " a = \n", 269 | " \n", 270 | " new_s,r,done,info = env.step(a)\n", 271 | " \n", 272 | " #train agent one step. 
Note that we use one-element arrays instead of scalars \n", 273 | " #because that's what function accepts.\n", 274 | " train_step([s],[a],[r],[new_s],[done])\n", 275 | " \n", 276 | " total_reward+=r\n", 277 | " \n", 278 | " s = new_s\n", 279 | " if done: break\n", 280 | " \n", 281 | " return total_reward\n", 282 | " " 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": { 289 | "collapsed": true 290 | }, 291 | "outputs": [], 292 | "source": [ 293 | "for i in range(100):\n", 294 | " \n", 295 | " rewards = [generate_session() for _ in range(100)] #generate new sessions\n", 296 | " \n", 297 | " epsilon*=0.95\n", 298 | " \n", 299 | " print (\"mean reward:%.3f\\tepsilon:%.5f\"%(np.mean(rewards),epsilon))\n", 300 | "\n", 301 | " if np.mean(rewards) > 300:\n", 302 | " print (\"You Win!\")\n", 303 | " break\n", 304 | " \n", 305 | " assert epsilon!=0, \"Please explore environment\"" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "### Video" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": { 319 | "collapsed": true 320 | }, 321 | "outputs": [], 322 | "source": [ 323 | "epsilon=0 #Don't forget to reset epsilon back to initial value if you want to go on training" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": { 330 | "collapsed": true 331 | }, 332 | "outputs": [], 333 | "source": [ 334 | "#record sessions\n", 335 | "import gym.wrappers\n", 336 | "\n", 337 | "env = gym.wrappers.Monitor(gym.make(\"CartPole-v0\"),directory=\"videos\",force=True)\n", 338 | "sessions = [generate_session() for _ in range(100)]\n", 339 | "env.close()" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": { 346 | "collapsed": true 347 | }, 348 | "outputs": [], 349 | "source": [ 350 | "#show video\n", 351 | "from IPython.display import HTML\n", 352 | "import os\n", 353 | "\n", 354 | "video_names = list(filter(lambda s:s.endswith(\".mp4\"),os.listdir(\"./videos/\")))\n", 355 | "\n", 356 | "HTML(\"\"\"\n", 357 | "\n", 360 | "\"\"\".format(\"./videos/\"+video_names[-1])) #this may or may not be _last_ video. Try other indices" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "metadata": { 367 | "collapsed": true 368 | }, 369 | "outputs": [], 370 | "source": [] 371 | } 372 | ], 373 | "metadata": { 374 | "kernelspec": { 375 | "display_name": "Python 3", 376 | "language": "python", 377 | "name": "python3" 378 | }, 379 | "language_info": { 380 | "codemirror_mode": { 381 | "name": "ipython", 382 | "version": 3 383 | }, 384 | "file_extension": ".py", 385 | "mimetype": "text/x-python", 386 | "name": "python", 387 | "nbconvert_exporter": "python", 388 | "pygments_lexer": "ipython3", 389 | "version": "3.6.2" 390 | } 391 | }, 392 | "nbformat": 4, 393 | "nbformat_minor": 1 394 | } 395 | -------------------------------------------------------------------------------- /week5_explore/README.md: -------------------------------------------------------------------------------- 1 | ### Slides - [here](https://yadi.sk/i/H0zVBROe3TWWHz) 2 | 3 | ## Exploration and exploitation 4 | * [__main__] David Silver lecture on exploration and expoitation - [video](https://www.youtube.com/watch?v=sGuiWX07sKw) 5 | * Alternative lecture by J. Schulman - [video](https://www.youtube.com/watch?v=SfCa1HQMkuw) 6 | * Alternative lecture by N. 
de Freitas (with bayesian opt) - [video](https://www.youtube.com/watch?v=vz3D36VXefI) 7 | * Our lectures (russian) 8 | - "mathematical" lecture (by Alexander Vorobev) '17 - [slides](https://yadi.sk/i/JAeItALT3JmvCL), [video](https://yadi.sk/i/bVHmu9gt3Hi9Ym) 9 | - "engineering" lecture '18 - [video](https://yadi.sk/i/_myWJ13O3TdzXo) 10 | 11 | 12 | 13 | ## More materials 14 | * Gittins Index - the less heuristical approach to bandit exploration - [article](http://www.ece.mcgill.ca/~amahaj1/projects/bandits/book/2013-bandit-computations.pdf) 15 | * "Deep" version: variational information maximizing exploration - [video](https://www.youtube.com/watch?v=sRIjxxjVrnY) 16 | * Same topics in russian - [video](https://yadi.sk/i/_2_0yqeW3HDbcn) 17 | * Lecture covering intrinsically motivated reinforcement learning - [video](https://www.youtube.com/watch?v=aJI_9SoBDaQ) 18 | * [Slides](https://yadi.sk/i/8sx42nau3HEYKg) 19 | * Same topics in russian - [video](https://www.youtube.com/watch?v=WCE9hhPbCmc) 20 | * Note: UCB-1 is not for bernoulli rewards, but for arbitrary r in [0,1], so you can just scale any reward to [0,1] to obtain a peace of mind. It's derived directly from Hoeffding's inequality. 21 | 22 | ## Seminar 23 | In this seminar, you'll be solvilg basic and contextual bandits with uncertainty-based exploration like Bayesian UCB and Thompson Sampling. 24 | 25 | You will also need Bayesian Neural Networks. You will need theano/lasagne for this one: 26 | ``` 27 | # either 28 | conda install Theano 29 | # or 30 | pip install --upgrade https://github.com/Theano/Theano/archive/master.zip 31 | # and then lasagne 32 | pip install --upgrade https://github.com/Lasagne/Lasagne/archive/master.zip 33 | ``` 34 | 35 | Everything else is in the notebook :) 36 | -------------------------------------------------------------------------------- /week5_explore/action_rewards.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/week5_explore/action_rewards.npy -------------------------------------------------------------------------------- /week5_explore/all_states.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/week5_explore/all_states.npy -------------------------------------------------------------------------------- /week5_explore/bayes.py: -------------------------------------------------------------------------------- 1 | """ 2 | A single-file module that makes your lasagne network into a bayesian neural net. 3 | Originally created by github.com/ferrine , rewritten by github.com/justheuristic for simplicity 4 | 5 | See example in the notebook 6 | """ 7 | 8 | import numpy as np 9 | 10 | from theano import tensor as T 11 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 12 | 13 | import lasagne 14 | from lasagne import init 15 | from lasagne.random import get_rng 16 | 17 | from functools import wraps 18 | 19 | __all__ = ['NormalApproximation','get_var_cost','bbpwrap'] 20 | 21 | 22 | 23 | class NormalApproximation(object): 24 | def __init__(self, mu=0, std=np.exp(-3),seed=None): 25 | """ 26 | Approximation that samples network weights from factorized normal distribution. 
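        (Bayes-by-backprop style: each weight tensor gets a learned mean `mu` and a raw scale `rho`,
        the posterior std is log(1 + exp(rho)), and weights are sampled via the reparameterization trick.)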
27 | 28 | :param mu: prior mean for gaussian weights 29 | :param std: prior std for gaussian weights 30 | :param seed: random seed 31 | """ 32 | self.prior_mu = mu 33 | self.prior_std = std 34 | self.srng = RandomStreams(seed or get_rng().randint(1, 2147462579)) 35 | 36 | def log_normal(self,x, mean, std, eps=0.0): 37 | """computes log-proba of normal distribution""" 38 | std += eps 39 | return - 0.5 * np.log(2 * np.pi) - T.log(T.abs_(std)) - (x - mean) ** 2 / (2 * std ** 2) 40 | 41 | def log_prior(self, weights): 42 | """ 43 | Logarithm of prior probabilities for weights: 44 | log P(weights) aka log P(theta) 45 | """ 46 | return self.log_normal(weights, self.prior_mu, self.prior_std) 47 | 48 | def log_posterior_approx(self,weights, mean, rho): 49 | """ 50 | Logarithm of ELBO on posterior probabilities: 51 | log q(weights|learned mu and rho) aka log q(theta|x) 52 | """ 53 | std = T.log1p(T.exp(rho)) #rho to std 54 | return self.log_normal(weights, mean, std) 55 | 56 | def __call__(self, layer, spec, shape, name=None, **tags): 57 | # case when user uses default init specs 58 | assert tags.get('variational',False) == True, "Please declare param as variational to avoid confusion" 59 | 60 | if not isinstance(spec, dict): 61 | initial_rho = np.log(np.expm1(self.prior_std)) #std to rho 62 | assert np.isfinite(initial_rho),"too small std to initialize correctly. Please pass explicit"\ 63 | " initializer (dict with {'mu':mu_init, 'rho':rho_init})." 64 | spec = {'mu': spec,'rho':init.Constant(initial_rho)} 65 | 66 | 67 | mu_spec,rho_spec = spec['mu'],spec['rho'] 68 | 69 | rho = layer.add_param(rho_spec, shape,name=(name or 'unk')+'.rho', **tags) 70 | mean = layer.add_param(mu_spec, shape,name=(name or 'unk')+'.mu', **tags) 71 | 72 | #Reparameterization trick 73 | e = self.srng.normal(shape, std=1) 74 | W = mean + T.log1p(T.exp(rho)) * e 75 | 76 | #KL divergence KL(q,p) = E_(w~q(w|x)) [log q(w|x) - log P(w)] aka variational cost 77 | q_p = T.sum(self.log_posterior_approx(W, mean, rho) - self.log_prior(W)) 78 | 79 | #accumulate variational cost 80 | layer._bbwrap_var_cost += q_p 81 | return W 82 | 83 | 84 | 85 | def get_var_cost(layer_or_layers,treat_as_input=None): 86 | """ 87 | Returns total variational cost aka KL(q(theta|x)||p(theta)) for all layers in the network 88 | 89 | :param layer_or_layers: top layer(s) of your network, just like with lasagne.layers.get_output 90 | :param treat_as_input: don't accumulate over layers below these layers. 
See same param for lasagne.layers.get_all_layers 91 | 92 | Alternatively, one can manually get weights for one layer via layer.get_var_cost() 93 | """ 94 | cost = 0 95 | for layer in lasagne.layers.get_all_layers(layer_or_layers,treat_as_input): 96 | if hasattr(layer, 'get_var_cost'): #if layer is bayesian or pretends so 97 | cost += layer.get_var_cost() 98 | return cost 99 | 100 | def bbpwrap(approximation=NormalApproximation()): 101 | """ 102 | A decorator that makes arbitrary lasagne layer into a bayesian network layer: 103 | BayesDenseLayer = bbwrap()(DenseLayer) 104 | or more verbosely, 105 | @bbpwrap(NormalApproximation(pstd=0.01)) 106 | BayesDenseLayer(DenseLayer): 107 | pass 108 | 109 | """ 110 | 111 | def decorator(cls): 112 | def add_param_wrap(add_param): 113 | @wraps(add_param) 114 | def wrapped(self, spec, shape, name=None, **tags): 115 | # we should take care about some user specification 116 | # to avoid bbp hook just set tags['variational'] = True 117 | if not tags.get('trainable', True) or tags.get('variational', False): 118 | return add_param(self, spec, shape, name, **tags) 119 | else: 120 | # we declare that params we add next 121 | # are the ones we need to fit the distribution 122 | # they don't need to be regularized, strictly 123 | tags['variational'] = True 124 | tags['regularizable'] = False 125 | param = self.approximation(self, spec, shape, name, **tags) 126 | return param 127 | return wrapped 128 | 129 | def get_var_cost(self): 130 | """ 131 | Returns total variational cost aka KL(q(theta|x)||p(theta)) for this layer. 132 | Alternatively, use function get_var_cost(layer) to get total cost for all layers below this one. 133 | """ 134 | return self._bbwrap_var_cost 135 | 136 | 137 | cls.approximation = approximation 138 | cls._bbwrap_var_cost=0 139 | cls.add_param = add_param_wrap(cls.add_param) 140 | cls.get_var_cost = get_var_cost 141 | return cls 142 | 143 | 144 | return decorator 145 | -------------------------------------------------------------------------------- /week5_explore/bnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/week5_explore/bnn.png -------------------------------------------------------------------------------- /week5_explore/river_swim.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/week5_explore/river_swim.png -------------------------------------------------------------------------------- /week6_policy_based/README.md: -------------------------------------------------------------------------------- 1 | ## Materials 2 | * [Slides](https://docviewer.yandex.ru/?url=ya-disk-public%3A%2F%2FG3IXcG62RwNUGSSos%2BuGhtgXNfsBjP9RxUtUfgCffIk%3D%3A%2Flecture6.pdf&name=lecture6.pdf&c=58c876c4863a) 3 | * Video lecture by D. Silver - [video](https://www.youtube.com/watch?v=KHZVXao4qXs) 4 | * Our [lecture](https://yadi.sk/i/yPIPkO_f3TPsNK), [seminar(pytorch)](https://yadi.sk/i/flW8ezGk3TPsQ5), [seminar(theano)](https://yadi.sk/i/8f9NX_E73GKBkT) 5 | * Alternative lecture by J. Schulman part 1 - [video](https://www.youtube.com/watch?v=BB-BhTn6DCM) 6 | * Alternative lecture by J. 
Schulman part 2 - [video](https://www.youtube.com/watch?v=Wnl-Qh2UHGg) 7 | 8 | 9 | ## More materials 10 | * Actually proving the policy gradient for discounted rewards - [article](https://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf) 11 | * On variance of policy gradient and optimal baselines: [article](https://papers.nips.cc/paper/4264-analysis-and-improvement-of-policy-gradient-estimation.pdf), another [article](https://arxiv.org/pdf/1301.2315.pdf) 12 | * Generalized Advantage Estimation - a way you can speed up training for homework_*.ipynb - [article](https://arxiv.org/abs/1506.02438) 13 | 14 | 15 | * Generalizing log-derivative trick - [url](http://blog.shakirm.com/2015/11/machine-learning-trick-of-the-day-5-log-derivative-trick/) 16 | * Combining policy gradient and q-learning - [arxiv](https://arxiv.org/abs/1611.01626) 17 | * Bayesian perspective on why reparameterization & logderivative tricks matter (Vetrov's take) - [pdf](https://www.sdsj.ru/slides/Vetrov.pdf) 18 | * Adversarial review of policy gradient - [blog](http://www.argmin.net/2018/02/20/reinforce/) 19 | 20 | 21 | ## Homework 22 | 23 | As usual, pick reinfoce_.ipynb for starters and then proceed with homework_.ipynb. 24 | 25 | -------------------------------------------------------------------------------- /week6_policy_based/atari_util.py: -------------------------------------------------------------------------------- 1 | """Auxilary files for those who wanted to solve breakout with CEM or policy gradient""" 2 | import numpy as np 3 | import gym 4 | from scipy.misc import imresize 5 | from gym.core import Wrapper 6 | from gym.spaces.box import Box 7 | 8 | class PreprocessAtari(Wrapper): 9 | def __init__(self, env, height=42, width=42, color=False, crop=lambda img: img, 10 | n_frames=4, dim_order='theano', reward_scale=1,): 11 | """A gym wrapper that reshapes, crops and scales image into the desired shapes""" 12 | super(PreprocessAtari, self).__init__(env) 13 | assert dim_order in ('theano', 'tensorflow') 14 | self.img_size = (height, width) 15 | self.crop=crop 16 | self.color=color 17 | self.dim_order = dim_order 18 | self.reward_scale = reward_scale 19 | 20 | n_channels = (3 * n_frames) if color else n_frames 21 | obs_shape = [n_channels,height,width] if dim_order == 'theano' else [height,width,n_channels] 22 | self.observation_space = Box(0.0, 1.0, obs_shape) 23 | self.framebuffer = np.zeros(obs_shape, 'float32') 24 | 25 | def reset(self): 26 | """resets breakout, returns initial frames""" 27 | self.framebuffer = np.zeros_like(self.framebuffer) 28 | self.update_buffer(self.env.reset()) 29 | return self.framebuffer 30 | 31 | def step(self,action): 32 | """plays breakout for 1 step, returns frame buffer""" 33 | new_img, reward, done, info = self.env.step(action) 34 | self.update_buffer(new_img) 35 | return self.framebuffer, reward * self.reward_scale, done, info 36 | 37 | ### image processing ### 38 | 39 | def update_buffer(self,img): 40 | img = self.preproc_image(img) 41 | offset = 3 if self.color else 1 42 | if self.dim_order == 'theano': 43 | axis = 0 44 | cropped_framebuffer = self.framebuffer[:-offset] 45 | else: 46 | axis = -1 47 | cropped_framebuffer = self.framebuffer[:,:,:-offset] 48 | self.framebuffer = np.concatenate([img, cropped_framebuffer], axis = axis) 49 | 50 | def preproc_image(self, img): 51 | """what happens to the observation""" 52 | img = self.crop(img) 53 | img = imresize(img, self.img_size) 54 | if not self.color: 55 | img = 
img.mean(-1, keepdims=True) 56 | if self.dim_order == 'theano': 57 | img = img.transpose([2,0,1]) # [h, w, c] to [c, h, w] 58 | img = img.astype('float32') / 255. 59 | return img 60 | -------------------------------------------------------------------------------- /week7_[recap]_rnn/README.md: -------------------------------------------------------------------------------- 1 | ## Materials 2 | * [Slides](https://yadi.sk/i/-Iqdhg483GDyoN) 3 | * CS231 lecture on RNNs - [video](https://www.youtube.com/watch?v=iX5V1WpxxkY) 4 | * Our [lecture](https://yadi.sk/i/XHmT5hO53GcCKV), [seminar(pytorch)](https://yadi.sk/i/nCch5I8S3TsXh5), [seminar(theano)](https://yadi.sk/i/19twHESN3GcGKQ) (both russian) 5 | * [alternative] Brief lecture on RNN by nervana - [video](https://www.youtube.com/watch?v=Ukgii7Yd_cU) 6 | * [alternative] More detailed lecture by Y. Bengio - [video](https://www.youtube.com/watch?v=xK-bzjIQkmM) 7 | * Great reading by Karpathy - [blog post](http://karpathy.github.io/2015/05/21/rnn-effectiveness/) 8 | * LSTM explained in detail by colah - [blog post](http://colah.github.io/posts/2015-08-Understanding-LSTMs/) 9 | 10 | ## More materials 11 | * Seq2seq lecture - [video](https://www.youtube.com/watch?v=G5RY_SUJih4) 12 | * "Awesome rnn" entry point - [repo](https://github.com/kjw0612/awesome-rnn) 13 | * OpenAI research on sentiment analysis that sheds some light on what's inside LSTM language model. 14 | 15 | # Homework description 16 | 17 | This week's practice gets you acquainted with basics of recurrent neural networks. For simplicity, we'll train them on character language modelling task. Pick any one of `seminar_lasagne`, `seminar_lasagne_ingraph` or `seminar_tf`. 18 | 19 | As for difference btwn `seminar_lasagne` and `seminar_lasagne_ingraph` - ingraph version shows a lower-level interface to recurrent neural networks. It also requires you to install `pip install https://github.com/yandexdataschool/agentnet/archive/master.zip`. Out-of-graph version cover higher-level syntax from native lasagne. 20 | -------------------------------------------------------------------------------- /week7_[recap]_rnn/rnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/week7_[recap]_rnn/rnn.png -------------------------------------------------------------------------------- /week7_pomdp/README.md: -------------------------------------------------------------------------------- 1 | # Materials 2 | [lecture slides](https://yadi.sk/d/RGx8BUCr3Gq6DC) 3 | 4 | _Links on all articles mentioned during the lecture could be found in "References" at the very end of the presentation slides. All other interesing links which contribute to the topic of POMDP are presented below_ 5 | 6 | ## Basics 7 | * Our [lecture](https://yadi.sk/i/AHzpTjiT3U8L8e) and [seminar](https://yadi.sk/i/Ka-I7nBp3U8LAG) (russian) 8 | * A lecture on basics by Andrew NG (english, LQ) - [video](https://www.youtube.com/watch?v=yCqPMD6coO8) 9 | * A lecture on basics by 5vision (russian) - [video](https://www.youtube.com/watch?v=_dkaynuKUFE) 10 | * _[alternative]_ Chalkboard-style 2-part lecture by B. Ravindran. - [part1](https://www.youtube.com/watch?v=9G_KevA8DFY), [part2](https://www.youtube.com/watch?v=dMOUp7YzUpQ) 11 | * _[alternative]_ Yet another mini-lecture touching on POMDP by S.S. 
Baveja - [video](https://www.youtube.com/watch?v=SE56KgF7aVc) 12 | 13 | ## POMDP Learning 14 | * DRQN lecture by Fritz448 (russian) - [video](https://www.youtube.com/watch?v=bE5DIJvZexc) 15 | * [Data efficient learning in continous POMDP](https://arxiv.org/abs/1602.02523v1) 16 | * [Managing wind farms with bayesian POMDP](http://ascelibrary.org/doi/abs/10.1061/(ASCE)CP.1943-5487.0000390) 17 | * [Bayesian learning and decision-making in dynamic environments](http://www.jmlr.org/papers/volume12/ross11a/ross11a.pdf) 18 | 19 | 20 | 21 | 22 | --- 23 | 24 | # Practice 25 | 26 | 27 | The assignment is platform and framewerk independent, so choose the framework that suits you best, but pay attention on how many you will need to implement youself in case of nonstandart ones. 28 | -------------------------------------------------------------------------------- /week7_pomdp/atari_util.py: -------------------------------------------------------------------------------- 1 | """Auxilary files for those who wanted to solve breakout with CEM or policy gradient""" 2 | import numpy as np 3 | import gym 4 | from scipy.misc import imresize 5 | from gym.core import Wrapper 6 | from gym.spaces.box import Box 7 | 8 | class PreprocessAtari(Wrapper): 9 | def __init__(self, env, height=42, width=42, color=False, 10 | crop=lambda img: img, n_frames=4, dim_order='theano'): 11 | """A gym wrapper that reshapes, crops and scales image into the desired shapes""" 12 | super(PreprocessAtari, self).__init__(env) 13 | assert dim_order in ('theano', 'tensorflow') 14 | self.img_size = (height,width) 15 | self.crop=crop 16 | self.color=color 17 | self.dim_order = dim_order 18 | 19 | n_channels = (3 * n_frames) if color else n_frames 20 | obs_shape = [n_channels,height,width] if dim_order == 'theano' else [height,width,n_channels] 21 | self.observation_space = Box(0.0, 1.0, obs_shape) 22 | self.framebuffer = np.zeros(obs_shape, 'float32') 23 | 24 | def reset(self): 25 | """resets breakout, returns initial frames""" 26 | self.framebuffer = np.zeros_like(self.framebuffer) 27 | self.update_buffer(self.env.reset()) 28 | return self.framebuffer 29 | 30 | def step(self,action): 31 | """plays breakout for 1 step, returns frame buffer""" 32 | new_img,r,done,info = self.env.step(action) 33 | self.update_buffer(new_img) 34 | return self.framebuffer,r,done,info 35 | 36 | ### image processing ### 37 | 38 | def update_buffer(self,img): 39 | img = self.preproc_image(img) 40 | offset = 3 if self.color else 1 41 | if self.dim_order == 'theano': 42 | axis = 0 43 | cropped_framebuffer = self.framebuffer[:-offset] 44 | else: 45 | axis = -1 46 | cropped_framebuffer = self.framebuffer[:,:,:-offset] 47 | self.framebuffer = np.concatenate([img, cropped_framebuffer], axis = axis) 48 | 49 | def preproc_image(self, img): 50 | """what happens to the observation""" 51 | img = self.crop(img) 52 | img = imresize(img, self.img_size) 53 | if not self.color: 54 | img = img.mean(-1, keepdims=True) 55 | if self.dim_order == 'theano': 56 | img = img.transpose([2,0,1]) # [h, w, c] to [c, h, w] 57 | img = img.astype('float32')/255. 58 | return img 59 | -------------------------------------------------------------------------------- /week7_pomdp/env_pool.py: -------------------------------------------------------------------------------- 1 | """ 2 | A thin wrapper for openAI gym environments that maintains a set of parallel games and has a method to generate 3 | interaction sessions given agent one-step applier function. 
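Typical use (sketch): pool = EnvPool(agent, make_env, n_parallel_games=10), then
obs_seq, action_seq, reward_seq, is_alive_seq = pool.interact(n_steps=100).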
4 | """ 5 | 6 | import numpy as np 7 | 8 | # A whole lot of space invaders 9 | class EnvPool(object): 10 | def __init__(self, agent, make_env, n_parallel_games=1): 11 | """ 12 | A special class that handles training on multiple parallel sessions 13 | and is capable of some auxilary actions like evaluating agent on one game session (See .evaluate()). 14 | 15 | :param agent: Agent which interacts with the environment. 16 | :param make_env: Factory that produces environments OR a name of the gym environment. 17 | :param n_games: Number of parallel games. One game by default. 18 | :param max_size: Max pool size by default (if appending sessions). By default, pool is not constrained in size. 19 | """ 20 | # Create atari games. 21 | self.agent = agent 22 | self.make_env = make_env 23 | self.envs = [self.make_env() for _ in range(n_parallel_games)] 24 | 25 | # Initial observations. 26 | self.prev_observations = [env.reset() for env in self.envs] 27 | 28 | # Agent memory variables (if you use recurrent networks). 29 | self.prev_memory_states = agent.get_initial_state(n_parallel_games) 30 | 31 | # Whether particular session has just been terminated and needs restarting. 32 | self.just_ended = [False] * len(self.envs) 33 | 34 | def interact(self, n_steps=100, verbose=False): 35 | """Generate interaction sessions with ataries (openAI gym atari environments) 36 | Sessions will have length n_steps. Each time one of games is finished, it is immediately getting reset 37 | and this time is recorded in is_alive_log (See returned values). 38 | 39 | :param n_steps: Length of an interaction. 40 | :returns: observation_seq, action_seq, reward_seq, is_alive_seq 41 | :rtype: a bunch of tensors [batch, tick, ...] 42 | """ 43 | 44 | def env_step(i, action): 45 | if not self.just_ended[i]: 46 | new_observation, cur_reward, is_done, info = self.envs[i].step(action) 47 | if is_done: 48 | # Game ends now, will finalize on next tick. 49 | self.just_ended[i] = True 50 | 51 | # note: is_alive=True in any case because environment is still alive (last tick alive) in our notation. 52 | return new_observation, cur_reward, True, info 53 | else: 54 | # Reset environment, get new observation to be used on next tick. 55 | new_observation = self.envs[i].reset() 56 | 57 | # Reset memory for new episode. 58 | initial_memory_state = self.agent.get_initial_state(batch_size=1) 59 | for m_i in range(len(new_memory_states)): 60 | new_memory_states[m_i][i] = initial_memory_state[m_i][0] 61 | 62 | if verbose: 63 | print("env %i reloaded" % i) 64 | 65 | self.just_ended[i] = False 66 | 67 | return new_observation, 0, False, {'end': True} 68 | 69 | history_log = [] 70 | 71 | for i in range(n_steps - 1): 72 | new_memory_states, readout = self.agent.step(self.prev_memory_states, self.prev_observations) 73 | actions = self.agent.sample_actions(readout) 74 | 75 | new_observations, cur_rewards, is_alive, infos = zip(*map(env_step, range(len(self.envs)), actions)) 76 | 77 | # Append data tuple for this tick. 78 | history_log.append((self.prev_observations, actions, cur_rewards, is_alive)) 79 | 80 | self.prev_observations = new_observations 81 | self.prev_memory_states = new_memory_states 82 | 83 | #add last observation 84 | dummy_actions = [0] * len(self.envs) 85 | dummy_rewards = [0] * len(self.envs) 86 | dummy_mask = [1] * len(self.envs) 87 | history_log.append((self.prev_observations, dummy_actions, dummy_rewards, dummy_mask)) 88 | 89 | # cast to numpy arrays, transpose from [time, batch, ...] to [batch, time, ...] 
90 | history_log = [np.array(tensor).swapaxes(0, 1) for tensor in zip(*history_log)] 91 | observation_seq, action_seq, reward_seq, is_alive_seq = history_log 92 | 93 | return observation_seq, action_seq, reward_seq, is_alive_seq -------------------------------------------------------------------------------- /week7_pomdp/img1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/week7_pomdp/img1.jpg -------------------------------------------------------------------------------- /week7_pomdp/img2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/week7_pomdp/img2.jpg -------------------------------------------------------------------------------- /week7_pomdp/img3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/week7_pomdp/img3.jpg -------------------------------------------------------------------------------- /week8_scst/README.md: -------------------------------------------------------------------------------- 1 | ## Materials 2 | * [Slides](https://yadi.sk/i/2oUkKL8m3UFFe8) 3 | * Our [lecture](https://yadi.sk/i/hmjUfKht3UNCSq) & [seminar](https://yadi.sk/i/dTkWTFNj3UNCTv) (russian) 4 | * English lectures 5 | * Lecture by Mohammad Norouzi - [cs294 video](https://www.youtube.com/watch?v=fZNyHoXgV7M&index=24&list=PLkFD6_40KJIwTmSbCv9OVJB3YaO4sFwkX) 6 | * Optional lecture on conversation systems - [video](https://www.youtube.com/watch?v=2tKNpzUvDc4 ) 7 | * Will hopefully record our lecture in english soon! 8 | * Self-critical sequence traning [original article](https://arxiv.org/abs/1612.00563) 9 | 10 | ## Practice 11 | As usual, go to practice_{your framework}.ipynb above and follow instructions from there. [pytorch](https://github.com/yandexdataschool/Practical_RL/blob/master/week8_scst/practice_torch.ipynb), [tensorflow](https://github.com/yandexdataschool/Practical_RL/blob/master/week8_scst/practice_tf.ipynb), [theano](https://github.com/yandexdataschool/Practical_RL/blob/master/week8_scst/practice_theano.ipynb) 12 | 13 | Binder quickstart (lasts 1 hour): [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/yandexdataschool/Practical_RL/master) 14 | 15 | ## More materials 16 | * An [awesome post](http://distill.pub/2016/augmented-rnns/) explaining attention and long-term memory models. 17 | * [BLEU](http://www.aclweb.org/anthology/P02-1040.pdf) and [CIDEr](https://arxiv.org/pdf/1411.5726.pdf) articles. 18 | * Image captioning 19 | * MSCOCO captioning [challenge](http://mscoco.org/dataset/#captions-challenge2015) 20 | * Captioning baseline [notebook](https://github.com/yandexdataschool/HSE_deeplearning/blob/master/week7/captioning_solution_ars.ipynb) 21 | * Other articles on reinforcement learning for natural language: 22 | * [task-oriented conversation system](https://arxiv.org/abs/1703.07055) 23 | * [generating dialogues](https://arxiv.org/abs/1606.01541) 24 | * [sequential adversarial networks](https://arxiv.org/abs/1609.05473) (a.k.a. 
SeqGAN) 25 | * A large overview for machine translation (touching on RL, including RL failures) - [arxiv](https://arxiv.org/abs/1609.08144) 26 | * How _not_ to evaluate conversation models - [arxiv](https://arxiv.org/abs/1603.08023) 27 | * Overview of other non-games applications ("that article again") - [arxiv](https://arxiv.org/abs/1701.07274) 28 | 29 | -------------------------------------------------------------------------------- /week8_scst/basic_model_tf.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import keras.layers as L 3 | 4 | # This code implements a single-GRU seq2seq model. You will have to improve it later in the assignment. 5 | # Note 1: when using several recurrent layers TF can mixed up the weights of different recurrent layers. 6 | # In that case, make sure you both create AND use each rnn/gru/lstm/custom layer in a unique variable scope 7 | # e.g. with tf.variable_scope("first_lstm"): new_cell, new_out = self.lstm_1(...) 8 | # with tf.variable_scope("second_lstm"): new_cell2, new_out2 = self.lstm_2(...) 9 | # Note 2: everything you need for decoding should be stored in model state (output list of both encode and decode) 10 | # e.g. for attention, you should store all encoder sequence and input mask there in addition to lstm/gru states. 11 | 12 | class BasicTranslationModel: 13 | def __init__(self, name, inp_voc, out_voc, 14 | emb_size, hid_size,): 15 | 16 | self.name = name 17 | self.inp_voc = inp_voc 18 | self.out_voc = out_voc 19 | 20 | with tf.variable_scope(name): 21 | self.emb_inp = L.Embedding(len(inp_voc), emb_size) 22 | self.emb_out = L.Embedding(len(out_voc), emb_size) 23 | self.enc0 = tf.nn.rnn_cell.GRUCell(hid_size) 24 | self.dec_start = L.Dense(hid_size) 25 | self.dec0 = tf.nn.rnn_cell.GRUCell(hid_size) 26 | self.logits = L.Dense(len(out_voc)) 27 | 28 | 29 | # run on dummy output to .build all layers (and therefore create weights) 30 | inp = tf.placeholder('int32', [None, None]) 31 | out = tf.placeholder('int32', [None, None]) 32 | h0 = self.encode(inp) 33 | h1 = self.decode(h0,out[:,0]) 34 | # h2 = self.decode(h1,out[:,1]) etc. 
35 | 36 | self.weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name) 37 | 38 | 39 | def encode(self, inp, **flags): 40 | """ 41 | Takes symbolic input sequence, computes initial state 42 | :param inp: matrix of input tokens [batch, time] 43 | :return: a list of initial decoder state tensors 44 | """ 45 | inp_lengths = infer_length(inp, self.inp_voc.eos_ix) 46 | inp_emb = self.emb_inp(inp) 47 | 48 | _, enc_last = tf.nn.dynamic_rnn( 49 | self.enc0, inp_emb, 50 | sequence_length=inp_lengths, 51 | dtype = inp_emb.dtype) 52 | 53 | dec_start = self.dec_start(enc_last) 54 | return [dec_start] 55 | 56 | def decode(self, prev_state, prev_tokens, **flags): 57 | """ 58 | Takes previous decoder state and tokens, returns new state and logits 59 | :param prev_state: a list of previous decoder state tensors 60 | :param prev_tokens: previous output tokens, an int vector of [batch_size] 61 | :return: a list of next decoder state tensors, a tensor of logits [batch,n_tokens] 62 | """ 63 | 64 | [prev_dec] = prev_state 65 | 66 | prev_emb = self.emb_out(prev_tokens[:,None])[:,0] 67 | 68 | new_dec_out,new_dec_state = self.dec0(prev_emb, prev_dec) 69 | 70 | output_logits = self.logits(new_dec_out) 71 | 72 | return [new_dec_state], output_logits 73 | 74 | def symbolic_score(self, inp, out, eps=1e-30, **flags): 75 | """ 76 | Takes symbolic int32 matrices of hebrew words and their english translations. 77 | Computes the log-probabilities of all possible english characters given english prefices and hebrew word. 78 | :param inp: input sequence, int32 matrix of shape [batch,time] 79 | :param out: output sequence, int32 matrix of shape [batch,time] 80 | :return: log-probabilities of all possible english characters of shape [bath,time,n_tokens] 81 | 82 | NOTE: log-probabilities time axis is synchronized with out 83 | In other words, logp are probabilities of __current__ output at each tick, not the next one 84 | therefore you can get likelihood as logprobas * tf.one_hot(out,n_tokens) 85 | """ 86 | first_state = self.encode(inp,**flags) 87 | 88 | batch_size = tf.shape(inp)[0] 89 | bos = tf.fill([batch_size],self.out_voc.bos_ix) 90 | first_logits = tf.log(tf.one_hot(bos, len(self.out_voc)) + eps) 91 | 92 | def step(blob, y_prev): 93 | h_prev = blob[:-1] 94 | h_new, logits = self.decode(h_prev, y_prev, **flags) 95 | return list(h_new) + [logits] 96 | 97 | results = tf.scan(step,initializer=list(first_state)+[first_logits], 98 | elems=tf.transpose(out)) 99 | 100 | # gather state and logits, each of shape [time,batch,...] 101 | states_seq, logits_seq = results[:-1], results[-1] 102 | 103 | # add initial state and logits 104 | logits_seq = tf.concat((first_logits[None], logits_seq),axis=0) 105 | states_seq = [tf.concat((init[None], states), axis=0) 106 | for init, states in zip(first_state, states_seq)] 107 | 108 | #convert from [time,batch,...] to [batch,time,...] 109 | logits_seq = tf.transpose(logits_seq, [1, 0, 2]) 110 | states_seq = [tf.transpose(states, [1, 0] + list(range(2, states.shape.ndims))) 111 | for states in states_seq] 112 | 113 | return tf.nn.log_softmax(logits_seq) 114 | 115 | def symbolic_translate(self, inp, greedy=False, max_len = None, eps = 1e-30, **flags): 116 | """ 117 | takes symbolic int32 matrix of hebrew words, produces output tokens sampled 118 | from the model and output log-probabilities for all possible tokens at each tick. 119 | :param inp: input sequence, int32 matrix of shape [batch,time] 120 | :param greedy: if greedy, takes token with highest probablity at each tick. 
121 | Otherwise samples proportionally to probability. 122 | :param max_len: max length of output, defaults to 2 * input length 123 | :return: output tokens int32[batch,time] and 124 | log-probabilities of all tokens at each tick, [batch,time,n_tokens] 125 | """ 126 | first_state = self.encode(inp, **flags) 127 | 128 | batch_size = tf.shape(inp)[0] 129 | bos = tf.fill([batch_size],self.out_voc.bos_ix) 130 | first_logits = tf.log(tf.one_hot(bos, len(self.out_voc)) + eps) 131 | max_len = tf.reduce_max(tf.shape(inp)[1])*2 132 | 133 | def step(blob,t): 134 | h_prev, y_prev = blob[:-2], blob[-1] 135 | h_new, logits = self.decode(h_prev, y_prev, **flags) 136 | y_new = tf.argmax(logits,axis=-1) if greedy else tf.multinomial(logits,1)[:,0] 137 | return list(h_new) + [logits, tf.cast(y_new,y_prev.dtype)] 138 | 139 | results = tf.scan(step, initializer=list(first_state) + [first_logits, bos], 140 | elems=[tf.range(max_len)]) 141 | 142 | # gather state, logits and outs, each of shape [time,batch,...] 143 | states_seq, logits_seq, out_seq = results[:-2], results[-2], results[-1] 144 | 145 | # add initial state, logits and out 146 | logits_seq = tf.concat((first_logits[None],logits_seq),axis=0) 147 | out_seq = tf.concat((bos[None], out_seq), axis=0) 148 | states_seq = [tf.concat((init[None], states), axis=0) 149 | for init, states in zip(first_state, states_seq)] 150 | 151 | #convert from [time,batch,...] to [batch,time,...] 152 | logits_seq = tf.transpose(logits_seq, [1, 0, 2]) 153 | out_seq = tf.transpose(out_seq) 154 | states_seq = [tf.transpose(states, [1, 0] + list(range(2, states.shape.ndims))) 155 | for states in states_seq] 156 | 157 | return out_seq, tf.nn.log_softmax(logits_seq) 158 | 159 | 160 | 161 | ### Utility functions ### 162 | 163 | def initialize_uninitialized(sess = None): 164 | """ 165 | Initialize unitialized variables, doesn't affect those already initialized 166 | :param sess: in which session to initialize stuff. 
Defaults to tf.get_default_session() 167 | """ 168 | sess = sess or tf.get_default_session() 169 | global_vars = tf.global_variables() 170 | is_not_initialized = sess.run([tf.is_variable_initialized(var) for var in global_vars]) 171 | not_initialized_vars = [v for (v, f) in zip(global_vars, is_not_initialized) if not f] 172 | 173 | if len(not_initialized_vars): 174 | sess.run(tf.variables_initializer(not_initialized_vars)) 175 | 176 | def infer_length(seq, eos_ix, time_major=False, dtype=tf.int32): 177 | """ 178 | compute length given output indices and eos code 179 | :param seq: tf matrix [time,batch] if time_major else [batch,time] 180 | :param eos_ix: integer index of end-of-sentence token 181 | :returns: lengths, int32 vector of shape [batch] 182 | """ 183 | axis = 0 if time_major else 1 184 | is_eos = tf.cast(tf.equal(seq, eos_ix), dtype) 185 | count_eos = tf.cumsum(is_eos,axis=axis,exclusive=True) 186 | lengths = tf.reduce_sum(tf.cast(tf.equal(count_eos,0),dtype),axis=axis) 187 | return lengths 188 | 189 | def infer_mask(seq, eos_ix, time_major=False, dtype=tf.float32): 190 | """ 191 | compute mask given output indices and eos code 192 | :param seq: tf matrix [time,batch] if time_major else [batch,time] 193 | :param eos_ix: integer index of end-of-sentence token 194 | :returns: mask, float32 matrix with '0's and '1's of same shape as seq 195 | """ 196 | axis = 0 if time_major else 1 197 | lengths = infer_length(seq, eos_ix, time_major=time_major) 198 | mask = tf.sequence_mask(lengths, maxlen=tf.shape(seq)[axis], dtype=dtype) 199 | if time_major: mask = tf.transpose(mask) 200 | return mask 201 | 202 | 203 | def select_values_over_last_axis(values, indices): 204 | """ 205 | Auxiliary function to select logits corresponding to chosen tokens. 206 | :param values: logits for all actions: float32[batch,tick,action] 207 | :param indices: action ids int32[batch,tick] 208 | :returns: values selected for the given actions: float[batch,tick] 209 | """ 210 | assert values.shape.ndims == 3 and indices.shape.ndims == 2 211 | batch_size, seq_len = tf.shape(indices)[0], tf.shape(indices)[1] 212 | batch_i = tf.tile(tf.range(0,batch_size)[:, None],[1,seq_len]) 213 | time_i = tf.tile(tf.range(0,seq_len)[None, :],[batch_size,1]) 214 | indices_nd = tf.stack([batch_i, time_i, indices], axis=-1) 215 | 216 | return tf.gather_nd(values,indices_nd) 217 | 218 | 219 | 220 | -------------------------------------------------------------------------------- /week8_scst/basic_model_theano.py: -------------------------------------------------------------------------------- 1 | # code by https://github.com/deniskamazur 2 | 3 | from lasagne.layers import * 4 | import theano.tensor as T 5 | import theano 6 | 7 | from agentnet.memory import LSTMCell, GRUCell, AttentionLayer 8 | from agentnet import Recurrence 9 | from agentnet.learning.generic import get_mask_by_eos 10 | from agentnet.resolver import ProbabilisticResolver 11 | from agentnet.utils import reapply 12 | 13 | 14 | class BasicTranslationModel: 15 | def __init__(self, inp_voc, out_voc, emb_size, hid_size, **kwargs): 16 | """ 17 | A simple interface for mt 18 | :param emb_size: Embedding size 19 | :param hid_size: Number of LSTM units 20 | :param bidereactional: If the nLSTM layers should be bidirectional 21 | :param input_dropout: Dropout after embedding layer 22 | :param recurrent_dropout: Dropout after each LSTM iteration 23 | :param rdo_size: If int - use dense layer after neck in decoder, if none don't 24 | :param peepholes: 
http://colah.github.io/posts/2015-08-Understanding-LSTMs/img/LSTM3-var-peepholes.png 25 | :param kwargs: recurrence flags 26 | """ 27 | self.inp_voc = inp_voc 28 | self.out_voc = out_voc 29 | # encode input sequence 30 | class encoder: 31 | # intput layers 32 | inp = InputLayer((None, None)) 33 | mask = ExpressionLayer(inp, lambda x: get_mask_by_eos(T.eq(x, self.out_voc.eos_ix))) 34 | 35 | # embed the tokens 36 | emb = EmbeddingLayer(inp, input_size=len(inp_voc), 37 | output_size=emb_size) 38 | 39 | rnn_fw = GRULayer(emb, num_units=hid_size, mask_input=mask, 40 | only_return_final=True) 41 | 42 | dec_start = DenseLayer(rnn_fw,hid_size,nonlinearity=None) 43 | 44 | # make encoder a public field 45 | self.encoder = encoder 46 | 47 | # decoder the encoded sequence 48 | class decoder: 49 | # decoder previous memory and tokens 50 | prev_hid = InputLayer((None, hid_size), name='prev hidden state') 51 | inp = InputLayer((None,), name="prev phoneme") 52 | 53 | emb = EmbeddingLayer(inp, len(out_voc), emb_size) 54 | 55 | new_hid = GRUCell(prev_hid, emb) 56 | 57 | logits = DenseLayer(new_hid, len(out_voc), nonlinearity=None) 58 | 59 | probs = NonlinearityLayer(logits, nonlinearity=T.nnet.softmax) 60 | logprobs = NonlinearityLayer(logits, nonlinearity=T.nnet.logsoftmax) 61 | out = ProbabilisticResolver(probs, assume_normalized=True) 62 | 63 | state_dict = { 64 | new_hid: prev_hid, 65 | # ^^^ this reads "at next step, new_hid will become prev_hid" 66 | # if you add any more recurrent memory units, 67 | # please make sure they're here 68 | } 69 | 70 | init_dict = { 71 | new_hid:encoder.dec_start 72 | # ^^^ this reads "before first step, new_hid is set to outputs of dec_start" 73 | # if you add any more recurrent memory units with non-zero init 74 | # please make sure they're here 75 | } 76 | 77 | nonseq_dict = { 78 | # here you can add anything encoder needs that's gonna be same across time-steps 79 | } 80 | 81 | self.decoder = decoder 82 | 83 | top_layers = [encoder.dec_start,decoder.out] + list(decoder.state_dict.keys()) 84 | self.weights = get_all_params(top_layers, trainable=True) 85 | 86 | def symbolic_score(self, inp, out, eps=1e-30, **flags): 87 | """ 88 | Takes symbolic int32 matrices of hebrew words and their english translations. 89 | Computes the log-probabilities of all possible english characters given english prefices and hebrew word. 
90 | :param inp: input sequence, int32 matrix of shape [batch,time] 91 | :param out: output sequence, int32 matrix of shape [batch,time] 92 | :return: log-probabilities of all possible english characters of shape [bath,time,n_tokens] 93 | 94 | NOTE: log-probabilities time axis is synchronized with out 95 | In other words, logp are probabilities of __current__ output at each tick, not the next one 96 | therefore you can get likelihood as logprobas * tf.one_hot(out,n_tokens) 97 | """ 98 | 99 | l_output_sequence = InputLayer([None,None]) 100 | 101 | # Defining custom recurrent layer out of decoder 102 | rec = Recurrence( 103 | state_variables=self.decoder.state_dict, 104 | state_init=self.decoder.init_dict, 105 | input_sequences={self.decoder.inp:l_output_sequence}, 106 | input_nonsequences=self.decoder.nonseq_dict, 107 | tracked_outputs=self.decoder.logprobs, 108 | unroll_scan=False 109 | ) 110 | 111 | feed_dict = { 112 | self.encoder.inp: inp, 113 | l_output_sequence: out 114 | } 115 | logprobs = get_output(rec[self.decoder.logprobs], feed_dict, 116 | recurrence_flags=flags, **flags) 117 | 118 | self.auto_updates = rec.get_automatic_updates() 119 | if len(self.auto_updates) != 0: 120 | print("symbolic_score: Please collect auto_updates of random states " 121 | "after you called symbolic_score (available at model.auto_updates)!") 122 | 123 | 124 | first_logprobs = T.zeros_like(logprobs[:,:1]) 125 | logprobs = T.concatenate([first_logprobs,logprobs[:,:-1]],axis=1) 126 | 127 | return logprobs 128 | 129 | 130 | 131 | def symbolic_translate(self, inp, greedy=False, max_len = None, 132 | unroll_scan=False, eps = 1e-30, **flags): 133 | """ 134 | takes symbolic int32 matrix of hebrew words, produces output tokens sampled 135 | from the model and output log-probabilities for all possible tokens at each tick. 136 | :param inp: input sequence, int32 matrix of shape [batch,time] 137 | :param greedy: if greedy, takes token with highest probablity at each tick. 138 | Otherwise samples proportionally to probability. 139 | :param max_len: max length of output, defaults to 2 * input length 140 | :param unroll_scan: if True, compiles longer but runs faster. 
141 | requires max_len to be constant 142 | :return: output tokens int32[batch,time] and 143 | log-probabilities of all tokens at each tick, [batch,time,n_tokens] 144 | """ 145 | if unroll_scan: 146 | assert isinstance(max_len,int), "if scan is unrolled, max_len must be a constant integer" 147 | 148 | max_len = max_len if max_len is not None else 2 * inp.shape[1] 149 | 150 | # initial output tokens (BOS) 151 | bos = T.zeros_like(inp[:, 0]) + self.out_voc.bos_ix 152 | l_start = InputLayer((None,),bos) 153 | 154 | # Defining custom recurrent layer out of decoder 155 | rec = Recurrence( 156 | state_variables=merge_dicts(self.decoder.state_dict, 157 | {self.decoder.out: self.decoder.inp}), 158 | state_init=merge_dicts(self.decoder.init_dict, {self.decoder.out: l_start}), 159 | input_nonsequences=self.decoder.nonseq_dict, 160 | tracked_outputs=(self.decoder.out, self.decoder.probs, self.decoder.logprobs), 161 | n_steps=max_len, 162 | unroll_scan=unroll_scan 163 | ) 164 | 165 | translations, logprobs = get_output(rec[self.decoder.out, self.decoder.logprobs], 166 | {self.encoder.inp:inp, 167 | l_start:bos}, 168 | recurrence_flags=dict(flags,greedy=greedy), 169 | **flags) 170 | 171 | self.auto_updates = rec.get_automatic_updates() 172 | if len(self.auto_updates) != 0: 173 | print("symbolic_translate: Please collect auto_updates of random states " 174 | "after you called symbolic_translate (available at model.auto_updates)!") 175 | 176 | # add first step (bos) 177 | translations = T.concatenate([bos[:,None],translations],axis=1) 178 | first_logprobs = T.zeros_like(logprobs[:,:1]) 179 | logprobs = T.concatenate([first_logprobs,logprobs],axis=1) 180 | 181 | return translations,logprobs 182 | 183 | 184 | def merge_dicts(*dicts, **kwargs): 185 | """ 186 | Melts several dicts into one. Useful when messing with feed dicts 187 | :param dicts: dictionaries 188 | :param check_conflicts: if True, raises error if several dicts have the same key 189 | Otherwise uses the key from the latest dict in *dicts 190 | :return: a dict that contains k-v pairs from all *dicts 191 | """ 192 | merged_dict = {} 193 | for d in dicts: 194 | merged_dict.update(d) 195 | if kwargs.get('check_conflicts'): 196 | assert len(merged_dict) == sum(map(len, dicts)), 'dicts have duplicate keys' 197 | return merged_dict 198 | 199 | -------------------------------------------------------------------------------- /week8_scst/basic_model_torch.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | 6 | # Note: unlike official pytorch tutorial, this model doesn't process one sample at a time 7 | # because it's slow on GPU. instead it uses masks just like ye olde theano/tensorflow. 8 | # it doesn't use torch.nn.utils.rnn.pack_paded_sequence because reasons. 
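# A minimal usage sketch of the masking idea from the note above: average the log-likelihood of
# the reference tokens over real (non-padded) positions only. It assumes `logp_seq` is the
# [batch, time, n_tokens] output of BasicTranslationModel.forward below and relies on the
# infer_mask helper defined at the bottom of this file; the function name itself is illustrative.
def masked_loglikelihood_sketch(logp_seq, out, eos_ix):
    """Mean log-probability of reference tokens, ignoring all positions after the first EOS."""
    mask = infer_mask(out, eos_ix)                                    # float mask [batch, time]
    logp_out = torch.gather(logp_seq, 2, out.unsqueeze(2)).squeeze(2) # logp of chosen tokens [batch, time]
    return torch.sum(logp_out * mask) / torch.sum(mask)              # average over real tokens only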
9 | 10 | class BasicTranslationModel(nn.Module): 11 | def __init__(self, inp_voc, out_voc, 12 | emb_size, hid_size,): 13 | super(self.__class__, self).__init__() 14 | self.inp_voc = inp_voc 15 | self.out_voc = out_voc 16 | 17 | self.emb_inp = nn.Embedding(len(inp_voc), emb_size) 18 | self.emb_out = nn.Embedding(len(out_voc), emb_size) 19 | self.enc0 = nn.GRU(emb_size, hid_size, batch_first=True) 20 | self.dec_start = nn.Linear(hid_size, hid_size) 21 | self.dec0 = nn.GRUCell(emb_size, hid_size) 22 | self.logits = nn.Linear(hid_size, len(out_voc)) 23 | 24 | def encode(self, inp, **flags): 25 | """ 26 | Takes symbolic input sequence, computes initial state 27 | :param inp: a vector of input tokens (Variable, int64, 1d) 28 | :return: a list of initial decoder state tensors 29 | """ 30 | inp_emb = self.emb_inp(inp) 31 | enc_seq, _ = self.enc0(inp_emb) 32 | 33 | # select last element w.r.t. mask 34 | end_index = infer_length(inp, self.inp_voc.eos_ix) 35 | end_index[end_index >= inp.shape[1]] = inp.shape[1] - 1 36 | enc_last = enc_seq[range(0, enc_seq.shape[0]), end_index.detach(), :] 37 | 38 | dec_start = self.dec_start(enc_last) 39 | return [dec_start] 40 | 41 | def decode(self, prev_state, prev_tokens, **flags): 42 | """ 43 | Takes previous decoder state and tokens, returns new state and logits 44 | :param prev_state: a list of previous decoder state tensors 45 | :param prev_tokens: previous output tokens, an int vector of [batch_size] 46 | :return: a list of next decoder state tensors, a tensor of logits [batch,n_tokens] 47 | """ 48 | [prev_dec] = prev_state 49 | 50 | prev_emb = self.emb_out(prev_tokens) 51 | new_dec_state = self.dec0(prev_emb, prev_dec) 52 | output_logits = self.logits(new_dec_state) 53 | 54 | return [new_dec_state], output_logits 55 | 56 | def forward(self, inp, out, eps=1e-30, **flags): 57 | """ 58 | Takes symbolic int32 matrices of hebrew words and their english translations. 59 | Computes the log-probabilities of all possible english characters given english prefices and hebrew word. 60 | :param inp: input sequence, int32 matrix of shape [batch,time] 61 | :param out: output sequence, int32 matrix of shape [batch,time] 62 | :return: log-probabilities of all possible english characters of shape [bath,time,n_tokens] 63 | 64 | Note: log-probabilities time axis is synchronized with out 65 | In other words, logp are probabilities of __current__ output at each tick, not the next one 66 | therefore you can get likelihood as logprobas * tf.one_hot(out,n_tokens) 67 | """ 68 | batch_size = inp.shape[0] 69 | bos = Variable(torch.LongTensor([self.out_voc.bos_ix] * batch_size)) 70 | logits_seq = [torch.log(to_one_hot(bos, len(self.out_voc)) + eps)] 71 | 72 | hid_state = self.encode(inp, **flags) 73 | for x_t in out.transpose(0,1)[:-1]: 74 | hid_state, logits = self.decode(hid_state, x_t, **flags) 75 | logits_seq.append(logits) 76 | 77 | return F.log_softmax(torch.stack(logits_seq, dim=1), dim=-1) 78 | 79 | def translate(self, inp, greedy=False, max_len = None, eps = 1e-30, **flags): 80 | """ 81 | takes symbolic int32 matrix of hebrew words, produces output tokens sampled 82 | from the model and output log-probabilities for all possible tokens at each tick. 83 | :param inp: input sequence, int32 matrix of shape [batch,time] 84 | :param greedy: if greedy, takes token with highest probablity at each tick. 85 | Otherwise samples proportionally to probability. 
86 | :param max_len: max length of output, defaults to 2 * input length 87 | :return: output tokens int32[batch,time] and 88 | log-probabilities of all tokens at each tick, [batch,time,n_tokens] 89 | """ 90 | batch_size = inp.shape[0] 91 | bos = Variable(torch.LongTensor([self.out_voc.bos_ix] * batch_size)) 92 | mask = Variable(torch.ones(batch_size).type(torch.ByteTensor)) 93 | logits_seq = [torch.log(to_one_hot(bos, len(self.out_voc)) + eps)] 94 | out_seq = [bos] 95 | 96 | hid_state = self.encode(inp, **flags) 97 | while True: 98 | hid_state, logits = self.decode(hid_state, out_seq[-1], **flags) 99 | if greedy: 100 | _, y_t = torch.max(logits, dim=-1) 101 | else: 102 | probs = F.softmax(logits, dim=-1) 103 | y_t = torch.multinomial(probs, 1)[:, 0] 104 | 105 | logits_seq.append(logits) 106 | out_seq.append(y_t) 107 | mask &= y_t != self.out_voc.eos_ix 108 | 109 | if not mask.any(): break 110 | if max_len and len(out_seq) >= max_len: break 111 | 112 | return torch.stack(out_seq, 1), F.log_softmax(torch.stack(logits_seq, 1), dim=-1) 113 | 114 | 115 | 116 | ### Utility functions ### 117 | 118 | def infer_mask(seq, eos_ix, batch_first=True, include_eos=True, type=torch.FloatTensor): 119 | """ 120 | compute mask given output indices and eos code 121 | :param seq: matrix of token indices, [batch,time] if batch_first else [time,batch] 122 | :param eos_ix: integer index of end-of-sentence token 123 | :param include_eos: if True, the time-step where eos first occurs has mask = 1 124 | :returns: mask, float32 matrix with '0's and '1's of same shape as seq 125 | """ 126 | assert seq.dim() == 2 127 | is_eos = (seq == eos_ix).type(torch.FloatTensor) 128 | if include_eos: 129 | if batch_first: 130 | is_eos = torch.cat((is_eos[:,:1]*0, is_eos[:, :-1]), dim=1) 131 | else: 132 | is_eos = torch.cat((is_eos[:1,:]*0, is_eos[:-1, :]), dim=0) 133 | count_eos = torch.cumsum(is_eos, dim=1 if batch_first else 0) 134 | mask = count_eos == 0 135 | return mask.type(type) 136 | 137 | def infer_length(seq, eos_ix, batch_first=True, include_eos=True, type=torch.LongTensor): 138 | """ 139 | compute lengths given output indices and eos code 140 | :param seq: matrix of token indices, [batch,time] if batch_first else [time,batch] 141 | :param eos_ix: integer index of end-of-sentence token 142 | :param include_eos: if True, the time-step where eos first occurs counts towards the length 143 | :returns: lengths, int64 vector of shape [batch] 144 | """ 145 | mask = infer_mask(seq, eos_ix, batch_first, include_eos, type) 146 | return torch.sum(mask, dim=1 if batch_first else 0) 147 | 148 | 149 | def to_one_hot(y, n_dims=None): 150 | """ Take integer y (tensor or variable) with n dims and convert it to 1-hot representation with n+1 dims.
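Example: to_one_hot(torch.LongTensor([2, 0]), n_dims=3) -> [[0, 0, 1], [1, 0, 0]]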
""" 151 | y_tensor = y.data if isinstance(y, Variable) else y 152 | y_tensor = y_tensor.type(torch.LongTensor).view(-1, 1) 153 | n_dims = n_dims if n_dims is not None else int(torch.max(y_tensor)) + 1 154 | y_one_hot = torch.zeros(y_tensor.size()[0], n_dims).scatter_(1, y_tensor, 1) 155 | y_one_hot = y_one_hot.view(*y.shape, -1) 156 | return Variable(y_one_hot) if isinstance(y, Variable) else y_one_hot 157 | 158 | -------------------------------------------------------------------------------- /week8_scst/bonus.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Week8 bonus descriptions\n", 8 | "\n", 9 | "Here are some cool mini-projects you can try to dive deeper into the topic.\n", 10 | "\n", 11 | "## More metrics: BLEU (5+ pts)\n", 12 | "\n", 13 | "Pick BLEU or any other relevant metric, e.g. BLEU (e.g. from `nltk.bleu_score`).\n", 14 | "* Train model to maximize BLEU directly\n", 15 | "* How does levenshtein behave when maximizing BLEU and vice versa?\n", 16 | "* Compare this with how they behave when optimizing likelihood. \n", 17 | "\n", 18 | "(use default parameters for bleu: 4-gram, uniform weights)\n", 19 | "\n", 20 | "## Actor-critic (5+++ pts)\n", 21 | "\n", 22 | "While self-critical training provides a large reduction of gradient variance, it has a few drawbacks:\n", 23 | "- It requires a lot of additional computation during training\n", 24 | "- It doesn't adjust V(s) between decoder steps. (one value per sequence)\n", 25 | "\n", 26 | "There's a more general way of doing the same thing: learned baselines, also known as __advantage actor-critic__.\n", 27 | "\n", 28 | "There are two main ways to apply that:\n", 29 | "- __naive way__: compute V(s) once per training example.\n", 30 | " - This only requires additional 1-unit linear dense layer that grows out of encoder, estimating V(s)\n", 31 | " - (implement this to get main points)\n", 32 | "- __every step__: compute V(s) on each decoder step\n", 33 | " - Again it's just an 1-unit dense layer (no nonlinearity), but this time it's inside decoder recurrence.\n", 34 | " - (+3 pts additional for this guy)\n", 35 | "\n", 36 | "In both cases, you should train V(s) to minimize squared error $(V(s) - R(s,a))^2$ with R being actual levenshtein.\n", 37 | "You can then use $ A(s,a) = (R(s,a) - const(V(s))) $ for policy gradient.\n", 38 | "\n", 39 | "There's also one particularly interesting approach (+5 additional pts):\n", 40 | "- __combining SCST and actor-critic__:\n", 41 | " - compute baseline $V(s)$ via self-critical sequence training (just like in main assignment)\n", 42 | " - learn correction $ C(s,a_{:t}) = R(s,a) - V(s) $ by minimizing $(R(s,a) - V(s) - C(s,a_{:t}))^2 $\n", 43 | " - use $ A(s,a_{:t}) = R(s,a) - V(s) - const(C(s,a_{:t})) $\n", 44 | "\n", 45 | "\n", 46 | "\n", 47 | "## Implement attention (5+++ pts)\n", 48 | "\n", 49 | "Some seq2seq tasks can benefit from the attention mechanism. In addition to taking the _last_ time-step of encoder hidden state, we can allow decoder to peek on any time-step of his choice.\n", 50 | "\n", 51 | "![img](https://s30.postimg.org/f8um3kt5d/google_seq2seq_attention.gif)\n", 52 | "\n", 53 | "\n", 54 | "#### Recommended steps:\n", 55 | "__1)__ Modify encoder-decoder\n", 56 | "\n", 57 | "Learn to feed the entire encoder into the decoder. 
You can do so by sending encoder rnn layer directly into decoder (make sure there's no `only_return_final=True` for encoder rnn layer).\n", 58 | "\n", 59 | "```\n", 60 | "class decoder:\n", 61 | " ...\n", 62 | " encoder_rnn_input = InputLayer(encoder.rnn.output_shape, name='encoder rnn input for decoder')\n", 63 | " ...\n", 64 | " \n", 65 | "#decoder Recurrence\n", 66 | "rec = Recurrence(...,\n", 67 | " input_nonsequences = {decoder.encoder_rnn_input: encoder.rnn},\n", 68 | " )\n", 69 | "\n", 70 | "```\n", 71 | "\n", 72 | "For starters, you can take it's last tick (via SliceLayer) inside the decoder step and feed it as input to make sure it works.\n", 73 | "\n", 74 | "__2)__ Implement attention mechanism\n", 75 | "\n", 76 | "Next thing we'll need is to implement the math of attention.\n", 77 | "\n", 78 | "The simplest way to do so is to write a special layer. We gave you a prototype and some tests below.\n", 79 | "\n", 80 | "__3)__ Use attention inside decoder\n", 81 | "\n", 82 | "That's almost it! Now use `AttentionLayer` inside the decoder and feed it to back to lstm/gru/rnn (see code demo below).\n", 83 | "\n", 84 | "Train the full network just like you did before attention.\n", 85 | "\n", 86 | "__More points__ will be awwarded for comparing learning results of attention Vs no attention.\n", 87 | "\n", 88 | "__Bonus bonus:__ visualize attention vectors (>= +3 points)\n", 89 | "\n", 90 | "The best way to make sure your attention actually works is to visualize it.\n", 91 | "\n", 92 | "A simple way to do so is to obtain attention vectors from each tick (values __right after softmax__, not the layer outputs) and drawing those as images.\n", 93 | "\n", 94 | "#### step-by-step guide:\n", 95 | "- split AttentionLayer into two layers: _\"from start to softmax\"_ and _\"from softmax to output\"_\n", 96 | "- add outputs of the first layer to recurrence's `tracked_outputs`\n", 97 | "- compile a function that computes them\n", 98 | "- plt.imshow(them)\n", 99 | "\n", 100 | "\n" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "collapsed": true 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "import numpy as np\n", 112 | "import theano,lasagne\n", 113 | "import theano.tensor as T\n", 114 | "from lasagne import init\n", 115 | "from lasagne.layers import *" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "class AttentionLayer(MergeLayer):\n", 125 | " def __init__(self,decoder_h,encoder_rnn):\n", 126 | " #sanity checks\n", 127 | " assert len(decoder_h.output_shape)==2,\"please feed decoder 1 step activation as first param \"\n", 128 | " assert len(encoder_rnn.output_shape)==3, \"please feed full encoder rnn sequence as second param\"\n", 129 | " \n", 130 | " self.decoder_num_units = decoder_h.output_shape[-1]\n", 131 | " self.encoder_num_units = encoder.output_shape[-1]\n", 132 | "\n", 133 | " #Here you should initialize all trainable parameters.\n", 134 | " #\n", 135 | " \n", 136 | " #use this syntax:\n", 137 | " self.add_param(spec=init.Normal(std=0.01), #or other initializer\n", 138 | " shape=,\n", 139 | " name='')\n", 140 | " \n", 141 | " \n", 142 | " MergeLayer.__init__(self,[decoder_h,encoder_rnn],name=\"attention\")\n", 143 | " \n", 144 | " \n", 145 | " def get_output_shape_for(self,input_shapes,**kwargs):\n", 146 | " \"\"\"return matrix of shape [batch_size, encoder num units]\"\"\"\n", 147 | " return 
(None,self.encoder_num_units)\n", 148 | " \n", 149 | " def get_output_for(self,inputs,**kwargs):\n", 150 | " \"\"\"\n", 151 | " takes (decoder_h, encoder_seq)\n", 152 | " decoder_h has shape [batch_size, decoder num_units]\n", 153 | " encoder_seq has shape [batch_size, sequence_length, encoder num_units]\n", 154 | " \n", 155 | " returns attention output: matrix of shape [batch_size, encoder num units]\n", 156 | " \n", 157 | " please read comments carefully before you start implementing\n", 158 | " \"\"\"\n", 159 | " decoder_h,encoder_seq = inputs\n", 160 | " \n", 161 | " #get symbolic batch-size / seq length. Also don't forget self.decoder_num_units above\n", 162 | " batch_size,seq_length,_ = tuple(encoder_seq.shape)\n", 163 | " \n", 164 | " #here's a recommended step-by-step guide for attention mechanism. \n", 165 | " #You are free to ignore it alltogether if you so wish\n", 166 | " \n", 167 | " #we repeat decoder activations to allign with encoder\n", 168 | " decoder_h_repeated = \n", 170 | " \n", 171 | " # ^--shape=[batch,seq_length,decoder_n_units]\n", 172 | " \n", 173 | " encoder_and_decoder_together = \n", 174 | " # ^--shape=[batch,seq_length,enc_n_units+dec_n_units]\n", 175 | " \n", 176 | " #here we flatten the tensor to simplify\n", 177 | " encoder_and_decoder_flat = T.reshape(encoder_and_decoder_together,(-1,encoder_and_decoder_together.shape[-1]))\n", 178 | " # ^--shape=[batch*seq_length,enc_n_units+dec_n_units]\n", 179 | " \n", 180 | " #here you use encoder_and_decoder_flat and some learned weights to predict attention logits\n", 181 | " #don't use softmax yet\n", 182 | " \n", 183 | " attention_logits_flat = \n", 184 | " # ^--shape=[batch*seq_length,1]\n", 185 | " \n", 186 | " \n", 187 | " #here we reshape flat logits back into correct form\n", 188 | " assert attention_logits_flat.ndim==2\n", 189 | " attention_logits = attention_logits_flat.reshape((batch_size,seq_length))\n", 190 | " # ^--shape=[batch,seq_length]\n", 191 | " \n", 192 | " #here we apply softmax :)\n", 193 | " attention = T.nnet.softmax(attention_logits)\n", 194 | " # ^--shape=[batch,seq_length]\n", 195 | " \n", 196 | " #here we compute output\n", 197 | " output = (attention[:,:,None]*encoder_seq).sum(axis=1) #sum over seq_length\n", 198 | " # ^--shape=[batch,enc_n_units]\n", 199 | " \n", 200 | " return output\n" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "#demo code\n", 210 | "\n", 211 | "from numpy.random import randn\n", 212 | "\n", 213 | "dec_h_prev = InputLayer((None,50),T.constant(randn(5,50)),name='decoder h mock')\n", 214 | "\n", 215 | "enc = InputLayer((None,None,32),T.constant(randn(5,20,32)),name='encoder sequence mock')\n", 216 | "\n", 217 | "attention = AttentionLayer(dec_h_prev,enc)\n", 218 | "\n", 219 | "#now you can use attention as additonal input to your decoder\n", 220 | "#LSTMCell(prev_cell,prev_out,input_or_inputs=(usual_input,attention))\n", 221 | "\n", 222 | "\n", 223 | "#sanity check\n", 224 | "demo_output = get_output(attention).eval()\n", 225 | "print 'actual shape:',demo_output.shape\n", 226 | "assert demo_output.shape == (5,32)\n", 227 | "assert np.isfinite(demo_output)\n", 228 | "\n" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": { 235 | "collapsed": true 236 | }, 237 | "outputs": [], 238 | "source": [] 239 | } 240 | ], 241 | "metadata": { 242 | "kernelspec": { 243 | "display_name": "Python 2", 244 | "language": "python", 245 
| "name": "python2" 246 | }, 247 | "language_info": { 248 | "codemirror_mode": { 249 | "name": "ipython", 250 | "version": 2 251 | }, 252 | "file_extension": ".py", 253 | "mimetype": "text/x-python", 254 | "name": "python", 255 | "nbconvert_exporter": "python", 256 | "pygments_lexer": "ipython2", 257 | "version": "2.7.13" 258 | } 259 | }, 260 | "nbformat": 4, 261 | "nbformat_minor": 2 262 | } 263 | -------------------------------------------------------------------------------- /week8_scst/voc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Vocab: 4 | def __init__(self, tokens, bos="__BOS__", eos="__EOS__", sep=''): 5 | """ 6 | A special class that handles tokenizing and detokenizing 7 | """ 8 | assert bos in tokens, eos in tokens 9 | self.tokens = tokens 10 | self.token_to_ix = {t:i for i,t in enumerate(tokens)} 11 | 12 | self.bos = bos 13 | self.bos_ix = self.token_to_ix[bos] 14 | self.eos = eos 15 | self.eos_ix = self.token_to_ix[eos] 16 | self.sep = sep 17 | 18 | def __len__(self): 19 | return len(self.tokens) 20 | 21 | @staticmethod 22 | def from_lines(lines, bos="__BOS__", eos="__EOS__", sep=''): 23 | flat_lines = sep.join(list(lines)) 24 | flat_lines = list(flat_lines.split(sep)) if sep != '' else list(flat_lines) 25 | tokens = list(set(sep.join(flat_lines))) 26 | tokens = [t for t in tokens if t not in (bos,eos) and len(t) != 0] 27 | tokens = [bos,eos] + tokens 28 | return Vocab(tokens,bos,eos,sep) 29 | 30 | def tokenize(self,string): 31 | """converts string to a list of tokens""" 32 | tokens = list(filter(len,string.split(self.sep))) \ 33 | if self.sep != '' else list(string) 34 | return [self.bos] + tokens + [self.eos] 35 | 36 | def to_matrix(self, lines, max_len=None): 37 | """ 38 | convert variable length token sequences into fixed size matrix 39 | example usage: 40 | >>>print( as_matrix(words[:3],source_to_ix)) 41 | [[15 22 21 28 27 13 -1 -1 -1 -1 -1] 42 | [30 21 15 15 21 14 28 27 13 -1 -1] 43 | [25 37 31 34 21 20 37 21 28 19 13]] 44 | """ 45 | max_len = max_len or max(map(len, lines)) + 2 # 2 for bos and eos 46 | 47 | matrix = np.zeros((len(lines), max_len), dtype='int32') + self.eos_ix 48 | for i, seq in enumerate(lines): 49 | tokens = self.tokenize(seq) 50 | row_ix = list(map(self.token_to_ix.get, tokens))[:max_len] 51 | matrix[i, :len(row_ix)] = row_ix 52 | 53 | return matrix 54 | 55 | def to_lines(self, matrix, crop=True): 56 | """ 57 | Convert matrix of token ids into strings 58 | :param matrix: matrix of tokens of int32, shape=[batch,time] 59 | :param crop: if True, crops BOS and EOS from line 60 | :return: 61 | """ 62 | lines = [] 63 | for line_ix in map(list,matrix): 64 | if crop: 65 | if line_ix[0] == self.bos_ix: 66 | line_ix = line_ix[1:] 67 | if self.eos_ix in line_ix: 68 | line_ix = line_ix[:line_ix.index(self.eos_ix)] 69 | line = self.sep.join(self.tokens[i] for i in line_ix) 70 | lines.append(line) 71 | return lines 72 | -------------------------------------------------------------------------------- /week9_policy_II/README.md: -------------------------------------------------------------------------------- 1 | * [__slides #1 (trpo)__](https://docs.google.com/presentation/d/15Z_AVBsO9VuOSZ5uY-Q4by3tHKiRSENchhAKHhCxIOc/present?token=AC4w5VgM6o7lCOmwtNFI3lfzyPv2PHOpRQ%3A1511795215658&includes_info_params=1#slide=id.g1d8d5bc58c_0_4) 2 | * [__slides #2 (dpg)__](https://yadi.sk/i/uV6IA-C23UTn7c) 3 | 4 | ## Materials 5 | This section covers some steroids for policy gradient methods, along with 
a cool general trick called trust regions. 6 | 7 | * Lecture on NPG and TRPO by J. Schulman - [video](https://www.youtube.com/watch?v=_t5fpZuuf-4) 8 | * Alternative lecture on TRPO and open problems by... J. Schulman - [video](https://www.youtube.com/watch?v=gb5Q2XL5c8A) 9 | * Our videos: [lecture](https://yadi.sk/i/OP0B1BEj3UcmW9), [seminar(pytorch)](https://yadi.sk/i/D8mHrKM63UcmWh) [seminar(theano)](https://yadi.sk/i/b0ol2gUV3HiKKJ) (russian) 10 | * Original articles - [TRPO](https://arxiv.org/abs/1502.05477), [NPG](https://papers.nips.cc/paper/2073-a-natural-policy-gradient.pdf) 11 | 12 | ## Practice 13 | Go to `seminar_TRPO_<framework>.ipynb` for the framework of your choice and follow the instructions in the notebook. 14 | 15 | 16 | ## More: Reinforcement learning in large/continuous action spaces 17 | While you already know algorithms that can work with continuous action spaces, it can't hurt to learn something more specialized. 18 | * Lecture by J. Schulman - [video](https://www.youtube.com/watch?v=jmMsNQ2eug4) 19 | * Q-learning with normalized advantage functions - [article](https://arxiv.org/abs/1603.00748), [code1](https://github.com/carpedm20/NAF-tensorflow), [code2](http://bit.ly/2qx2087) 20 | * Deterministic policy gradient - [article](https://arxiv.org/pdf/1512.07679.pdf), [post+code](https://yanpanlau.github.io/2016/10/11/Torcs-Keras.html) 21 | * Stochastic value gradient - [article](https://arxiv.org/abs/1510.09142) 22 | * Embedding large discrete action spaces for RL - [article](https://arxiv.org/pdf/1512.07679.pdf) 23 | * Lecture by A. Seleznev, 5vision (russian) - [video](https://www.youtube.com/watch?v=j1L2FnanXPo&t=119m45s) 24 | 25 | -------------------------------------------------------------------------------- /xvfb: -------------------------------------------------------------------------------- 1 | #taken from https://gist.github.com/jterrace/2911875 2 | XVFB=/usr/bin/Xvfb 3 | XVFBARGS=":1 -screen 0 1024x768x24 -ac +extension GLX +render -noreset" 4 | PIDFILE=./xvfb.pid 5 | case "$1" in 6 | start) 7 | echo -n "Starting virtual X frame buffer: Xvfb" 8 | start-stop-daemon --start --quiet --pidfile $PIDFILE --make-pidfile --background --exec $XVFB -- $XVFBARGS 9 | echo "." 10 | ;; 11 | stop) 12 | echo -n "Stopping virtual X frame buffer: Xvfb" 13 | start-stop-daemon --stop --quiet --pidfile $PIDFILE 14 | echo "."
15 | ;; 16 | restart) 17 | $0 stop 18 | $0 start 19 | ;; 20 | *) 21 | echo "Usage: /etc/init.d/xvfb {start|stop|restart}" 22 | exit 1 23 | esac 24 | 25 | exit 0 26 | -------------------------------------------------------------------------------- /yet_another_week/README.md: -------------------------------------------------------------------------------- 1 | __Our slides:__ __[inverse/imitation rl](https://yadi.sk/i/ngB_BcNx3UggK6);__ __[multi-agent 101](https://yadi.sk/i/XrFgmdCy3Vtd4k);__ __[multi-agent 102](https://docs.google.com/presentation/d/1AiSZnWGHWU_34QZ0fqdCGIXWUX_bC-0zz3kc3VVcPCA/edit?usp=sharing);__ __[hierarchical rl](https://yadi.sk/i/LkNiKxMz3Vtcr3)__ 2 | 3 | 4 | This week contains several sections covering advanced topics in RL, along with less advanced topics that we couldn't squeeze into the main track. 5 | 6 | ## Other 7 | * Learning by imitation - [video](https://www.youtube.com/watch?v=kl_G95uKTHw), [assignment](http://rll.berkeley.edu/deeprlcourse/docs/hw1.pdf) (berkeley cs294) 8 | * Inverse reinforcement learning 9 | * Lecture by Chelsea Finn - [video](https://www.youtube.com/watch?v=d9DlQSJQAoI) 10 | * Udacity videos - [video](https://www.youtube.com/watch?v=h7uGyBcIeII) 11 | * Distributional RL - [video](https://www.youtube.com/watch?v=bsuvM1jO-4w) 12 | * Knowledge transfer in RL - [video](https://www.youtube.com/watch?v=Hx4XpVdJOI0) (berkeley cs294) 13 | * Hierarchical reinforcement learning 14 | * Cool article (Fe__U__dal networks) - [arxiv](https://arxiv.org/abs/1703.01161) 15 | * Short lecture by Roy Fox - [video](https://www.youtube.com/watch?v=x_QjJry0hTc) 16 | * Multi-Agent reinforcement learning 17 | * Lecture by Thore Graepel - the role of multi-agent learning in AI research - [video](https://www.youtube.com/watch?v=CvL-KV3IBcM) 18 | * Lecture by Balaraman Ravindran - [video](https://www.youtube.com/watch?v=K5MlmO0UJtI) 19 | 20 | ## A list of lists 21 | * [awesome_rl](https://github.com/aikorea/awesome-rl/) - a curated list of resources dedicated to reinforcement learning. 22 | * [junhyukoh's list](https://github.com/junhyukoh/deep-reinforcement-learning-papers) 23 | * [muupan's list](https://github.com/muupan/deep-reinforcement-learning-papers) 24 | * Courses: 25 | * [CS294: deep reinforcement learning](http://rll.berkeley.edu/deeprlcourse/) 26 | * [Silver's RL course](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html) 27 | * [Sutton's book, 2nd edition](http://incompleteideas.net/sutton/book/the-book-2nd.html) 28 | * [Implementations of many basic RL algorithms (raw and/or tensorflow)](https://github.com/dennybritz/reinforcement-learning) 29 | * Reddit: [General ML](https://www.reddit.com/r/MachineLearning/), [RL](https://www.reddit.com/r/reinforcementlearning/), [CS294](https://www.reddit.com/r/berkeleydeeprlcourse/) 30 | * [This great link you could have contributed] 31 | 32 | -------------------------------------------------------------------------------- /yet_another_week/_resource/README.md: -------------------------------------------------------------------------------- 1 | This is a utility folder to store images and other resources used in notebooks.
2 | -------------------------------------------------------------------------------- /yet_another_week/_resource/a3c_scheme.odp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/a3c_scheme.odp -------------------------------------------------------------------------------- /yet_another_week/_resource/conv_salary_architecture.odp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/conv_salary_architecture.odp -------------------------------------------------------------------------------- /yet_another_week/_resource/conv_salary_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/conv_salary_architecture.png -------------------------------------------------------------------------------- /yet_another_week/_resource/do_something_scst.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/do_something_scst.png -------------------------------------------------------------------------------- /yet_another_week/_resource/dqn_arch.odp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/dqn_arch.odp -------------------------------------------------------------------------------- /yet_another_week/_resource/dqn_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/dqn_arch.png -------------------------------------------------------------------------------- /yet_another_week/_resource/env_pool.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/env_pool.png -------------------------------------------------------------------------------- /yet_another_week/_resource/exp_replay.odp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/exp_replay.odp -------------------------------------------------------------------------------- /yet_another_week/_resource/exp_replay.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/exp_replay.png -------------------------------------------------------------------------------- /yet_another_week/_resource/nerd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/nerd.png 
-------------------------------------------------------------------------------- /yet_another_week/_resource/nnet_arch.odp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/nnet_arch.odp -------------------------------------------------------------------------------- /yet_another_week/_resource/nnet_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/nnet_arch.png -------------------------------------------------------------------------------- /yet_another_week/_resource/pomdp_arch.odp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/pomdp_arch.odp -------------------------------------------------------------------------------- /yet_another_week/_resource/pomdp_arch.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/pomdp_arch.pdf -------------------------------------------------------------------------------- /yet_another_week/_resource/pomdp_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/pomdp_arch.png -------------------------------------------------------------------------------- /yet_another_week/_resource/pomdp_img1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/pomdp_img1.jpg -------------------------------------------------------------------------------- /yet_another_week/_resource/pomdp_img2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/pomdp_img2.jpg -------------------------------------------------------------------------------- /yet_another_week/_resource/pomdp_img3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/pomdp_img3.jpg -------------------------------------------------------------------------------- /yet_another_week/_resource/qlearning_scheme.odp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/qlearning_scheme.odp -------------------------------------------------------------------------------- /yet_another_week/_resource/qlearning_scheme.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/qlearning_scheme.png -------------------------------------------------------------------------------- 
/yet_another_week/_resource/rollout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/rollout.png -------------------------------------------------------------------------------- /yet_another_week/_resource/scheme.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/scheme.png -------------------------------------------------------------------------------- /yet_another_week/_resource/target_net.odp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/target_net.odp -------------------------------------------------------------------------------- /yet_another_week/_resource/target_net.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/target_net.png -------------------------------------------------------------------------------- /yet_another_week/_resource/training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sshkhr/Practical_RL/b453e4d1cacf8320af6fee9ee6b35b3d24ae19ec/yet_another_week/_resource/training.png -------------------------------------------------------------------------------- /youtube_dl_lectures.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #this script utilizes https://github.com/rg3/youtube-dl/ to download __ENGLISH__ lecture materials in the respective folders. 3 | #you can install youtube-dl via `pip install --upgrade youtube-dl` if you don't have it already. 4 | #WARNING! the full script downloads gigabytes of mp4! 
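#Optional tip: to save bandwidth you can cap the resolution with youtube-dl's format selector,
#for example (illustrative, reusing the week0 lecture URL from below):
# youtube-dl -f 'best[height<=480]' https://www.youtube.com/watch?v=2pWv7GOvuf0 --output week0_intro/Lecture1_Silver.mp4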
5 | 6 | #week0 7 | youtube-dl https://www.youtube.com/watch?v=2pWv7GOvuf0 --output week0_intro/Lecture1_Silver.mp4 8 | youtube-dl https://www.youtube.com/watch?v=lfHX2hHRMVQ --output week0_intro/Lecture2_Silver_optional.mp4 9 | 10 | #week1 11 | youtube-dl https://www.youtube.com/watch?v=aUrX-rP_ss4 --output week1_blackbox/Lecture_Schulman.mp4 12 | 13 | #week2 14 | youtube-dl https://www.youtube.com/watch?v=Nd1-UUMVfz4 --output week2_value_based/Lecture_Silver.mp4 15 | youtube-dl https://www.youtube.com/watch?v=i0o-ui1N35U --output week2_value_based/Alternative_lecture_Abbeel_part1.mp4 16 | youtube-dl https://www.youtube.com/watch?v=Csiiv6WGzKM --output week2_value_based/Alternative_lecture_Abbeel_part2.mp4 17 | youtube-dl https://www.youtube.com/watch?v=IL3gVyJMmhg --output week2_value_based/Alternative_lecture_Schulman.mp4 18 | 19 | #week3 20 | youtube-dl https://www.youtube.com/watch?v=PnHCvfgC_ZA --output week3_model_free/Lecture_Silver_part1.mp4 21 | youtube-dl https://www.youtube.com/watch?v=0g4j2k_Ggc4 --output week3_model_free/Lecture_Silver_part2.mp4 22 | youtube-dl https://www.youtube.com/watch?v=ifma8G7LegE --output week3_model_free/Alternative_lecture_Abbeel.mp4 23 | youtube-dl https://www.youtube.com/watch?v=IL3gVyJMmhg --output week3_model_free/Alternative_lecture_Schulmann.mp4 24 | 25 | #week3.5 26 | youtube-dl https://www.youtube.com/watch?v=uXt8qF2Zzfo --output week4_\[recap\]_deep_learning/Lecture_basics.mp4 27 | youtube-dl https://www.youtube.com/watch?v=FmpDIaiMIeA --output week4_\[recap\]_deep_learning/Lecture_convnets.mp4 28 | youtube-dl https://www.youtube.com/watch?v=OU8I1oJ9HhI --output week4_\[recap\]_deep_learning/Tutorial_theano.mp4 29 | 30 | #week4 31 | youtube-dl https://www.youtube.com/watch?v=UoPei5o4fps --output week4_approx_rl/Lecture_Silver.mp4 32 | youtube-dl https://www.youtube.com/watch?v=h1-pj4Y9-kM --output week4_approx_rl/Lecture_Schulman.mp4 33 | 34 | #week5 35 | youtube-dl https://www.youtube.com/watch?v=sGuiWX07sKw --output week5_explore/Lecture_Silver.mp4 36 | youtube-dl https://www.youtube.com/watch?v=SfCa1HQMkuw --output week5_explore/Lecture_Schulmann.mp4 37 | 38 | #week6 39 | youtube-dl https://www.youtube.com/watch?v=KHZVXao4qXs --output week6_policy_based/Lecture_Silver.mp4 40 | youtube-dl https://www.youtube.com/watch?v=BB-BhTn6DCM --output week6_policy_based/Alternative_lecture_Schulman_part1.mp4 41 | youtube-dl https://www.youtube.com/watch?v=Wnl-Qh2UHGg --output week6_policy_based/Alternative_lecture_Schulman_part2.mp4 42 | 43 | #week6.5 44 | youtube-dl https://www.youtube.com/watch?v=iX5V1WpxxkY --output week7_\[recap\]_rnn/Lecture_cs231.mp4 45 | youtube-dl https://www.youtube.com/watch?v=Ukgii7Yd_cU --output week7_\[recap\]_rnn/Alternative_lecture_nervana.mp4 46 | youtube-dl https://www.youtube.com/watch?v=xK-bzjIQkmM --output week7_\[recap\]_rnn/Alternative_lecture_Bengio.mp4 47 | youtube-dl https://www.youtube.com/watch?v=G5RY_SUJih4 --output week7_\[recap\]_rnn/Bonus_lecture_seq2seq.mp4 48 | 49 | #week7 50 | youtube-dl https://www.youtube.com/watch?v=yCqPMD6coO8 --output week7_pomdp/Lecture_Ng.mp4 51 | 52 | #week8 53 | #TODO 54 | 55 | #week9 56 | youtube-dl https://www.youtube.com/watch?v=_t5fpZuuf-4 --output week9_policy_II/Lecture_Schulmann.mp4 57 | 58 | --------------------------------------------------------------------------------