├── week2 ├── assignment │ ├── run_crawler.sh │ ├── layouts │ │ ├── trappedClassic.lay │ │ ├── minimaxClassic.lay │ │ ├── smallGrid.lay │ │ ├── mediumGrid.lay │ │ ├── testClassic.lay │ │ ├── capsuleClassic.lay │ │ ├── smallClassic.lay │ │ ├── contestClassic.lay │ │ ├── openClassic.lay │ │ ├── mediumClassic.lay │ │ ├── trickyClassic.lay │ │ └── originalClassic.lay │ ├── run_grid.sh │ ├── how2run │ ├── run_pacman.sh │ ├── environment.py │ ├── pacmanAgents.py │ ├── analysis.py │ ├── textDisplay.py │ ├── mdp.py │ ├── keyboardAgents.py │ ├── ghostAgents.py │ ├── featureExtractors.py │ ├── qlearningAgents.py │ ├── layout.py │ ├── learningAgents.py │ ├── graphicsCrawlerDisplay.py │ ├── graphicsUtils.py │ ├── textGridworldDisplay.py │ └── graphicsGridworldDisplay.py ├── homework_tips.md ├── alternative │ └── qlearning.py └── README.md ├── week9 ├── all_states.npy ├── action_rewards.npy ├── README.md └── bayes.py ├── docker ├── run_jupyter.sh ├── README.md └── Dockerfile ├── xvfb ├── Amazon GPU howto.md ├── week3 ├── README.md ├── sarsa.py ├── qlearning.py └── expected_value_sarsa.py ├── .gitignore ├── LICENSE.md ├── week0 └── README.md ├── week1 ├── README.md └── breakout.py ├── Dockerfile ├── week6.5 └── README.md ├── week6 └── README.md ├── week7 ├── rockpaperscissors.py └── README.md ├── youtube_dl_lectures.sh ├── week8 ├── README.md └── 8.2_bonus.ipynb ├── week3.5 ├── mnist.py ├── README.md ├── fix_my_nn.ipynb └── Seminar3.5-approx-qlearning.ipynb ├── week5 └── README.md ├── week4 ├── README.md └── Seminar4.0_recap_approx_qlearning.ipynb └── yet_another_week └── README.md /week2/assignment/run_crawler.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python crawler.py 3 | 4 | -------------------------------------------------------------------------------- /week9/all_states.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Practical_RL/master/week9/all_states.npy -------------------------------------------------------------------------------- /week2/assignment/layouts/trappedClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%% 2 | % P G% 3 | %G%%%%%% 4 | %.... 
% 5 | %%%%%%%% 6 | -------------------------------------------------------------------------------- /week2/assignment/run_grid.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python gridworld.py -a q -k 100 -n 0 -g BookGrid -e 0.5 3 | -------------------------------------------------------------------------------- /week9/action_rewards.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Practical_RL/master/week9/action_rewards.npy -------------------------------------------------------------------------------- /docker/run_jupyter.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | jupyter notebook --no-browser --allow-root --ip 0.0.0.0 3 | 4 | -------------------------------------------------------------------------------- /week2/assignment/layouts/minimaxClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%% 2 | %.P G% 3 | % %.%G%%% 4 | %G %%% 5 | %%%%%%%%% 6 | -------------------------------------------------------------------------------- /week2/assignment/layouts/smallGrid.lay: -------------------------------------------------------------------------------- 1 | %%%%%%% 2 | % P % 3 | % %%% % 4 | % %. % 5 | % %%% % 6 | %. G % 7 | %%%%%%% 8 | -------------------------------------------------------------------------------- /week2/assignment/layouts/mediumGrid.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%% 2 | %P % 3 | % .% . % 4 | % % % 5 | % .% . % 6 | % G% 7 | %%%%%%%% 8 | -------------------------------------------------------------------------------- /week2/assignment/layouts/testClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%% 2 | % . % 3 | %.G.% 4 | % . % 5 | %. .% 6 | % % 7 | % .% 8 | % % 9 | %P .% 10 | %%%%% 11 | -------------------------------------------------------------------------------- /week2/assignment/layouts/capsuleClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%% 2 | %G. G ....% 3 | %.% % %%%%%% %.%%.% 4 | %.%o% % o% %.o%.% 5 | %.%%%.% %%% %..%.% 6 | %..... P %..%G% 7 | %%%%%%%%%%%%%%%%%%%% 8 | -------------------------------------------------------------------------------- /week2/assignment/layouts/smallClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%% 2 | %......%G G%......% 3 | %.%%...%% %%...%%.% 4 | %.%o.%........%.o%.% 5 | %.%%.%.%%%%%%.%.%%.% 6 | %........P.........% 7 | %%%%%%%%%%%%%%%%%%%% 8 | -------------------------------------------------------------------------------- /week2/assignment/layouts/contestClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%% 2 | %o...%........%...o% 3 | %.%%.%.%%..%%.%.%%.% 4 | %...... 
G GG%......% 5 | %.%.%%.%% %%%.%%.%.% 6 | %.%....% ooo%.%..%.% 7 | %.%.%%.% %% %.%.%%.% 8 | %o%......P....%....% 9 | %%%%%%%%%%%%%%%%%%%% 10 | -------------------------------------------------------------------------------- /week2/assignment/how2run: -------------------------------------------------------------------------------- 1 | python pacman.py -p PacmanQAgent -x 5000 -n 5010 -l smallGrid 2 | python pacman.py -p PacmanQAgent -x 10000 -n 10010 -l mediumGrid 3 | python pacman.py -p PacmanQAgent -x 100 -n 110 -l mediumClassic 4 | python gridworld.py -a q -k 50 -n 0 -g BridgeGrid -e 1 5 | python crawler.py 6 | 7 | -------------------------------------------------------------------------------- /week2/assignment/layouts/openClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%%%%%%% 2 | %.. P .... .... % 3 | %.. ... ... ... ... % 4 | %.. ... ... ... ... % 5 | %.. .... .... G % 6 | %.. ... ... ... ... % 7 | %.. ... ... ... ... % 8 | %.. .... .... o% 9 | %%%%%%%%%%%%%%%%%%%%%%%%% 10 | -------------------------------------------------------------------------------- /week2/assignment/layouts/mediumClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%% 2 | %o...%........%....% 3 | %.%%.%.%%%%%%.%.%%.% 4 | %.%..............%.% 5 | %.%.%%.%% %%.%%.%.% 6 | %......%G G%......% 7 | %.%.%%.%%%%%%.%%.%.% 8 | %.%..............%.% 9 | %.%%.%.%%%%%%.%.%%.% 10 | %....%...P....%...o% 11 | %%%%%%%%%%%%%%%%%%%% 12 | -------------------------------------------------------------------------------- /week2/assignment/run_pacman.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python pacman.py -p PacmanQAgent -x 1000 -n 1010 -l smallGrid 3 | 4 | # -x is the amount of training epochs, -n is the total amount of epochs. 5 | # hence, if you want to spend 1337 epochs training and then play 42 for evalution, you will need 6 | # python pacman.py -p PacmanQAgent -x 1337 -n 1379 -l smallGrid 7 | -------------------------------------------------------------------------------- /week2/assignment/layouts/trickyClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%% 2 | %o...%........%...o% 3 | %.%%.%.%%..%%.%.%%.% 4 | %.%.....%..%.....%.% 5 | %.%.%%.%% %%.%%.%.% 6 | %...... GGGG%.%....% 7 | %.%....%%%%%%.%..%.% 8 | %.%....% oo%.%..%.% 9 | %.%....% %%%%.%..%.% 10 | %.%...........%..%.% 11 | %.%%.%.%%%%%%.%.%%.% 12 | %o...%...P....%...o% 13 | %%%%%%%%%%%%%%%%%%%% 14 | -------------------------------------------------------------------------------- /xvfb: -------------------------------------------------------------------------------- 1 | #taken from https://gist.github.com/jterrace/2911875 2 | XVFB=/usr/bin/Xvfb 3 | XVFBARGS=":1 -screen 0 1024x768x24 -ac +extension GLX +render -noreset" 4 | PIDFILE=./xvfb.pid 5 | case "$1" in 6 | start) 7 | echo -n "Starting virtual X frame buffer: Xvfb" 8 | start-stop-daemon --start --quiet --pidfile $PIDFILE --make-pidfile --background --exec $XVFB -- $XVFBARGS 9 | echo "." 10 | ;; 11 | stop) 12 | echo -n "Stopping virtual X frame buffer: Xvfb" 13 | start-stop-daemon --stop --quiet --pidfile $PIDFILE 14 | echo "." 
15 | ;; 16 | restart) 17 | $0 stop 18 | $0 start 19 | ;; 20 | *) 21 | echo "Usage: /etc/init.d/xvfb {start|stop|restart}" 22 | exit 1 23 | esac 24 | 25 | exit 0 26 | -------------------------------------------------------------------------------- /Amazon GPU howto.md: -------------------------------------------------------------------------------- 1 | # How to set up GPU on EC2 instance 2 | 3 | ## Create EC2 instance 4 | 5 | Use `p2.xlarge` instance type and `ami-e00a8180` AMI image. [Details](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html) 6 | 7 | Open ports `22` (ssh) and `80` (http) on your freshly created instance, 8 | you create a [security group](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-network-security.html) 9 | and attach it your instance to get ports open 10 | 11 | ## Launch notebook 12 | 13 | Instance you have created contains all you need: fresh versions of theano, lasagne, CUDA driver and cuDNN, 14 | just lunch ipython and get hands dirty: 15 | 16 | ```bash 17 | $ sudo su 18 | $ export THEANO_FLAGS='cuda.root=/usr/local/cuda,device=gpu,floatX=float32' 19 | $ export PATH=/usr/local/cuda-8.0/bin${PATH:+:${PATH}} 20 | $ jupyter notebook 21 | ``` 22 | 23 | -------------------------------------------------------------------------------- /week2/assignment/layouts/originalClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | %............%%............% 3 | %.%%%%.%%%%%.%%.%%%%%.%%%%.% 4 | %o%%%%.%%%%%.%%.%%%%%.%%%%o% 5 | %.%%%%.%%%%%.%%.%%%%%.%%%%.% 6 | %..........................% 7 | %.%%%%.%%.%%%%%%%%.%%.%%%%.% 8 | %.%%%%.%%.%%%%%%%%.%%.%%%%.% 9 | %......%%....%%....%%......% 10 | %%%%%%.%%%%% %% %%%%%.%%%%%% 11 | %%%%%%.%%%%% %% %%%%%.%%%%%% 12 | %%%%%%.% %.%%%%%% 13 | %%%%%%.% %%%% %%%% %.%%%%%% 14 | % . %G GG G% . % 15 | %%%%%%.% %%%%%%%%%% %.%%%%%% 16 | %%%%%%.% %.%%%%%% 17 | %%%%%%.% %%%%%%%%%% %.%%%%%% 18 | %............%%............% 19 | %.%%%%.%%%%%.%%.%%%%%.%%%%.% 20 | %.%%%%.%%%%%.%%.%%%%%.%%%%.% 21 | %o..%%....... .......%%..o% 22 | %%%.%%.%%.%%%%%%%%.%%.%%.%%% 23 | %%%.%%.%%.%%%%%%%%.%%.%%.%%% 24 | %......%%....%%....%%......% 25 | %.%%%%%%%%%%.%%.%%%%%%%%%%.% 26 | %.............P............% 27 | %%%%%%%%%%%%%%%%%%%%%%%%%%%% 28 | -------------------------------------------------------------------------------- /week3/README.md: -------------------------------------------------------------------------------- 1 | ## Materials 2 | * [__Lecture slides__](https://docviewer.yandex.ru/?url=ya-disk-public%3A%2F%2FG3IXcG62RwNUGSSos%2BuGhtgXNfsBjP9RxUtUfgCffIk%3D%3A%2Flecture3.pdf&name=lecture3.pdf&c=58a61e4fdc8b) 3 | * [Video 1](https://www.youtube.com/watch?v=PnHCvfgC_ZA) by D.Silver (english) (same as last week) 4 | * [Video 2](https://www.youtube.com/watch?v=0g4j2k_Ggc4&t=43s) by D. Silver (english) 5 | * Our [lecture](https://yadi.sk/i/I7XcP6vU3ExNrT), [seminar](https://yadi.sk/i/XbqNQmjm3ExNsq) (russian) 6 | * Blog post on q-learning Vs SARSA - [url](https://studywolf.wordpress.com/2013/07/01/reinforcement-learning-sarsa-vs-q-learning/) 7 | 8 | ## More materials 9 | * Eligibility traces from Sutton's book - [url](http://incompleteideas.net/sutton/book/ebook/node72.html) 10 | 11 | 12 | ## Homework description 13 | 14 | Go to [the notebook](https://github.com/yandexdataschool/Practical_RL/blob/master/week3/homework3.ipynb) and follow instructions from there. 15 | 16 | You will have to modify a few .py files in the meantime (e.g. sarsa.py). 
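If the Q-learning vs SARSA comparison from the materials above feels abstract while you edit those files, here is a minimal illustrative sketch of the three tabular update rules (the dict-based Q-table and the `td_update` signature below are assumptions for illustration only, not the interface used in the assignment files). The methods differ only in which estimate of the next state's value they bootstrap from:

```python
from collections import defaultdict
import numpy as np

Q = defaultdict(float)  # hypothetical Q-table: Q[(state, action)] -> value estimate

def td_update(s, a, r, s_next, next_actions, a_next=None,
              alpha=0.5, gamma=0.99, eps=0.1, method="q_learning"):
    """One TD(0) update; `next_actions` lists the actions available in s_next."""
    if not next_actions:                    # terminal state: nothing to bootstrap from
        v_next = 0.0
    else:
        q_next = np.array([Q[(s_next, b)] for b in next_actions])
        if method == "q_learning":          # off-policy: value of the greedy action in s_next
            v_next = q_next.max()
        elif method == "sarsa":             # on-policy: value of the action actually taken in s_next
            v_next = Q[(s_next, a_next)]
        else:                               # expected value sarsa: expectation over the eps-greedy policy
            probs = np.full(len(next_actions), eps / len(next_actions))
            probs[q_next.argmax()] += 1.0 - eps
            v_next = float(probs @ q_next)
    Q[(s, a)] += alpha * (r + gamma * v_next - Q[(s, a)])
```

In short, SARSA learns the value of the epsilon-greedy policy it actually follows, while Q-learning learns the value of the greedy policy regardless of how it explores; expected value SARSA replaces SARSA's sampled next action with an expectation over the behaviour policy.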
17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # node and NPM 2 | npm-debug.log 3 | node_modules 4 | ..bfg-report 5 | 6 | # swap files 7 | *~ 8 | *.swp 9 | 10 | 11 | 12 | env.sh 13 | # Byte-compiled / optimized / DLL files 14 | __pycache__/ 15 | *.py[cod] 16 | 17 | # C extensions 18 | *.so 19 | 20 | # Distribution / packaging 21 | .Python 22 | env/ 23 | bin/ 24 | build/ 25 | develop-eggs/ 26 | dist/ 27 | eggs/ 28 | lib/ 29 | lib64/ 30 | parts/ 31 | sdist/ 32 | var/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg/ 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | 49 | # Translations 50 | *.mo 51 | 52 | # Mr Developer 53 | .mr.developer.cfg 54 | .project 55 | .pydevproject 56 | .idea 57 | .ipynb_checkpoints 58 | 59 | # Rope 60 | .ropeproject 61 | 62 | # Django stuff: 63 | *.log 64 | *.pot 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | docs/tmp* 69 | 70 | # OS X garbage 71 | .DS_Store 72 | 73 | # Debian things 74 | debian/reproducible-experiment-platform 75 | debian/files 76 | *.substvars 77 | *.debhelper.log 78 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Yandex School of Data Analysis and contributors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /week9/README.md: -------------------------------------------------------------------------------- 1 | [this week is still largely under construction] 2 | ## Exploration and exploitation 3 | * [__main__] David Silver lecture on exploration and expoitation - [video](https://www.youtube.com/watch?v=sGuiWX07sKw) 4 | * Alternative lecture by J. 
Schulman - [video](https://www.youtube.com/watch?v=SfCa1HQMkuw) 5 | * Our lecture (russian) - [slides](https://yadi.sk/i/JAeItALT3JmvCL), [video](https://yadi.sk/i/bVHmu9gt3Hi9Ym) 6 | * Our lecture on exploration with bayesian neural networks - [slides](https://yadi.sk/i/OANpkyFn3Jmv4J) 7 | 8 | ## More materials 9 | * "Deep" version: variational information maximizing exploration - [video](https://www.youtube.com/watch?v=sRIjxxjVrnY) 10 | * Same topics in russian - [video](https://yadi.sk/i/_2_0yqeW3HDbcn) 11 | * Lecture covering intrinsically motivated reinforcement learning - https://www.youtube.com/watch?v=aJI_9SoBDaQ 12 | * [Slides](https://yadi.sk/i/8sx42nau3HEYKg) 13 | * Same topics in russian - https://www.youtube.com/watch?v=WCE9hhPbCmc 14 | * Note: UCB-1 is not for bernoulli rewards, but for arbitrary r in [0,1], so you can just scale any reward to [0,1] to obtain a peace of mind. It's derived directly from Hoeffding's inequality. 15 | 16 | ## Seminar 17 | In this seminar, you'll be implementing Bayesian UCB and Thompson Sampling for a contextual bandit based on Bayesian Neural Networks. Everything's in the notebook! 18 | -------------------------------------------------------------------------------- /week0/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Materials 4 | * [__Lecture slides__](https://docviewer.yandex.ru/?url=ya-disk-public%3A%2F%2FG3IXcG62RwNUGSSos%2BuGhtgXNfsBjP9RxUtUfgCffIk%3D%3A%2Flecture0.pdf&name=lecture0.pdf&c=58a61e7da325) 5 | * __[main]__ Video-intro by David Silver (english) - https://www.youtube.com/watch?v=2pWv7GOvuf0 6 | * Optional lecture by David Silver (english) - https://www.youtube.com/watch?v=lfHX2hHRMVQ 7 | * Intro lecture (russian) - https://yadi.sk/i/bMo0qa-x3DoqkS 8 | * Intro seminar (russian) - https://yadi.sk/i/IBq2MjoS3DoqkY 9 | * Deep learning course (if you want to learn in parallel) - https://github.com/yandexdataschool/HSE_deeplearning 10 | 11 | ## Metaheuristics for optimization 12 | * __[recommended]__ - awesome openai post about evolution strategies - [blog post](https://blog.openai.com/evolution-strategies/), [article](https://arxiv.org/abs/1703.03864) 13 | * Guide to genetic algorithms (english) - https://www.youtube.com/watch?v=ejxfTy4lI6I 14 | * Another guide to genetic algorithm (english) - https://www.youtube.com/watch?v=zwYV11a__HQ 15 | * PDF on Differential evolution (english) - http://jvanderw.une.edu.au/DE_1.pdf 16 | * Video on Ant Colony Algorithm (english) - https://www.youtube.com/watch?v=D58nLNLkb0I 17 | * Longer video on Ant Colony Algorithm (english) - https://www.youtube.com/watch?v=xpyKmjJuqhk 18 | 19 | 20 | ## Homework description 21 | * Go to the [notebook](https://github.com/yandexdataschool/Practical_RL/blob/master/week0/frozenlake.ipynb) 22 | * You can find homework and bonus assignment descriptions at the end of that notebook. 23 | -------------------------------------------------------------------------------- /week2/assignment/environment.py: -------------------------------------------------------------------------------- 1 | # environment.py 2 | # -------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 
7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | #!/usr/bin/python 10 | 11 | class Environment: 12 | 13 | def getCurrentState(self): 14 | """ 15 | Returns the current state of the environment 16 | """ 17 | abstract 18 | 19 | def getPossibleActions(self, state): 20 | """ 21 | Returns possible actions the agent 22 | can take in the given state. Can 23 | return the empty list if we are in 24 | a terminal state. 25 | """ 26 | abstract 27 | 28 | def doAction(self, action): 29 | """ 30 | Performs the given action in the current 31 | environment state and updates the environment. 32 | 33 | Returns a (reward, nextState) pair 34 | """ 35 | abstract 36 | 37 | def reset(self): 38 | """ 39 | Resets the current state to the start state 40 | """ 41 | abstract 42 | 43 | def isTerminal(self): 44 | """ 45 | Has the environment entered a terminal 46 | state? This means there are no successors 47 | """ 48 | state = self.getCurrentState() 49 | actions = self.getPossibleActions(state) 50 | return len(actions) == 0 51 | -------------------------------------------------------------------------------- /week1/README.md: -------------------------------------------------------------------------------- 1 | ## Materials: 2 | * [__Lecture slides__](https://docviewer.yandex.ru/?url=ya-disk-public%3A%2F%2FG3IXcG62RwNUGSSos%2BuGhtgXNfsBjP9RxUtUfgCffIk%3D%3A%2Flecture1.pdf&name=lecture1.pdf&c=58a61ec9256c) 3 | * Lecture and seminar videos (russian) - [lecture](https://yadi.sk/i/5yf_4oGI3EDJhJ), [seminar](https://yadi.sk/i/dPsWYMK13EDJj7) _only covering crossentropy method_ 4 | * [__main__] Lecture by J. Schulman with crossentropy method explained (english) - https://www.youtube.com/watch?v=aUrX-rP_ss4&list=PLCTc_C7itk-GaAMxmlChrkPnGKtjz8hv1 5 | * [__main__] Sutton's definitive guide to monte-carlo methods - http://incompleteideas.net/sutton/book/ebook/node50.html 6 | * Article about CEM in general - https://people.smp.uq.edu.au/DirkKroese/ps/eormsCE.pdf 7 | * Article about CEM for optimization - https://people.smp.uq.edu.au/DirkKroese/ps/CEopt.pdf 8 | * Article about CEM in reinforcement learning - http://www.aaai.org/Papers/ICML/2003/ICML03-068.pdf 9 | 10 | ## Homework description 11 | * Just follow the [notebook](https://github.com/yandexdataschool/Practical_RL/blob/master/week1/crossentropy_method.ipynb) 12 | * During the `CartPole-v0` section (and similar envs), a window will pop up, displaying some game state. The window won't respond to direct input and instead changes each time you call env.render(). Don't force-close this process, just ignore it until you complete the notebook. 13 | * __important__ the current version of gym force-stops the environment after 200 steps even if you don't use env.monitor. 14 | * This may ruin CEM on MountainCar. To avoid this, use gym.make("MountainCar-v0").env 15 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | To simplify the installation process, you can deploy a container (~virtual machine) with all dependencies pre-installed. 2 | 3 | _tl;dr [dockerhub url](https://hub.docker.com/r/justheuristic/practical_rl/)_ 4 | 5 | ## Install Docker 6 | 7 | We recommend you use either native docker (recommended for linux) or kitematic (recommended for windows).
8 | * Installing [kitematic](https://kitematic.com/), a simple interface to docker (all platforms) 9 | * Pure docker: Guide for [windows](https://docs.docker.com/docker-for-windows/), [linux](https://docs.docker.com/engine/installation/), or [macOS](https://docs.docker.com/docker-for-mac/). 10 | 11 | Below are the instructions for both approaches. 12 | 13 | ## Kitematic 14 | Find justheuristic/practical_rl in the search menu. Download and launch the container. 15 | 16 | Click on the "web preview" screen in the top-right __or__ go to settings, ports and find the port at which your jupyter is located, usually 32***. 17 | 18 | ## Native 19 | `docker run -it -v <path-to-your-notebooks>:/notebooks -p <port>:8888 justheuristic/practical_rl sh ../run_jupyter.sh` 20 | 21 | `docker run -it -v /Users/mittov/Documents/shad/semester4/:/notebooks -p 8888:8888 justheuristic/practical_rl sh ../run_jupyter.sh` 22 | 23 | ## Manual 24 | Build the container 25 | 26 | `$ docker build -t rl .` 27 | 28 | 29 | Run it 30 | 31 | `$ docker run --rm -it -v <path-to-your-notebooks>:/notebooks -p <port>:8888 rl` 32 | 33 | examples: 34 | 35 | `$ docker run --rm -it -v /Users/mittov/Documents/shad/semester4/:/notebooks -p 8888:8888 rl` 36 | 37 | Copy the token from the console and open 38 | http://localhost:8888/?token=<token> 39 | -------------------------------------------------------------------------------- /week1/breakout.py: -------------------------------------------------------------------------------- 1 | """Auxiliary files for those who wanted to solve breakout with CEM or policy gradient""" 2 | import numpy as np 3 | import gym 4 | from scipy.misc import imresize 5 | from gym.core import Wrapper 6 | from gym.spaces.box import Box 7 | 8 | def make_breakout(): 9 | """creates breakout env with all preprocessing done for you""" 10 | return PreprocessAtari(gym.make("BreakoutDeterministic-v0")) 11 | 12 | class PreprocessAtari(Wrapper): 13 | def __init__(self,env,height=64,width=64, 14 | crop=lambda img: img[34:34+160],n_frames=4): 15 | """A gym wrapper that reshapes, crops and scales image into the desired shapes""" 16 | super(PreprocessAtari, self).__init__(env) 17 | self.img_size = (height,width) 18 | self.crop=crop 19 | self.observation_space = Box(0.0, 1.0, [n_frames,height,width]) 20 | self.framebuffer = np.zeros([n_frames,height,width]) 21 | def reset(self): 22 | """resets breakout, returns initial frames""" 23 | self.framebuffer = np.zeros_like(self.framebuffer) 24 | self.update_buffer(self.env.reset()) 25 | return self.framebuffer 26 | def step(self,action): 27 | """plays breakout for 1 step, returns 4-frame buffer""" 28 | new_img,r,done,info = self.env.step(action) 29 | self.update_buffer(new_img) 30 | return self.framebuffer,r,done,info 31 | 32 | ###image processing### 33 | 34 | def update_buffer(self,img): 35 | img = self.preproc_image(img) 36 | self.framebuffer = np.vstack([img[None], self.framebuffer[:-1]]) 37 | 38 | def preproc_image(self, img): 39 | """what happens to the observation""" 40 | img = self.crop(img) 41 | img = imresize(img, self.img_size).mean(-1) 42 | img = img.astype('float32')/255.
43 | return img 44 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM andrewosh/binder-base 2 | MAINTAINER Alexander Panin 3 | USER root 4 | 5 | RUN echo "deb http://archive.ubuntu.com/ubuntu trusty-backports main restricted universe multiverse" >> /etc/apt/sources.list 6 | RUN apt-get -qq update 7 | 8 | RUN apt-get install -y gcc g++ wget unzip 9 | RUN apt-get install -y libopenblas-dev liblapack-dev libsdl2-dev libboost-all-dev 10 | RUN apt-get install -y cmake zlib1g-dev libjpeg-dev 11 | RUN apt-get install -y xvfb libav-tools xorg-dev python-opengl 12 | RUN apt-get -y install swig3.0 13 | RUN ln -s /usr/bin/swig3.0 /usr/bin/swig 14 | 15 | 16 | USER main 17 | 18 | RUN pip install --upgrade pip 19 | RUN pip install --upgrade sklearn tqdm 20 | RUN pip install --upgrade gym[all] 21 | RUN pip install --upgrade https://github.com/Theano/Theano/archive/master.zip 22 | RUN pip install --upgrade https://github.com/Lasagne/Lasagne/archive/master.zip 23 | RUN pip install --upgrade https://github.com/yandexdataschool/AgentNet/archive/master.zip 24 | #RUN pip install --upgrade tensorflow 25 | RUN pip install --upgrade keras 26 | RUN pip install gym_pull 27 | RUN pip install ppaquette-gym-doom 28 | 29 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade pip 30 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade sklearn tqdm 31 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade gym[all] 32 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade https://github.com/Theano/Theano/archive/master.zip 33 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade https://github.com/Lasagne/Lasagne/archive/master.zip 34 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade https://github.com/yandexdataschool/AgentNet/archive/master.zip 35 | #RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade tensorflow 36 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade keras 37 | #TODO py3 doom once it's no longer broken 38 | -------------------------------------------------------------------------------- /week2/assignment/pacmanAgents.py: -------------------------------------------------------------------------------- 1 | # pacmanAgents.py 2 | # --------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 
7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | from pacman import Directions 10 | from game import Agent 11 | import random 12 | import game 13 | import util 14 | 15 | class LeftTurnAgent(game.Agent): 16 | "An agent that turns left at every opportunity" 17 | 18 | def getAction(self, state): 19 | legal = state.getLegalPacmanActions() 20 | current = state.getPacmanState().configuration.direction 21 | if current == Directions.STOP: current = Directions.NORTH 22 | left = Directions.LEFT[current] 23 | if left in legal: return left 24 | if current in legal: return current 25 | if Directions.RIGHT[current] in legal: return Directions.RIGHT[current] 26 | if Directions.LEFT[left] in legal: return Directions.LEFT[left] 27 | return Directions.STOP 28 | 29 | class GreedyAgent(Agent): 30 | def __init__(self, evalFn="scoreEvaluation"): 31 | self.evaluationFunction = util.lookup(evalFn, globals()) 32 | assert self.evaluationFunction != None 33 | 34 | def getAction(self, state): 35 | # Generate candidate actions 36 | legal = state.getLegalPacmanActions() 37 | if Directions.STOP in legal: legal.remove(Directions.STOP) 38 | 39 | successors = [(state.generateSuccessor(0, action), action) for action in legal] 40 | scored = [(self.evaluationFunction(state), action) for state, action in successors] 41 | bestScore = max(scored)[0] 42 | bestActions = [pair[1] for pair in scored if pair[0] == bestScore] 43 | return random.choice(bestActions) 44 | 45 | def scoreEvaluation(state): 46 | return state.getScore() -------------------------------------------------------------------------------- /week2/assignment/analysis.py: -------------------------------------------------------------------------------- 1 | # analysis.py 2 | # ----------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | ###################### 10 | # ANALYSIS QUESTIONS # 11 | ###################### 12 | 13 | # Change these default values to obtain the specified policies through 14 | # value iteration. 
15 | 16 | def question2a(): 17 | answerDiscount = 0.9 18 | answerNoise = 0.2 19 | answerLivingReward = 0.0 20 | return answerDiscount, answerNoise, answerLivingReward 21 | # If not possible, return 'NOT POSSIBLE' 22 | 23 | def question2b(): 24 | answerDiscount = 0.9 25 | answerNoise = 0.2 26 | answerLivingReward = 0.0 27 | return answerDiscount, answerNoise, answerLivingReward 28 | # If not possible, return 'NOT POSSIBLE' 29 | 30 | def question2c(): 31 | answerDiscount = 0.9 32 | answerNoise = 0.2 33 | answerLivingReward = 0.0 34 | return answerDiscount, answerNoise, answerLivingReward 35 | # If not possible, return 'NOT POSSIBLE' 36 | 37 | def question2d(): 38 | answerDiscount = 0.9 39 | answerNoise = 0.2 40 | answerLivingReward = 0.0 41 | return answerDiscount, answerNoise, answerLivingReward 42 | # If not possible, return 'NOT POSSIBLE' 43 | 44 | def question2e(): 45 | answerDiscount = 0.9 46 | answerNoise = 0.2 47 | answerLivingReward = 0.0 48 | return answerDiscount, answerNoise, answerLivingReward 49 | # If not possible, return 'NOT POSSIBLE' 50 | 51 | if __name__ == '__main__': 52 | print 'Answers to analysis questions:' 53 | import analysis 54 | for q in [q for q in dir(analysis) if q.startswith('question')]: 55 | response = getattr(analysis, q)() 56 | print ' Question %s:\t%s' % (q, str(response)) 57 | -------------------------------------------------------------------------------- /week2/assignment/textDisplay.py: -------------------------------------------------------------------------------- 1 | # textDisplay.py 2 | # -------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 
7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | import pacman, time 10 | 11 | DRAW_EVERY = 1 12 | SLEEP_TIME = 0 # This can be overwritten by __init__ 13 | DISPLAY_MOVES = False 14 | QUIET = False # Supresses output 15 | 16 | class NullGraphics: 17 | def initialize(self, state, isBlue = False): 18 | pass 19 | 20 | def update(self, state): 21 | pass 22 | 23 | def pause(self): 24 | time.sleep(SLEEP_TIME) 25 | 26 | def draw(self, state): 27 | print state 28 | 29 | def finish(self): 30 | pass 31 | 32 | class PacmanGraphics: 33 | def __init__(self, speed=None): 34 | if speed != None: 35 | global SLEEP_TIME 36 | SLEEP_TIME = speed 37 | 38 | def initialize(self, state, isBlue = False): 39 | self.draw(state) 40 | self.pause() 41 | self.turn = 0 42 | self.agentCounter = 0 43 | 44 | def update(self, state): 45 | numAgents = len(state.agentStates) 46 | self.agentCounter = (self.agentCounter + 1) % numAgents 47 | if self.agentCounter == 0: 48 | self.turn += 1 49 | if DISPLAY_MOVES: 50 | ghosts = [pacman.nearestPoint(state.getGhostPosition(i)) for i in range(1, numAgents)] 51 | print "%4d) P: %-8s" % (self.turn, str(pacman.nearestPoint(state.getPacmanPosition()))),'| Score: %-5d' % state.score,'| Ghosts:', ghosts 52 | if self.turn % DRAW_EVERY == 0: 53 | self.draw(state) 54 | self.pause() 55 | if state._win or state._lose: 56 | self.draw(state) 57 | 58 | def pause(self): 59 | time.sleep(SLEEP_TIME) 60 | 61 | def draw(self, state): 62 | print state 63 | 64 | def finish(self): 65 | pass 66 | -------------------------------------------------------------------------------- /week2/assignment/mdp.py: -------------------------------------------------------------------------------- 1 | # mdp.py 2 | # ------ 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | import random 10 | 11 | class MarkovDecisionProcess: 12 | 13 | def getStates(self): 14 | """ 15 | Return a list of all states in the MDP. 16 | Not generally possible for large MDPs. 17 | """ 18 | abstract 19 | 20 | def getStartState(self): 21 | """ 22 | Return the start state of the MDP. 23 | """ 24 | abstract 25 | 26 | def getPossibleActions(self, state): 27 | """ 28 | Return list of possible actions from 'state'. 29 | """ 30 | abstract 31 | 32 | def getTransitionStatesAndProbs(self, state, action): 33 | """ 34 | Returns list of (nextState, prob) pairs 35 | representing the states reachable 36 | from 'state' by taking 'action' along 37 | with their transition probabilities. 38 | 39 | Note that in Q-Learning and reinforcment 40 | learning in general, we do not know these 41 | probabilities nor do we directly model them. 42 | """ 43 | abstract 44 | 45 | def getReward(self, state, action, nextState): 46 | """ 47 | Get the reward for the state, action, nextState transition. 48 | 49 | Not available in reinforcement learning. 50 | """ 51 | abstract 52 | 53 | def isTerminal(self, state): 54 | """ 55 | Returns true if the current state is a terminal state. By convention, 56 | a terminal state has zero future rewards. Sometimes the terminal state(s) 57 | may have no possible actions. 
It is also common to think of the terminal 58 | state as having a self-loop action 'pass' with zero reward; the formulations 59 | are equivalent. 60 | """ 61 | abstract 62 | 63 | 64 | -------------------------------------------------------------------------------- /week6.5/README.md: -------------------------------------------------------------------------------- 1 | ## Materials 2 | * [Slides](https://yadi.sk/i/-Iqdhg483GDyoN) 3 | * CS231 lecture on RNNs - https://www.youtube.com/watch?v=iX5V1WpxxkY 4 | * Our [lecture](https://yadi.sk/i/XHmT5hO53GcCKV), [seminar](https://yadi.sk/i/19twHESN3GcGKQ) 5 | * [alternative] Brief lecture on RNN by nervana - https://www.youtube.com/watch?v=Ukgii7Yd_cU 6 | * [alternative] More detailed lecture by Y. Bengio - https://www.youtube.com/watch?v=xK-bzjIQkmM 7 | * Great reading by Karpathy - http://karpathy.github.io/2015/05/21/rnn-effectiveness/ 8 | * LSTM explained in detail by colah - http://colah.github.io/posts/2015-08-Understanding-LSTMs/ 9 | 10 | ## More materials 11 | * Seq2seq lecture - https://www.youtube.com/watch?v=G5RY_SUJih4 12 | * "Awesome rnn" entry point - https://github.com/kjw0612/awesome-rnn 13 | * OpenAI research on sentiment analysis that sheds some light on what's inside LSTM language model. 14 | 15 | # Homework description 16 | 17 | You guessed, two options 18 | 19 | ### Lasagne option 20 | 21 | Follow the [first notebook](https://github.com/yandexdataschool/Practical_RL/blob/master/week6.5/char_rnn.ipynb) and implement a simple character-level RNN with pure lasagne. The homework part __(4 points)__ is at the very end of that notebook. 22 | 23 | Proceed with [seq2seq](https://github.com/yandexdataschool/Practical_RL/blob/master/week6.5/seq2seq.ipynb) notebook for the second part of homework assignment __(6 points)__. 24 | 25 | ### Alternative 26 | 27 | In this assignment, you will need to implement two things __(pts are same)__: 28 | * A generative RNN model for one of datasets below or for your custom dataset (anything from clickbait to pokemon names) 29 | * A conditional generative model for either [formula]->[common_name] task for molecules dataset below or image captioning [or similar custom dataset]. 30 | 31 | Some helper materials: 32 | * CS231 rnn [assignment](http://cs231n.github.io/assignments2016/assignment3/) 33 | * "Deep models for text and sequences" section of [this course](https://www.udacity.com/course/deep-learning--ud730) 34 | 35 | 36 | ### Datasets 37 | - Names: https://github.com/yandexdataschool/HSE_deeplearning/blob/master/week4/names 38 | - Molecules: https://yadi.sk/d/sYZnG5hK33ktL4 39 | - Questions: https://yadi.sk/d/Dn68_NFx3GBSc8 40 | 41 | -------------------------------------------------------------------------------- /week6/README.md: -------------------------------------------------------------------------------- 1 | ## Materials 2 | * [Slides](https://docviewer.yandex.ru/?url=ya-disk-public%3A%2F%2FG3IXcG62RwNUGSSos%2BuGhtgXNfsBjP9RxUtUfgCffIk%3D%3A%2Flecture6.pdf&name=lecture6.pdf&c=58c876c4863a) 3 | * Video lecture by D. Silver - https://www.youtube.com/watch?v=KHZVXao4qXs 4 | * Our [lecture](https://yadi.sk/i/I3M09HKQ3GKBiP), [seminar](https://yadi.sk/i/8f9NX_E73GKBkT) 5 | * Alternative lecture by J. Schulman part 1 - https://www.youtube.com/watch?v=BB-BhTn6DCM 6 | * Alternative lecture by J. 
Schulman part 2 - https://www.youtube.com/watch?v=Wnl-Qh2UHGg 7 | 8 | 9 | ## More materials 10 | * Generalizing log-derivative trick - http://blog.shakirm.com/2015/11/machine-learning-trick-of-the-day-5-log-derivative-trick/ 11 | * Combining policy gradient and q-learning - https://arxiv.org/abs/1611.01626 12 | * Bayesian perspective on why reparameterization & logderivative tricks matter (Vetrov's take) - https://www.sdsj.ru/slides/Vetrov.pdf 13 | 14 | 15 | ## Homework 16 | 17 | As usual, "lasagne way" and "other way" 18 | 19 | #### Lasagne way 20 | 21 | First go to the Seminar6.0 notebook and implement a vanilla REINFORCE algorithm from scratch. Follow up by playing with advantage actor-critic in Seminar 6.1 - just follow the steps you'll find in the notebook. 22 | 23 | #### Other way 24 | 25 | This week's task is to implement REINFORCE on any continuous state space env (simplest being CartPole-v0) and advantage actor-critic on LunarLander-v2. 26 | 27 | You will find some helpful materials here: 28 | * Tensorflow similar assignment: [cs294 assignment 4](https://github.com/berkeleydeeprlcourse/homework/blob/master/hw4/homework.md) 29 | 30 | 31 | _[copy-pasted section]_ 32 | 33 | We recommend you upload your results to OpenAI gym and fit your solution in a notebook (ipython/torch/r) unless your framework is incompatible with that. In the latter case, please supply us with some notes on what code lies where. 34 | 35 | Again, we recommend you read the lasagne/agentnet assignments briefly to get a grasp of what parameters to start from. 36 | 37 | Bonus assignments remain exactly the same as in the first track. 38 | 39 | Blindly copy-pasting code from any publicly available demos will result in us interrogating you about every significant line of code to make sure you at least understand (and regret) what you copy-pasted. 40 | 41 | 42 | -------------------------------------------------------------------------------- /week7/rockpaperscissors.py: -------------------------------------------------------------------------------- 1 | """ 2 | Toy game for explaining how to work with POMDPs 3 | """ 4 | import gym 5 | from gym import spaces 6 | from gym.utils import seeding 7 | import numpy as np 8 | 9 | class RockPaperScissors(gym.Env): 10 | """ 11 | Rock-paper-scissors game against an imperfect adversary. 12 | Your opponent operates in sequences of 3-7 actions. 13 | There are 5 such pre-defined sequences. 14 | Once the enemy finishes its current sequence, it picks the next one at random from the 5 pre-defined sequences. 15 | 16 | Your observation is the enemy's last turn: 17 | - [1,0,0] for rock 18 | - [0,1,0] for paper 19 | - [0,0,1] for scissors 20 | 21 | This game is a toy environment to play with recurrent networks in RL.
22 | """ 23 | #codes of rock, papes and scissors respectively 24 | codes = np.eye(3) 25 | 26 | #list of possible sequences 27 | sequences = ( 28 | (0,1,2,0,1,2), 29 | (1,0,0,1,1), 30 | (2,2,2), 31 | (2,2,1,1,0,0), 32 | (0,0,1,2,1,0,0) 33 | ) 34 | #reward for [i-th] action against [j-th] enemy reaction 35 | reward = ( 36 | # r p s 37 | ( 0, -1, 1), #r 38 | ( 1, 0,-1), #p 39 | (-1, 1, 0), #s 40 | ) 41 | 42 | def __init__(self): 43 | self.action_space = spaces.Discrete(3) 44 | self.observation_space = spaces.Box(0,1,3) 45 | self.reset() 46 | 47 | def get_observation(self): 48 | return self.codes[self.current_sequence[self.current_position]] 49 | 50 | def new_sequence(self): 51 | self.current_sequence = np.random.choice(self.sequences) 52 | self.current_position = 0 53 | 54 | ###public methods 55 | def reset(self): 56 | self.new_sequence() 57 | return self.get_observation() 58 | 59 | def step(self, action): 60 | assert self.action_space.contains(action) 61 | 62 | self.current_position+=1 63 | if self.current_position >= len(self.current_sequence): 64 | self.new_sequence() 65 | 66 | enemy_action = self.current_sequence[self.current_position] 67 | reward = self.reward[action][enemy_action] 68 | return self.get_observation(), reward, False, {} 69 | 70 | def render(*args,**kwargs): 71 | return 0 72 | 73 | -------------------------------------------------------------------------------- /youtube_dl_lectures.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #this script utilizes https://github.com/rg3/youtube-dl/ to download lecture materials in the respective folders. 3 | #you can install youtube-dl via `pip install --upgrade youtube-dl` if you don't have it already. 4 | 5 | #WARNING! the full script downloads gigabytes of mp4! 
6 | 7 | #week0 8 | youtube-dl https://www.youtube.com/watch?v=2pWv7GOvuf0 --output week0/Lecture1_Silver.mp4 9 | youtube-dl https://www.youtube.com/watch?v=lfHX2hHRMVQ --output week0/Lecture2_Silver_optional.mp4 10 | 11 | #week1 12 | youtube-dl https://www.youtube.com/watch?v=aUrX-rP_ss4 --output week1/Lecture_Schulman.mp4 13 | 14 | #week2 15 | youtube-dl https://www.youtube.com/watch?v=PnHCvfgC_ZA --output week2/Lecture_Silver.mp4 16 | youtube-dl https://www.youtube.com/watch?v=ifma8G7LegE --output week2/Alternative_lecture_Abbeel.mp4 17 | youtube-dl https://www.youtube.com/watch?v=IL3gVyJMmhg --output week2/Alternative_lecture_Schulman.mp4 18 | 19 | #week3 20 | youtube-dl https://www.youtube.com/watch?v=0g4j2k_Ggc4 --output week3/Lecture_Silver.mp4 21 | 22 | #week3.5 23 | youtube-dl https://www.youtube.com/watch?v=uXt8qF2Zzfo --output week3.5/Lecture_basics.mp4 24 | youtube-dl https://www.youtube.com/watch?v=FmpDIaiMIeA --output week3.5/Lecture_convnets.mp4 25 | youtube-dl https://www.youtube.com/watch?v=OU8I1oJ9HhI --output week3.5/Tutorial_theano.mp4 26 | 27 | #week4 28 | youtube-dl https://www.youtube.com/watch?v=UoPei5o4fps --output week4/Lecture_Silver.mp4 29 | 30 | #week5 31 | youtube-dl https://www.youtube.com/watch?v=h1-pj4Y9-kM --output week5/Lecture_Schulman.mp4 32 | 33 | #week6 34 | youtube-dl https://www.youtube.com/watch?v=KHZVXao4qXs --output week6/Lecture_Silver.mp4 35 | youtube-dl https://www.youtube.com/watch?v=BB-BhTn6DCM --output week6/Alternative_lecture_Schulman_part1.mp4 36 | youtube-dl https://www.youtube.com/watch?v=Wnl-Qh2UHGg --output week6/Alternative_lecture_Schulman_part2.mp4 37 | 38 | #week6.5 39 | youtube-dl https://www.youtube.com/watch?v=iX5V1WpxxkY --output week6.5/Lecture_cs231.mp4 40 | youtube-dl https://www.youtube.com/watch?v=Ukgii7Yd_cU --output week6.5/Alternative_lecture_nervana.mp4 41 | youtube-dl https://www.youtube.com/watch?v=xK-bzjIQkmM --output week6.5/Alternative_lecture_Bengio.mp4 42 | youtube-dl https://www.youtube.com/watch?v=G5RY_SUJih4 --output week6.5/Bonus_lecture_seq2seq.mp4 43 | 44 | 45 | -------------------------------------------------------------------------------- /week2/homework_tips.md: -------------------------------------------------------------------------------- 1 | ### __Pacman features__ 2 | 3 | Try to solve larger grids for pacman setup. 4 | * python pacman.py -p PacmanQAgent -x N_TRAIN_GAMES -n N_TOTAL_GAMES -l __mediumGrid__ 5 | * python pacman.py -p PacmanQAgent -x N_TRAIN_GAMES -n N_TOTAL_GAMES -l __mediumClassic__ 6 | 7 | Even if you adjust N_TRAIN_GAMES to 10^5 and N_TOTAL_GAMES to 10^5+100 (100 last games are for test), pacman won't solve those environments 8 | 9 | The problem with those environments is that they have a large amount of unique states. 
However, you can devise a smaller environment state by choosing different observation parameters, e.g.: 10 | * distance and direction to nearest ghost 11 | * where is nearest food 12 | * 'center of mass' of all food points (and variance, and whatever) 13 | * is there a wall in each direction 14 | * and anything else you see fit 15 | 16 | Here's how to get this information from [state](https://github.com/yandexdataschool/Practical_RL/blob/master/week2/assignment/pacman.py#L49), 17 | * Get pacman position: [state.getPacmanPosition()](https://github.com/yandexdataschool/Practical_RL/blob/master/week2/assignment/pacman.py#L128) 18 | * Is there a wall at (x,y)?: [state.hasWall(x,y)](https://github.com/yandexdataschool/Practical_RL/blob/master/week2/assignment/pacman.py#L189) 19 | * Get ghost positions: [state.getGhostPositions()](https://github.com/yandexdataschool/Practical_RL/blob/master/week2/assignment/pacman.py#L144) 20 | * Get all food positions: [state.getCapsules()](https://github.com/yandexdataschool/Practical_RL/blob/master/week2/assignment/pacman.py#L153) 21 | 22 | You can call those methods anywhere you see state. 23 | * e.g. in [agent.getValue(state)](https://github.com/yandexdataschool/Practical_RL/blob/master/week2/assignment/qlearningAgents.py#L52) 24 | * Defining a function that extracts all features and calling it in [getQValue](https://github.com/yandexdataschool/Practical_RL/blob/master/week2/assignment/qlearningAgents.py#L38) and [setQValue](https://github.com/yandexdataschool/Practical_RL/blob/master/week2/assignment/qlearningAgents.py#L44) is probably enough. 25 | * You can also change agent parameters. The simplest way is to hard-code them in [PacmanQAgent](https://github.com/yandexdataschool/Practical_RL/blob/master/week2/assignment/qlearningAgents.py#L140) 26 | 27 | Also, don't forget to optimize ```learning_rate```, ```discount``` and ```epsilon``` params of model, this may also help to solve this env. 28 | -------------------------------------------------------------------------------- /week8/README.md: -------------------------------------------------------------------------------- 1 | ## Materials 2 | * [Slides](https://yadi.sk/i/7TkZUDkt3GoPXE) 3 | * Our [lecture](https://yadi.sk/i/-U5w4NpJ3H5TWD), [seminar](https://yadi.sk/i/W3N7-6is3H5TWN) 4 | * The only relevant video-lecture we could find - [video](https://www.youtube.com/watch?v=2tKNpzUvDc4 ) 5 | * Will hopefully record our lecture in english soon! 6 | * Self-critical sequence traning [original article](https://arxiv.org/abs/1612.00563) 7 | 8 | 9 | ## More materials 10 | * An [awesome post](http://distill.pub/2016/augmented-rnns/) explaining attention and long-term memory models. 11 | * [BLEU](http://www.aclweb.org/anthology/P02-1040.pdf) and [CIDEr](https://arxiv.org/pdf/1411.5726.pdf) articles. 12 | * Image captioning 13 | * MSCOCO captioning [challenge](http://mscoco.org/dataset/#captions-challenge2015) 14 | * Captioning baseline [notebook](https://github.com/yandexdataschool/HSE_deeplearning/blob/master/week7/captioning_solution_ars.ipynb) 15 | * Other articles on reinforcement learning for natural language: 16 | * [task-oriented conversation system](https://arxiv.org/abs/1703.07055) 17 | * [generating dialogues](https://arxiv.org/abs/1606.01541) 18 | * [sequential adversarial networks](https://arxiv.org/abs/1609.05473) (a.k.a. 
SeqGAN) 19 | * A large overview for machine translation (touching on RL, including RL failures) - [article](https://arxiv.org/abs/1609.08144) 20 | * How _not_ to evaluate conversation models - [article](https://arxiv.org/abs/1603.08023) 21 | * Overview of other non-games applications ("that article again") - https://arxiv.org/abs/1701.07274 22 | 23 | ## Homework 24 | 25 | Homework assignment is described in the [main notebook](https://github.com/yandexdataschool/Practical_RL/blob/master/week8/8.1_translation_scst.ipynb). 26 | 27 | It's kinda lengthy, but fear not, that's mostly due to formatting. 28 | 29 | __Other frameworks__: as usual, your task remains the same as in the main track: 30 | - Implement or borrow seq2seq model for the same translation task 31 | * Neat tenworflow [repo](https://github.com/cmusphinx/g2p-seq2seq) 32 | * __Important__ - this repo uses simplified phoneme dict - make sure you change preprocessing phase to meaningfully compare results. 33 | - Implement self-critical sequence training ( = basic policy gradient with a special baseline, see notebook) 34 | - Beat the baseline (main notebook: step6) 35 | 36 | Even if you decide to use custom frameworks, it is highly recommended that you reuse evaluation code (e.g. min Levenshtein) from the main notebook to avoid confusion. 37 | 38 | -------------------------------------------------------------------------------- /week3.5/mnist.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import time 4 | 5 | import numpy as np 6 | 7 | __doc__="""taken from https://github.com/Lasagne/Lasagne/blob/master/examples/mnist.py""" 8 | 9 | def load_dataset(): 10 | # We first define a download function, supporting both Python 2 and 3. 11 | if sys.version_info[0] == 2: 12 | from urllib import urlretrieve 13 | else: 14 | from urllib.request import urlretrieve 15 | 16 | def download(filename, source='http://yann.lecun.com/exdb/mnist/'): 17 | print("Downloading %s" % filename) 18 | urlretrieve(source + filename, filename) 19 | 20 | # We then define functions for loading MNIST images and labels. 21 | # For convenience, they also download the requested files if needed. 22 | import gzip 23 | 24 | def load_mnist_images(filename): 25 | if not os.path.exists(filename): 26 | download(filename) 27 | # Read the inputs in Yann LeCun's binary format. 28 | with gzip.open(filename, 'rb') as f: 29 | data = np.frombuffer(f.read(), np.uint8, offset=16) 30 | # The inputs are vectors now, we reshape them to monochrome 2D images, 31 | # following the shape convention: (examples, channels, rows, columns) 32 | data = data.reshape(-1, 1, 28, 28) 33 | # The inputs come as bytes, we convert them to float32 in range [0,1]. 34 | # (Actually to range [0, 255/256], for compatibility to the version 35 | # provided at http://deeplearning.net/data/mnist/mnist.pkl.gz.) 36 | return data / np.float32(256) 37 | 38 | def load_mnist_labels(filename): 39 | if not os.path.exists(filename): 40 | download(filename) 41 | # Read the labels in Yann LeCun's binary format. 42 | with gzip.open(filename, 'rb') as f: 43 | data = np.frombuffer(f.read(), np.uint8, offset=8) 44 | # The labels are vectors of integers now, that's exactly what we want. 45 | return data 46 | 47 | # We can now download and read the training and test set images and labels. 
48 | X_train = load_mnist_images('train-images-idx3-ubyte.gz') 49 | y_train = load_mnist_labels('train-labels-idx1-ubyte.gz') 50 | X_test = load_mnist_images('t10k-images-idx3-ubyte.gz') 51 | y_test = load_mnist_labels('t10k-labels-idx1-ubyte.gz') 52 | 53 | # We reserve the last 10000 training examples for validation. 54 | X_train, X_val = X_train[:-10000], X_train[-10000:] 55 | y_train, y_val = y_train[:-10000], y_train[-10000:] 56 | 57 | # We just return all the arrays in order, as expected in main(). 58 | # (It doesn't matter how we do this as long as we can read them again.) 59 | return X_train, y_train, X_val, y_val, X_test, y_test 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /week3.5/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Materials 3 | * [__Lecture slides__](https://yadi.sk/i/yAO2AJ3M3EKP8g) 4 | * Lecture on deep learning (russian) - https://www.youtube.com/watch?v=8008XQzoUEs 5 | * Seminar on theano (russian) - https://yadi.sk/i/54STsEBVpubkn 6 | * Intro to neural nets and backprop (english) - https://www.youtube.com/watch?v=uXt8qF2Zzfo 7 | * Intro to convnets (english) - https://www.youtube.com/watch?v=FmpDIaiMIeA 8 | * Theano tutorial from Lamblin (english) - https://www.youtube.com/watch?v=OU8I1oJ9HhI 9 | 10 | ## Bonus materials 11 | * Karpathy's course on deep learning (english) - http://cs231n.github.io/ 12 | * Nuts and Bolts of deep learning by Andrew Ng (english) - https://www.youtube.com/watch?v=F1ka6a13S9I 13 | * Deep learning demystified - https://www.youtube.com/watch?v=Q9Z20HCPnww 14 | * Karpathy's lecture on deep learning for computer vision - https://www.youtube.com/watch?v=u6aEYuemt0M 15 | * Our humble DL course: [HSE'autumn16](https://github.com/yandexdataschool/HSE_deeplearning), [Skoltech/YSDA'spring16](https://github.com/ddtm/dl-course/) courses on deep learning (english). 16 | * Srsly, just google `"deep learning %s"%s for s in what_you_want_to_know`. 17 | 18 | ## Homework 19 | 20 | If you are already familiar with lasagne or you are super-good with tensorflow/pytorch/similar, pick one of the _alternative_ options. Otherwise we highly recommend the first one as we'll need convolutional networks soon enough. 21 | 22 | * [__recommended__](https://github.com/yandexdataschool/Practical_RL/blob/master/week3.5/Seminar3.5-en-mnist.ipynb) go to Seminar3.5-*-mnist.ipynb and follow the instructions (ends with lasagne MNIST classifier) 23 | 24 | 25 | * [__alternative task__](https://github.com/yandexdataschool/Practical_RL/blob/master/week3.5/Seminar3.5-approx-qlearning.ipynb) go to Seminar3.5-approx-q-learning.ipynb and follow the instructions (ends with simple NN for q-learning) 26 | 27 | * [__alternative frameworks__] 28 | The equivalent of recommended track would be 29 | * [tensorflow] learning through this [google course](https://www.udacity.com/course/deep-learning--ud730) from start till "Convolutional neural networks" (inclusive). 30 | * [manual/other] surviving past assignment2 of [cs231](http://cs231n.github.io/) 31 | 32 | * [__alternative task and frameworks__] 33 | Implement the simple q-learning network that solves `CartPole-v0`. You're not required to implement experience replay / any advanced stuff, just set sgd learning rate to a small enough number (10^-4) and pray that trains smoothly. 
34 | 35 | Here's a convenient translation to tensorflow: [notebook](https://github.com/yandexdataschool/Practical_RL/blob/master/week3.5/Seminar3.5-approx-qlearning-tf.ipynb) 36 | 37 | Agent can maintain low reward for long enough, but it should at least show some progress by the end of the default loop. 38 | 39 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | LABEL maintainer "Alexander Panin , Dmitry Mittov " 3 | 4 | 5 | RUN echo "deb http://archive.ubuntu.com/ubuntu trusty-backports main restricted universe multiverse" >> /etc/apt/sources.list && \ 6 | apt-get -qq update && \ 7 | apt-get install -y cmake \ 8 | wget \ 9 | unzip \ 10 | git \ 11 | zlib1g-dev \ 12 | libjpeg-dev \ 13 | xvfb \ 14 | libav-tools \ 15 | xorg-dev \ 16 | python-opengl \ 17 | swig3.0 \ 18 | python-dev \ 19 | python3-dev \ 20 | python-pip \ 21 | python3-pip \ 22 | libopenblas-dev \ 23 | liblapack-dev \ 24 | libsdl2-dev \ 25 | libboost-all-dev \ 26 | gcc \ 27 | g++ && \ 28 | ln -s /usr/bin/swig3.0 /usr/bin/swig 29 | 30 | RUN pip install --upgrade pip \ 31 | scipy \ 32 | numpy && \ 33 | pip install --upgrade sklearn \ 34 | jupyter \ 35 | tqdm \ 36 | gym[all] \ 37 | matplotlib \ 38 | seaborn && \ 39 | pip install --upgrade https://github.com/Theano/Theano/archive/master.zip \ 40 | https://github.com/Lasagne/Lasagne/archive/master.zip \ 41 | https://github.com/yandexdataschool/AgentNet/archive/master.zip \ 42 | tensorflow \ 43 | keras 44 | 45 | RUN pip install --upgrade gym_pull ppaquette-gym-doom 46 | 47 | 48 | RUN pip3 install --upgrade pip \ 49 | scipy \ 50 | numpy && \ 51 | pip3 install --upgrade sklearn \ 52 | jupyter \ 53 | tqdm \ 54 | gym[all] \ 55 | matplotlib \ 56 | seaborn && \ 57 | pip3 install --upgrade https://github.com/Theano/Theano/archive/master.zip \ 58 | https://github.com/Lasagne/Lasagne/archive/master.zip \ 59 | https://github.com/yandexdataschool/AgentNet/archive/master.zip \ 60 | tensorflow \ 61 | keras && \ 62 | python3 -m ipykernel.kernelspec 63 | 64 | 65 | EXPOSE 8888 66 | VOLUME /notebooks 67 | WORKDIR /notebooks 68 | 69 | COPY run_jupyter.sh / 70 | CMD ["/run_jupyter.sh"] 71 | -------------------------------------------------------------------------------- /week7/README.md: -------------------------------------------------------------------------------- 1 | # Materials 2 | [lecture slides](https://yadi.sk/d/RGx8BUCr3Gq6DC) 3 | 4 | _Links on all articles mentioned during the lecture could be found in "References" at the very end of the presentation slides. All other interesing links which contribute to the topic of POMDP are presented below_ 5 | 6 | ## Basics 7 | * Our [lecture](https://yadi.sk/i/pMdw-_uI3Gke7Z) and [seminar](https://yadi.sk/i/s1EEuEVd3Gke8k) (russian) 8 | * A lecture on basics by Andrew NG (english, LQ) - [video](https://www.youtube.com/watch?v=yCqPMD6coO8) 9 | * A lecture on lecture by 5vision (russian) - [video](https://www.youtube.com/watch?v=_dkaynuKUFE) 10 | * _[alternative]_ Chalkboard-style 2-part lecture by B. Ravindran. - [part1](https://www.youtube.com/watch?v=9G_KevA8DFY), [part2](https://www.youtube.com/watch?v=dMOUp7YzUpQ) 11 | * _[alternative]_ Yet another mini-lecture touching on POMDP by S.S. 
Baveja - [video](https://www.youtube.com/watch?v=SE56KgF7aVc) 12 | 13 | ## POMDP Learning 14 | * DRQN lecture by Fritz448 (russian) - [video](https://www.youtube.com/watch?v=bE5DIJvZexc) 15 | * [Data-efficient learning in continuous POMDPs](https://arxiv.org/abs/1602.02523v1) 16 | * [Managing wind farms with bayesian POMDP](http://ascelibrary.org/doi/abs/10.1061/(ASCE)CP.1943-5487.0000390) 17 | * [Bayesian learning and decision-making in dynamic environments](http://www.jmlr.org/papers/volume12/ross11a/ross11a.pdf) 18 | 19 | ## POMDP Planning 20 | * [Introduction to planning in POMDP, ch.: 6](https://www.amazon.com/Decision-Making-Under-Uncertainty-Application/dp/0262029251) 21 | * [Bayes filters in robotics, ch.: 3, 4](https://docs.ufpr.br/~danielsantos/ProbabilisticRobotics.pdf) 22 | * SOTA in scalable approximate __offline__ planning: [SARSOP](http://www.roboticsproceedings.org/rss04/p9.pdf) and [PLEASE](http://www.aaai.org/ocs/index.php/SOCS/SOCS15/paper/viewFile/10686/10627), which is built on top of the former 23 | * SOTA in scalable approximate __online__ planning: [DESPOT](https://arxiv.org/pdf/1609.03250v1.pdf) 24 | * Not SOTA, but a very useful and enlightening __online__ planning approach: [POMCP](https://papers.nips.cc/paper/4031-monte-carlo-planning-in-large-pomdps.pdf) 25 | * [Implementations of SARSOP, DESPOT and MCVI in C++](http://bigbird.comp.nus.edu.sg/pmwiki/farm/appl/) 26 | * Recent approaches combining POMDP planning with learning on top of neural networks: [Predictron](https://openreview.net/pdf?id=BkJsCIcgl), [histogram filter](https://openreview.net/pdf?id=ByvJuTigl) and [QMDP-Net](https://arxiv.org/pdf/1703.06692.pdf) 27 | 28 | 29 | 30 | 31 | --- 32 | 33 | # Homework 34 | 35 | We have a detailed description of the entire lab in the [homework notebook](https://github.com/yandexdataschool/Practical_RL/blob/master/week7/7.3_homework.ipynb) 36 | 37 | The homework is platform- and framework-independent, so pick whichever tools suit you best, but mind how much you will have to implement yourself if you go for non-standard ones. 38 | -------------------------------------------------------------------------------- /week2/assignment/keyboardAgents.py: -------------------------------------------------------------------------------- 1 | # keyboardAgents.py 2 | # ----------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | from game import Agent 10 | from game import Directions 11 | import random 12 | 13 | class KeyboardAgent(Agent): 14 | """ 15 | An agent controlled by the keyboard. 16 | """ 17 | # NOTE: Arrow keys also work.
18 | WEST_KEY = 'a' 19 | EAST_KEY = 'd' 20 | NORTH_KEY = 'w' 21 | SOUTH_KEY = 's' 22 | STOP_KEY = 'q' 23 | 24 | def __init__( self, index = 0 ): 25 | 26 | self.lastMove = Directions.STOP 27 | self.index = index 28 | self.keys = [] 29 | 30 | def getAction( self, state): 31 | from graphicsUtils import keys_waiting 32 | from graphicsUtils import keys_pressed 33 | keys = keys_waiting() + keys_pressed() 34 | if keys != []: 35 | self.keys = keys 36 | 37 | legal = state.getLegalActions(self.index) 38 | move = self.getMove(legal) 39 | 40 | if move == Directions.STOP: 41 | # Try to move in the same direction as before 42 | if self.lastMove in legal: 43 | move = self.lastMove 44 | 45 | if (self.STOP_KEY in self.keys) and Directions.STOP in legal: move = Directions.STOP 46 | 47 | if move not in legal: 48 | move = random.choice(legal) 49 | 50 | self.lastMove = move 51 | return move 52 | 53 | def getMove(self, legal): 54 | move = Directions.STOP 55 | if (self.WEST_KEY in self.keys or 'Left' in self.keys) and Directions.WEST in legal: move = Directions.WEST 56 | if (self.EAST_KEY in self.keys or 'Right' in self.keys) and Directions.EAST in legal: move = Directions.EAST 57 | if (self.NORTH_KEY in self.keys or 'Up' in self.keys) and Directions.NORTH in legal: move = Directions.NORTH 58 | if (self.SOUTH_KEY in self.keys or 'Down' in self.keys) and Directions.SOUTH in legal: move = Directions.SOUTH 59 | return move 60 | 61 | class KeyboardAgent2(KeyboardAgent): 62 | """ 63 | A second agent controlled by the keyboard. 64 | """ 65 | # NOTE: Arrow keys also work. 66 | WEST_KEY = 'j' 67 | EAST_KEY = "l" 68 | NORTH_KEY = 'i' 69 | SOUTH_KEY = 'k' 70 | STOP_KEY = 'u' 71 | 72 | def getMove(self, legal): 73 | move = Directions.STOP 74 | if (self.WEST_KEY in self.keys) and Directions.WEST in legal: move = Directions.WEST 75 | if (self.EAST_KEY in self.keys) and Directions.EAST in legal: move = Directions.EAST 76 | if (self.NORTH_KEY in self.keys) and Directions.NORTH in legal: move = Directions.NORTH 77 | if (self.SOUTH_KEY in self.keys) and Directions.SOUTH in legal: move = Directions.SOUTH 78 | return move 79 | 80 | 81 | -------------------------------------------------------------------------------- /week2/assignment/ghostAgents.py: -------------------------------------------------------------------------------- 1 | # ghostAgents.py 2 | # -------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | from game import Agent 10 | from game import Actions 11 | from game import Directions 12 | import random 13 | from util import manhattanDistance 14 | import util 15 | 16 | class GhostAgent( Agent ): 17 | def __init__( self, index ): 18 | self.index = index 19 | 20 | def getAction( self, state ): 21 | dist = self.getDistribution(state) 22 | if len(dist) == 0: 23 | return Directions.STOP 24 | else: 25 | return util.chooseFromDistribution( dist ) 26 | 27 | def getDistribution(self, state): 28 | "Returns a Counter encoding a distribution over actions from the provided state." 29 | util.raiseNotDefined() 30 | 31 | class RandomGhost( GhostAgent ): 32 | "A ghost that chooses a legal action uniformly at random." 
33 | def getDistribution( self, state ): 34 | dist = util.Counter() 35 | for a in state.getLegalActions( self.index ): dist[a] = 1.0 36 | dist.normalize() 37 | return dist 38 | 39 | class DirectionalGhost( GhostAgent ): 40 | "A ghost that prefers to rush Pacman, or flee when scared." 41 | def __init__( self, index, prob_attack=0.8, prob_scaredFlee=0.8 ): 42 | self.index = index 43 | self.prob_attack = prob_attack 44 | self.prob_scaredFlee = prob_scaredFlee 45 | 46 | def getDistribution( self, state ): 47 | # Read variables from state 48 | ghostState = state.getGhostState( self.index ) 49 | legalActions = state.getLegalActions( self.index ) 50 | pos = state.getGhostPosition( self.index ) 51 | isScared = ghostState.scaredTimer > 0 52 | 53 | speed = 1 54 | if isScared: speed = 0.5 55 | 56 | actionVectors = [Actions.directionToVector( a, speed ) for a in legalActions] 57 | newPositions = [( pos[0]+a[0], pos[1]+a[1] ) for a in actionVectors] 58 | pacmanPosition = state.getPacmanPosition() 59 | 60 | # Select best actions given the state 61 | distancesToPacman = [manhattanDistance( pos, pacmanPosition ) for pos in newPositions] 62 | if isScared: 63 | bestScore = max( distancesToPacman ) 64 | bestProb = self.prob_scaredFlee 65 | else: 66 | bestScore = min( distancesToPacman ) 67 | bestProb = self.prob_attack 68 | bestActions = [action for action, distance in zip( legalActions, distancesToPacman ) if distance == bestScore] 69 | 70 | # Construct distribution 71 | dist = util.Counter() 72 | for a in bestActions: dist[a] = bestProb / len(bestActions) 73 | for a in legalActions: dist[a] += ( 1-bestProb ) / len(legalActions) 74 | dist.normalize() 75 | return dist 76 | -------------------------------------------------------------------------------- /week5/README.md: -------------------------------------------------------------------------------- 1 | ## Materials 2 | * Slides [here](https://yadi.sk/i/P02qoHng3G7oMt) 3 | * Video lecture (esp. second half) by J. 
Schulman - https://www.youtube.com/watch?v=h1-pj4Y9-kM 4 | * Our [lecture](https://yadi.sk/i/yBO0q4mI3GAxYd), [seminar](https://yadi.sk/i/oWC2M5803GAyFB) (russian) 5 | * Article on dueling DQN - https://arxiv.org/pdf/1511.06581.pdf 6 | * Article on double DQN - https://arxiv.org/abs/1509.06461 7 | * Article on prioritized experience replay - https://arxiv.org/abs/1511.05952 8 | * Video on asynchronous methods (Mnih) - https://www.youtube.com/watch?v=9sx1_u2qVhQ 9 | * Article on bootstrapped DQN - https://papers.nips.cc/paper/6501-deep-exploration-via-bootstrapped-dqn.pdf, [summary](http://pemami4911.github.io/paper-summaries/2016/08/16/Deep-exploration.html) 10 | 11 | 12 | ## More materials 13 | * [recommended] An overview of deep reinforcement learning - https://arxiv.org/pdf/1701.07274v1.pdf 14 | * Reinforcement learning architectures list - https://github.com/5vision/deep-reinforcement-learning-networks 15 | * Building a deep q-network from ~scratch (blog) - https://jaromiru.com/2016/09/27/lets-make-a-dqn-theory/ 16 | * Another guide to DQN from ~scratch (blog) - https://rubenfiszel.github.io/posts/rl4j/2016-08-24-Reinforcement-Learning-and-DQN.html 17 | * Article on asynchronous methods in deep RL - https://arxiv.org/abs/1602.01783 18 | * Successor representations for reinforcement learning - [article](https://arxiv.org/abs/1606.02396), [video](https://www.youtube.com/watch?v=kNqXCn7K-BM&feature=youtu.be) 19 | * [recap] Slides on basic DQN, including target networks - https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf 20 | 21 | 22 | ## Homework 23 | 24 | As usual, there is a "lasagne way" and an "other way". 25 | 26 | #### Lasagne way 27 | 28 | Basically, go to [the notebook](https://github.com/yandexdataschool/Practical_RL/blob/master/week5/Seminar5_deep_rl.ipynb) and follow what's inside. 29 | 30 | #### Other way 31 | 32 | This week's task is to implement (and hopefully compare) target networks, double DQN and/or dueling DQN, and to train them on Atari Breakout. 33 | 34 | * Tensorflow template: [cs294 assignment 3](https://github.com/berkeleydeeprlcourse/homework/tree/master/hw3) 35 | 36 | Implementing prioritized experience replay, bootstrapped DQN or any other cool stuff yields bonus points. You can also choose a different environment if you have issues with Breakout, but don't pick anything too complicated. E.g. your DQN will likely _fail_ on Montezuma's Revenge unless you do weird stuff with the reward function. 37 | 38 | We recommend uploading your results to OpenAI gym and fitting your solution in a notebook (ipython/torch/r) unless your framework is incompatible with that. In the latter case, please supply us with some notes on what code lies where. 39 | 40 | Again, we recommend skimming the lasagne/agentnet assignments to get a grasp of what parameters to start from. 41 | 42 | Bonus assignments remain exactly the same as in the first track. 43 | 44 | Blindly copy-pasting code from any publicly available demos will result in us interrogating you about every significant line of code to make sure you at least understand (and regret) what you copy-pasted. 45 | 46 | 47 | -------------------------------------------------------------------------------- /week2/assignment/featureExtractors.py: -------------------------------------------------------------------------------- 1 | # featureExtractors.py 2 | # -------------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes.
The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | "Feature extractors for Pacman game states" 10 | 11 | from game import Directions, Actions 12 | import util 13 | 14 | class FeatureExtractor: 15 | def getFeatures(self, state, action): 16 | """ 17 | Returns a dict from features to counts 18 | Usually, the count will just be 1.0 for 19 | indicator functions. 20 | """ 21 | util.raiseNotDefined() 22 | 23 | class IdentityExtractor(FeatureExtractor): 24 | def getFeatures(self, state, action): 25 | feats = util.Counter() 26 | feats[(state,action)] = 1.0 27 | return feats 28 | 29 | def closestFood(pos, food, walls): 30 | """ 31 | closestFood -- this is similar to the function that we have 32 | worked on in the search project; here its all in one place 33 | """ 34 | fringe = [(pos[0], pos[1], 0)] 35 | expanded = set() 36 | while fringe: 37 | pos_x, pos_y, dist = fringe.pop(0) 38 | if (pos_x, pos_y) in expanded: 39 | continue 40 | expanded.add((pos_x, pos_y)) 41 | # if we find a food at this location then exit 42 | if food[pos_x][pos_y]: 43 | return dist 44 | # otherwise spread out from the location to its neighbours 45 | nbrs = Actions.getLegalNeighbors((pos_x, pos_y), walls) 46 | for nbr_x, nbr_y in nbrs: 47 | fringe.append((nbr_x, nbr_y, dist+1)) 48 | # no food found 49 | return None 50 | 51 | class SimpleExtractor(FeatureExtractor): 52 | """ 53 | Returns simple features for a basic reflex Pacman: 54 | - whether food will be eaten 55 | - how far away the next food is 56 | - whether a ghost collision is imminent 57 | - whether a ghost is one step away 58 | """ 59 | 60 | def getFeatures(self, state, action): 61 | # extract the grid of food and wall locations and get the ghost locations 62 | food = state.getFood() 63 | walls = state.getWalls() 64 | ghosts = state.getGhostPositions() 65 | 66 | features = util.Counter() 67 | 68 | features["bias"] = 1.0 69 | 70 | # compute the location of pacman after he takes the action 71 | x, y = state.getPacmanPosition() 72 | dx, dy = Actions.directionToVector(action) 73 | next_x, next_y = int(x + dx), int(y + dy) 74 | 75 | # count the number of ghosts 1-step away 76 | features["#-of-ghosts-1-step-away"] = sum((next_x, next_y) in Actions.getLegalNeighbors(g, walls) for g in ghosts) 77 | 78 | # if there is no danger of ghosts then add the food feature 79 | if not features["#-of-ghosts-1-step-away"] and food[next_x][next_y]: 80 | features["eats-food"] = 1.0 81 | 82 | dist = closestFood((next_x, next_y), food, walls) 83 | if dist is not None: 84 | # make the distance a number less than one otherwise the update 85 | # will diverge wildly 86 | features["closest-food"] = float(dist) / (walls.width * walls.height) 87 | features.divideAll(10.0) 88 | return features -------------------------------------------------------------------------------- /week2/alternative/qlearning.py: -------------------------------------------------------------------------------- 1 | # qlearningAgents.py 2 | # ------------------ 3 | ## based on http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 4 | 5 | import random,math 6 | 7 | import numpy as np 8 | from collections import defaultdict 9 | 10 | class QLearningAgent(): 11 | """ 12 | Q-Learning Agent 13 | 14 | Instance variables you have access to 15 | - self.epsilon (exploration prob) 16 | - self.alpha (learning rate) 17 | - self.discount (discount rate aka 
gamma) 18 | 19 | Functions you should use 20 | - self.getLegalActions(state) 21 | which returns legal actions for a state 22 | - self.getQValue(state,action) 23 | which returns Q(state,action) 24 | - self.setQValue(state,action,value) 25 | which sets Q(state,action) := value 26 | 27 | !!!Important!!! 28 | NOTE: please avoid using self._qValues directly to make code cleaner 29 | """ 30 | def __init__(self,alpha,epsilon,discount,getLegalActions): 31 | "We initialize agent and Q-values here." 32 | self.getLegalActions= getLegalActions 33 | self._qValues = defaultdict(lambda:defaultdict(lambda:0)) 34 | self.alpha = alpha 35 | self.epsilon = epsilon 36 | self.discount = discount 37 | 38 | def getQValue(self, state, action): 39 | """ 40 | Returns Q(state,action) 41 | """ 42 | return self._qValues[state][action] 43 | 44 | def setQValue(self,state,action,value): 45 | """ 46 | Sets the Qvalue for [state,action] to the given value 47 | """ 48 | self._qValues[state][action] = value 49 | 50 | #---------------------#start of your code#---------------------# 51 | 52 | def getValue(self, state): 53 | """ 54 | Returns max_action Q(state,action) 55 | where the max is over legal actions. 56 | """ 57 | 58 | possibleActions = self.getLegalActions(state) 59 | #If there are no legal actions, return 0.0 60 | if len(possibleActions) == 0: 61 | return 0.0 62 | 63 | "*** YOUR CODE HERE ***" 64 | return 65 | 66 | def getPolicy(self, state): 67 | """ 68 | Compute the best action to take in a state. 69 | 70 | """ 71 | possibleActions = self.getLegalActions(state) 72 | 73 | #If there are no legal actions, return None 74 | if len(possibleActions) == 0: 75 | return None 76 | 77 | best_action = None 78 | 79 | "*** YOUR CODE HERE ***" 80 | best_action = 81 | return best_action 82 | 83 | def getAction(self, state): 84 | """ 85 | Compute the action to take in the current state, including exploration. 86 | 87 | With probability self.epsilon, we should take a random action. 88 | otherwise - the best policy action (self.getPolicy). 89 | 90 | HINT: You might want to use util.flipCoin(prob) 91 | HINT: To pick randomly from a list, use random.choice(list) 92 | 93 | """ 94 | 95 | # Pick Action 96 | possibleActions = self.getLegalActions(state) 97 | action = None 98 | 99 | #If there are no legal actions, return None 100 | if len(possibleActions) == 0: 101 | return None 102 | 103 | #agent parameters: 104 | epsilon = self.epsilon 105 | 106 | "*** YOUR CODE HERE ***" 107 | 108 | return 109 | 110 | def update(self, state, action, nextState, reward): 111 | """ 112 | You should do your Q-Value update here 113 | 114 | NOTE: You should never call this function, 115 | it will be called on your behalf 116 | 117 | 118 | """ 119 | #agent parameters 120 | gamma = self.discount 121 | learning_rate = self.alpha 122 | 123 | "*** YOUR CODE HERE ***" 124 | reference_qvalue = 125 | 126 | updated_qvalue = (1-learning_rate) * self.getQValue(state,action) + learning_rate * reference_qvalue 127 | self.setQValue(state,action,updated_qvalue) 128 | 129 | 130 | #---------------------#end of your code#---------------------# 131 | 132 | 133 | -------------------------------------------------------------------------------- /week3/sarsa.py: -------------------------------------------------------------------------------- 1 | """ 2 | Expected Value SARSA 3 | This file builds upon the same functions as Q-learning agent (qlearning.py). 4 | 5 | [assignment] 6 | The only thing you must implement is the getValue method. 
7 | - Recall that V(s) in SARSA is not the maximal but the expected Q-value. 8 | - The expectation should be done under agent's policy (e-greedy). 9 | 10 | 11 | Here's usage example: 12 | >>>from sarsa import SarsaAgent 13 | 14 | >>>agent = SarsaAgent(alpha=0.1,epsilon=0.25,discount=0.99, 15 | getLegalActions = lambda s: actions_from_that_state) 16 | >>>action = agent.getAction(state) 17 | >>>agent.update(state,action, next_state,reward) 18 | >>>agent.epsilon *= 0.99 19 | """ 20 | import random,math 21 | 22 | import numpy as np 23 | from collections import defaultdict 24 | 25 | class SarsaAgent(): 26 | """ 27 | Classical SARSA agent. 28 | 29 | The two main methods are 30 | - self.getAction(state) - returns agent's action in that state 31 | - self.update(state,action,reward,nextState,nextAction) - returns agent's next action 32 | 33 | Instance variables you have access to 34 | - self.epsilon (exploration prob) 35 | - self.alpha (learning rate) 36 | - self.discount (discount rate aka gamma) 37 | 38 | """ 39 | def __init__(self,alpha,epsilon,discount,getLegalActions): 40 | "We initialize agent and Q-values here." 41 | self.getLegalActions= getLegalActions 42 | self._qValues = defaultdict(lambda:defaultdict(lambda:0)) 43 | self.alpha = alpha 44 | self.epsilon = epsilon 45 | self.discount = discount 46 | 47 | def getQValue(self, state, action): 48 | """ 49 | Returns Q(state,action) 50 | """ 51 | return self._qValues[state][action] 52 | 53 | def setQValue(self,state,action,value): 54 | """ 55 | Sets the Qvalue for [state,action] to the given value 56 | """ 57 | self._qValues[state][action] = value 58 | 59 | #---------------------#start of your code#---------------------# 60 | 61 | def getPolicy(self, state): 62 | """ 63 | Compute the best action to take in a state. 64 | 65 | """ 66 | possibleActions = self.getLegalActions(state) 67 | 68 | #If there are no legal actions, return None 69 | if len(possibleActions) == 0: 70 | return None 71 | 72 | best_action = None 73 | 74 | "*** this code works exactly as Q-learning ***" 75 | best_action = possibleActions[np.argmax([self.getQValue(state, a) for a in possibleActions])] 76 | return best_action 77 | 78 | def getAction(self, state): 79 | """ 80 | Compute the action to take in the current state, including exploration. 81 | 82 | With probability self.epsilon, we should take a random action. 83 | otherwise - the best policy action (self.getPolicy). 
84 | 85 | HINT: You might want to use util.flipCoin(prob) 86 | HINT: To pick randomly from a list, use random.choice(list) 87 | 88 | """ 89 | 90 | # Pick Action 91 | possibleActions = self.getLegalActions(state) 92 | action = None 93 | 94 | #If there are no legal actions, return None 95 | if len(possibleActions) == 0: 96 | return None 97 | 98 | #agent parameters: 99 | epsilon = self.epsilon 100 | 101 | "*** Epsilon-greedy strategy exactly as Q-learning ***" 102 | if np.random.random()<=epsilon: 103 | return random.choice(possibleActions) 104 | else: 105 | action = self.getPolicy(state) 106 | return action 107 | 108 | def update(self, state, action, nextState,nextAction, reward): 109 | """ 110 | You should do your Q-Value update here 111 | 112 | NOTE: You should never call this function, 113 | it will be called on your behalf 114 | 115 | 116 | """ 117 | #agent parameters 118 | gamma = self.discount 119 | learning_rate = self.alpha 120 | 121 | "*** YOUR CODE HERE ***" 122 | reference_qvalue = 123 | 124 | updated_qvalue = (1-learning_rate) * self.getQValue(state,action) + learning_rate * reference_qvalue 125 | 126 | self.setQValue(state,action,updated_qvalue) 127 | 128 | 129 | #---------------------#end of your code#---------------------# 130 | 131 | 132 | -------------------------------------------------------------------------------- /week4/README.md: -------------------------------------------------------------------------------- 1 | ## Materials 2 | * [__lecture slides__](https://docviewer.yandex.ru/?url=ya-disk-public%3A%2F%2FG3IXcG62RwNUGSSos%2BuGhtgXNfsBjP9RxUtUfgCffIk%3D%3A%2Flecture4.pdf&name=lecture4.pdf&c=58b0d2eb4e0f) 3 | * David Silver lecture - https://www.youtube.com/watch?v=UoPei5o4fps&t=3s 4 | * More practical and less theoretical lecture from MIT 6.S191 - https://www.youtube.com/watch?v=xWe58WGWmlk 5 | * Our [lecture](https://yadi.sk/i/AHDU2p_j3FT3nr), [seminar](https://yadi.sk/i/EeUeheri3FT3ra) (russian) 6 | * Understanding approximate q-learning - https://danieltakeshi.github.io/2016/10/31/going-deeper-into-reinforcement-learning-understanding-q-learning-and-linear-function-approximation/ 7 | * Karpathy's post on approximate RL - http://karpathy.github.io/2016/05/31/rl/ 8 | 9 | ## More materials 10 | * __[recommended]__ How to _actually_ do deep reinforcement learning by J. Schulman - http://rll.berkeley.edu/deeprlcourse/docs/nuts-and-bolts.pdf 11 | * interactive demos in your browser: [demo1](http://cs.stanford.edu/people/karpathy/convnetjs/demo/rldemo.html)(karpathy), [demo2](http://janhuenermann.com/projects/learning-to-drive)(Hünermann) 12 | * A guide to deep RL from ~scratch (nervana blog) - https://www.nervanasys.com/demystifying-deep-reinforcement-learning/ 13 | 14 | 15 | ## Homework 16 | 17 | From now on, we introduce an alternative homework track that's not tied to lasagne/agentnet/rllab/any_other_framework. In that track, you'll be tasked with similar problems, but they will not be tied to jupyter notebooks with lasagne networks. 18 | 19 | You can choose whichever track you want, but unless you're expertly familiar with your framework, we recommend you to start by completing the task in lasagne and only then reproduce your solution in your chosen framework. 20 | 21 | 22 | #### Recommended path 23 | 24 | * Step 1 - go to [Seminar4.1](https://github.com/yandexdataschool/Practical_RL/blob/master/week4/Seminar4.1_experience_replay.ipynb), complete it and make sure it reaches the desired reward on Acrobot-v1. 
Then go to the homework section (at the end) and follow the instructions from there. 25 | * Tip - for your network to work properly on Acrobot-v1, please either use non-saturating nonlinearities (elu/leaky_relu/softplus), or normalize observations, or initialize with smaller weights. Otherwise, e.g. sigmoid may get saturated and fail to learn anything. 26 | * Step 2 - go to [Seminar4.2](https://github.com/yandexdataschool/Practical_RL/blob/master/week4/Seminar4.2_conv_agent.ipynb) and make it beat DoomBasic. 27 | 28 | Doom environments are powered by VizDoom (via doom_py), which may require separate installation. If you're using the [docker container](https://github.com/yandexdataschool/Practical_RL/blob/master/docker) or running in binder, the dependency should already be installed. 29 | 30 | To install doom envs manually, follow the instructions at the top of the [Seminar4.2](https://github.com/yandexdataschool/Practical_RL/blob/master/week4/Seminar4.2_conv_agent.ipynb) notebook. 31 | 32 | For example, on python2, ubuntu 14, stardate 2017.02.27, the following was enough: 33 | ``` 34 | apt-get install -y gcc g++ wget unzip libsdl2-dev libboost-all-dev 35 | pip install gym_pull 36 | pip install ppaquette-gym-doom 37 | ``` 38 | 39 | For macOS (OS X), install brew and then 40 | ``` 41 | brew install boost boost-python sdl2 cmake 42 | pip install ppaquette-gym-doom 43 | ``` 44 | 45 | If it simply won't install, pick `BreakoutDeterministic-v0` and try to get an average reward >= +10. 46 | 47 | 48 | #### Alternative frameworks 49 | 50 | The task is to implement approximate Q-learning with experience replay and show that it works on `Acrobot-v1`, `LunarLander-v2` and `ppaquette/DoomBasic-v0` (or other versions of those environments); a minimal replay-buffer sketch is given at the end of this page. 51 | 52 | If you use tensorflow, there's a very convenient [notebook](https://github.com/yandexdataschool/Practical_RL/blob/master/week4/Seminar4.0_recap_approx_qlearning-tf.ipynb) for you to start from (by [Scitator](https://github.com/Scitator)) 53 | 54 | We do, however, recommend skimming the lasagne/agentnet assignments to get a grasp of what parameters to start from. 55 | 56 | You're also encouraged to fit your solution in a notebook (ipython/torch/r) unless your framework is incompatible with that. In the latter case, please supply us with some notes on what code lies where. 57 | 58 | Bonus assignments remain exactly the same as in the first track. 59 | 60 | Blindly copy-pasting code from any publicly available demos will result in us interrogating you about every significant line of code to make sure you at least understand (and regret) what you copy-pasted.
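For the alternative-framework track, the replay buffer itself is framework-agnostic. Below is a minimal sketch of one possible buffer (the class name, capacity and batch size are illustrative choices, not course code); any framework can consume the sampled numpy batches.

```python
# Minimal uniform experience replay buffer (illustration only).
import random
from collections import deque

import numpy as np

class ReplayBuffer:
    def __init__(self, capacity=50000):
        # old transitions are dropped automatically once capacity is reached
        self.storage = deque(maxlen=capacity)

    def __len__(self):
        return len(self.storage)

    def add(self, state, action, reward, next_state, done):
        self.storage.append((state, action, reward, next_state, done))

    def sample(self, batch_size=32):
        batch = random.sample(self.storage, batch_size)
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        return states, actions, rewards, next_states, dones

# typical interaction loop: store every transition, then train on a random batch
# buffer.add(s, a, r, next_s, done)
# if len(buffer) > 1000:
#     s_b, a_b, r_b, ns_b, d_b = buffer.sample(32)
#     ... one gradient step on the TD error of this batch ...
```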
61 | 62 | 63 | -------------------------------------------------------------------------------- /week3.5/fix_my_nn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from lasagne.layers import *\n", 12 | "from lasagne.nonlinearities import *\n", 13 | "from lasagne import init" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": { 20 | "collapsed": false 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "nn = InputLayer([None,3,100,100])\n", 25 | "\n", 26 | "nn = Conv2DLayer(nn,num_filters=512, filter_size=(3,3),\n", 27 | " W = init.Constant(0))\n", 28 | "\n", 29 | "nn = Conv2DLayer(nn,num_filters=128,filter_size=(3,3),\n", 30 | " W = init.Constant(0))\n", 31 | "\n", 32 | "nn = Conv2DLayer(nn,num_filters=32,filter_size=(3,3),\n", 33 | " W = init.Constant(0))\n", 34 | "\n", 35 | "nn = Pool2DLayer(nn,pool_size=(6,6),mode='max')\n", 36 | "\n", 37 | "nn = Conv2DLayer(nn,num_filters=8,filter_size=(10,10),\n", 38 | " W = init.Normal(std=0.01))\n", 39 | "\n", 40 | "nn = Conv2DLayer(nn,num_filters=8,filter_size=(10,10),\n", 41 | " W = init.Normal(std=0.01))\n", 42 | "\n", 43 | "nn = Pool2DLayer(nn,pool_size=(3,3),mode='max')\n", 44 | "\n", 45 | "nn = DenseLayer(nn,512,nonlinearity=softmax)\n", 46 | "\n", 47 | "nn = DropoutLayer(nn,p=0.5)\n", 48 | "\n", 49 | "nn = DenseLayer(nn,512,nonlinearity=softmax)\n", 50 | "\n", 51 | "nn = DenseLayer(nn,10,nonlinearity=sigmoid)\n", 52 | "\n", 53 | "nn = DropoutLayer(nn,p=0.5)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "```\n", 61 | "\n", 62 | "```\n", 63 | "\n", 64 | "```\n", 65 | "\n", 66 | "```\n", 67 | "\n", 68 | "```\n", 69 | "\n", 70 | "```\n", 71 | "\n", 72 | "```\n", 73 | "\n", 74 | "```\n", 75 | "\n", 76 | "```\n", 77 | "\n", 78 | "```\n", 79 | "\n", 80 | "```\n", 81 | "\n", 82 | "```\n", 83 | "\n", 84 | "```\n", 85 | "\n", 86 | "```\n", 87 | "\n", 88 | "```\n", 89 | "\n", 90 | "```\n", 91 | "\n", 92 | "```\n", 93 | "\n", 94 | "```\n", 95 | "\n", 96 | "```\n", 97 | "\n", 98 | "```\n", 99 | "\n", 100 | "```\n", 101 | "\n", 102 | "```\n", 103 | "\n", 104 | "```\n", 105 | "\n", 106 | "```\n", 107 | "\n", 108 | "```\n", 109 | "\n", 110 | "```\n", 111 | "\n", 112 | "```\n", 113 | "\n", 114 | "```\n", 115 | "\n", 116 | "```\n", 117 | "\n", 118 | "```\n", 119 | "\n", 120 | "\n", 121 | "# Book of grudges\n", 122 | "* zero init for weights will cause symmetry effect\n", 123 | "* Too many filters for first 3x3 convolution - will lead to enormous matrix while there's just not enough relevant combinations of 3x3 images (overkill).\n", 124 | "* Usually the further you go, the more filters you need.\n", 125 | "* large filters (10x10 is generally a bad pactice, and you definitely need more than 10 of them\n", 126 | "* the second of 10x10 convolution gets 8x6x6 image as input, so it's technically unable to perform such convolution.\n", 127 | "* Softmax nonlinearity effectively makes only 1 or a few neurons from the entire layer to \"fire\", rendering 512-neuron layer almost useless. Softmax at the output layer is okay though\n", 128 | "* Dropout after probability prediciton is just lame. A few random classes get probability of 0, so your probabilities no longer sum to 1 and crossentropy goes -inf." 
129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "collapsed": true 136 | }, 137 | "outputs": [], 138 | "source": [] 139 | } 140 | ], 141 | "metadata": { 142 | "kernelspec": { 143 | "display_name": "Python [Root]", 144 | "language": "python", 145 | "name": "Python [Root]" 146 | }, 147 | "language_info": { 148 | "codemirror_mode": { 149 | "name": "ipython", 150 | "version": 2 151 | }, 152 | "file_extension": ".py", 153 | "mimetype": "text/x-python", 154 | "name": "python", 155 | "nbconvert_exporter": "python", 156 | "pygments_lexer": "ipython2", 157 | "version": "2.7.12" 158 | } 159 | }, 160 | "nbformat": 4, 161 | "nbformat_minor": 0 162 | } 163 | -------------------------------------------------------------------------------- /week2/README.md: -------------------------------------------------------------------------------- 1 | ## Materials 2 | * [__Lecture slides__](https://docviewer.yandex.ru/?url=ya-disk-public%3A%2F%2FG3IXcG62RwNUGSSos%2BuGhtgXNfsBjP9RxUtUfgCffIk%3D%3A%2Flecture2.pdf&name=lecture2.pdf&c=58a61e22b9fb) 3 | * Our [lecture](https://yadi.sk/i/cVawsPkK3EtGJj),[seminar](https://yadi.sk/i/dQmolwOy3EtGNK) (russian) 4 | * [__main__] Lecture by David Silver (english): https://www.youtube.com/watch?v=PnHCvfgC_ZA 5 | * Alternative lecture by Pieter Abbeel (english): https://www.youtube.com/watch?v=ifma8G7LegE 6 | * Alternative lecture by John Schulmann (english): https://www.youtube.com/watch?v=IL3gVyJMmhg 7 | 8 | ## Bonus materials 9 | * Policy improvement theorems from Sutton book - http://webdocs.cs.ualberta.ca/~sutton/book/ebook/node42.html 10 | * Lecture II by Dan Klein (english): https://www.youtube.com/watch?v=jUoZg513cdE 11 | * Qlearning guide from Habr (russian): https://habrahabr.ru/post/308094/ 12 | * A great turorial/assignment on value-based methods from CS294 - https://github.com/berkeleydeeprlcourse/homework/blob/master/hw2/HW2.ipynb 13 | 14 | ## Homework description: 15 | 16 | For ease of access, we have 2 versions of the same homework. They feature the same algorithmic part but a bit different examples. 17 | 18 | You can pick whichever one you prefer but mind the technical limitations. If you have a python2 on a local machine (NOT in docker), even if it's on Windows, we recommend the ./assignment one. 19 | 20 | ## ./assignment 21 | _this assignment borrows code from awesome [cs188](http://ai.berkeley.edu/project_overview.html)_ 22 | This homework assignment works on __python2 only__. If you stick to py3, consider alternative homework. Or just install it for this homework alone and remove afterwards. 23 | 24 | This homework also requires some physical display (e.g. laptop monitor). It won't work on binder VM / headless server. Please run it on laptop or consider ./alternative 25 | 26 | ### Part I (5 points) 27 | * Go to ./assignment, edit [__qlearningagents.py__](https://github.com/yandexdataschool/Practical_RL/blob/master/week2/assignment/qlearningAgents.py) (see instructions inside) 28 | * Make sure you can tune agent to beat ./run_crawler.sh 29 | * on windows, just run `python crawler.py` from cmd in the project directory 30 | * other ./run* files are mostly for your amusement. 
31 | * ./run_pacman.sh will need more epochs to converge, see [comments](https://github.com/yandexdataschool/Practical_RL/blob/master/week2/assignment/run_pacman.sh) 32 | * on windows, just type `python pacman.py -p PacmanQAgent -x 2000 -n 2010 -l smallGrid` in cmd from the assignment dir 33 | (YSDA/HSE) Please submit only the qlearningAgents.py file and include a brief text report as comments in it. 34 | 35 | ### Part II (5+ points) 36 | _Please make a separate copy of qlearningAgents.py for this assignment_ 37 | 38 | The default tabular q-learning requires an unrealistic amount of experience to learn anything useful on pacman tasks. This is mostly due to the extremely large state space, combining the positions of pacman, the ghosts and all dots. 39 | 40 | To speed up training you will need to implement a preprocessor that extracts new discrete features from the state space. You can design these features to account only for the most important stuff around pacman. This time, it's okay to use environment-specific duct tape :) 41 | 42 | Please read the tips on how to solve this [__here__](https://github.com/yandexdataschool/Practical_RL/blob/master/week2/homework_tips.md). Also, if you find some state spaces that work amazingly well on pacman, feel free to propose a Pull Request with advice 43 | 44 | (HSE/YSDA) Please send us 45 | * The alternative qlearningAgents.py file (and any other files you modified) 46 | * A short description of what you did there 47 | * How to run it. Usually something like `python pacman.py -p PacmanQAgent -x SOMETHING -n SOMETHING -l __mediumClassic__ -SOMETHING SOMETHING ...` 48 | * The end of the train/test log (or even the whole log), including at least the last iteration of learning and the final statistics (especially winrate) 49 | 50 | To get 5 points, your algorithm should solve __mediumGrid__ more than 50% of the time. Creative features and outstanding performance on __mediumClassic__ yield bonus points! 51 | 52 | ## ./alternative 53 | Alternative homework description: 54 | * Go to [the notebook](https://github.com/yandexdataschool/Practical_RL/blob/master/week2/alternative/homework.ipynb) 55 | * The assignment is described there. 56 | * If you use binder/server, see week1 for an example of how to run CartPole and other envs. 57 | 58 | 59 | ### Grading (alternative) 60 | * 5 points for implementing q-learning and testing on taxi 61 | * 5 points for solving CartPole-v0 62 | * bonus tasks listed inside 63 | -------------------------------------------------------------------------------- /week3/qlearning.py: -------------------------------------------------------------------------------- 1 | """ 2 | Q-learning 3 | This file contains the same q-learning agent you implemented in the previous assignment. 4 | The only difference is that it doesn't need any other files with it, so you can use it as a standalone module.
5 | 6 | Here's an example: 7 | >>>from qlearning import QLearningAgent 8 | 9 | >>>agent = QLearningAgent(alpha=0.5,epsilon=0.25,discount=0.99, 10 | getLegalActions = lambda s: actions_from_that_state) 11 | >>>action = agent.getAction(state) 12 | >>>agent.update(state,action, next_state,reward) 13 | >>>agent.epsilon *= 0.99 14 | """ 15 | 16 | import random,math 17 | 18 | import numpy as np 19 | from collections import defaultdict 20 | 21 | class QLearningAgent(): 22 | """ 23 | Q-Learning Agent 24 | 25 | The two main methods are 26 | - self.getAction(state) - returns agent's action in that state 27 | - self.update(state,action,nextState,reward) - returns agent's next action 28 | 29 | Functions you should use 30 | - self.getLegalActions(state) 31 | which returns legal actions for a state 32 | - self.getQValue(state,action) 33 | which returns Q(state,action) 34 | - self.setQValue(state,action,value) 35 | which sets Q(state,action) := value 36 | 37 | !!!Important!!! 38 | NOTE: please avoid using self._qValues directly to make code cleaner 39 | """ 40 | def __init__(self,alpha,epsilon,discount,getLegalActions): 41 | "We initialize agent and Q-values here." 42 | self.getLegalActions= getLegalActions 43 | self._qValues = defaultdict(lambda:defaultdict(lambda:0)) 44 | self.alpha = alpha 45 | self.epsilon = epsilon 46 | self.discount = discount 47 | 48 | def getQValue(self, state, action): 49 | """ 50 | Returns Q(state,action) 51 | """ 52 | return self._qValues[state][action] 53 | 54 | def setQValue(self,state,action,value): 55 | """ 56 | Sets the Qvalue for [state,action] to the given value 57 | """ 58 | self._qValues[state][action] = value 59 | 60 | #---------------------#start of your code#---------------------# 61 | 62 | def getValue(self, state): 63 | """ 64 | Returns max_action Q(state,action) 65 | where the max is over legal actions. 66 | """ 67 | 68 | possibleActions = self.getLegalActions(state) 69 | #If there are no legal actions, return 0.0 70 | if len(possibleActions) == 0: 71 | return 0.0 72 | 73 | "*** YOUR CODE HERE ***" 74 | return max([self.getQValue(state, a) for a in possibleActions]) 75 | 76 | def getPolicy(self, state): 77 | """ 78 | Compute the best action to take in a state. 79 | 80 | """ 81 | possibleActions = self.getLegalActions(state) 82 | 83 | #If there are no legal actions, return None 84 | if len(possibleActions) == 0: 85 | return None 86 | 87 | best_action = None 88 | 89 | best_action = possibleActions[np.argmax([self.getQValue(state, a) for a in possibleActions])] 90 | return best_action 91 | 92 | def getAction(self, state): 93 | """ 94 | Compute the action to take in the current state, including exploration. 95 | 96 | With probability self.epsilon, we should take a random action. 97 | otherwise - the best policy action (self.getPolicy). 
98 | 99 | HINT: You might want to use util.flipCoin(prob) 100 | HINT: To pick randomly from a list, use random.choice(list) 101 | 102 | """ 103 | 104 | # Pick Action 105 | possibleActions = self.getLegalActions(state) 106 | action = None 107 | 108 | #If there are no legal actions, return None 109 | if len(possibleActions) == 0: 110 | return None 111 | 112 | #agent parameters: 113 | epsilon = self.epsilon 114 | 115 | if np.random.random()<=epsilon: 116 | return random.choice(possibleActions) 117 | else: 118 | action = self.getPolicy(state) 119 | return action 120 | 121 | def update(self, state, action, nextState, reward): 122 | """ 123 | You should do your Q-Value update here 124 | 125 | NOTE: You should never call this function, 126 | it will be called on your behalf 127 | 128 | 129 | """ 130 | #agent parameters 131 | gamma = self.discount 132 | learning_rate = self.alpha 133 | 134 | reference_qvalue = reward + gamma * self.getValue(nextState) 135 | updated_qvalue = (1-learning_rate) * self.getQValue(state,action) + learning_rate * reference_qvalue 136 | self.setQValue(state,action,updated_qvalue) 137 | 138 | 139 | #---------------------#end of your code#---------------------# 140 | 141 | 142 | -------------------------------------------------------------------------------- /yet_another_week/README.md: -------------------------------------------------------------------------------- 1 | In this week you can find several sections covering advanced topics in RL, along with less advanced topics that we couldn't squeeze into the main track 2 | 3 | ## Advanced policy gradient methods 4 | This section covers some steroids for policy gradient methods, along with a cool general trick called 5 | 6 | * Lecture on NPG and TRPO by J. Schulman - [video](https://www.youtube.com/watch?v=_t5fpZuuf-4) 7 | * Alternative lecture on TRPO and open problems by... J. schulman - [video](https://www.youtube.com/watch?v=gb5Q2XL5c8A) 8 | * Our [__slides__](https://yadi.sk/i/9j6S4WVp3HgEdn) on TRPO, video: [lecture](https://yadi.sk/i/1oyihBnm3HiKHm), [seminar](https://yadi.sk/i/b0ol2gUV3HiKKJ) (russian) 9 | * Original articles - [TRPO](https://arxiv.org/abs/1502.05477), [NPG](https://papers.nips.cc/paper/2073-a-natural-policy-gradient.pdf) 10 | 11 | 12 | * __Assignment:__ [seminar_TRPO.ipynb](https://github.com/yandexdataschool/Practical_RL/blob/master/yet_another_week/seminar_TRPO.ipynb) 13 | * TF version: [pending] 14 | 15 | ## Model-based RL: Planning 16 | * Planning by dynamic programming (D. 
Silver) - [video](https://www.youtube.com/watch?v=Nd1-UUMVfz4) 17 | * Planning via tree search [videos 2-6 from CS188](https://www.youtube.com/channel/UCHBzJsIcRIVuzzHVYabikTQ) 18 | * Our lecture: 19 | * Slides [part1](https://yadi.sk/i/3PM9zCP33J3ub3) (intro), [part2](https://yadi.sk/i/M03xvZ2y3JMQre) (pomdp) 20 | * [Lecture](https://yadi.sk/i/lOAUu7o13JBHFz) & [seminar](https://yadi.sk/i/bkmjEZrk3JBHGF) 21 | * Monte-carlo tree search 22 | * Udacity video on monte-carlo tree search (first part of a chain) - [video](https://www.youtube.com/watch?v=onBYsen2_eA) 23 | * Reminder: UCB-1 - [slides](https://www.cs.bham.ac.uk/internal/courses/robotics/lectures/ucb1.pdf) 24 | * Monte-carlo tree search step-by-step by J.Levine - [video](https://www.youtube.com/watch?v=UXW2yZndl7U) 25 | * Guide to MCTS (monte-carlo tree search) - [post](http://www.cameronius.com/research/mcts/about/index.html) 26 | * Another guide to MCTS - [url](https://jeffbradberry.com/posts/2015/09/intro-to-monte-carlo-tree-search/) 27 | * Integrating learning and planning (D. Silver) - [video](https://www.youtube.com/watch?v=ItMutbeOHtc&t=1241s) 28 | 29 | * __Assignment:__ [seminar_MCTS.ipynb](https://github.com/yandexdataschool/Practical_RL/blob/master/yet_another_week/seminar_MCTS.ipynb) 30 | 31 | * Approximating the MCTS optimal actions - 5vision solution for deephack.RL, code by Mikhail Pavlov - [repo](https://github.com/5vision/uct_atari) 32 | 33 | ## Reinforcement learning in large/continuous action spaces 34 | While you already know algorithms that will work with continuously many actions, it can't hurt to learn something more specialized. 35 | * Deterministic policy gradient - [article](https://arxiv.org/pdf/1512.07679.pdf), [post+code](https://yanpanlau.github.io/2016/10/11/Torcs-Keras.html) 36 | * Stochastic value gradient - [article](https://arxiv.org/abs/1510.09142) 37 | * Q-learning with normalized advantage functions - [article](https://arxiv.org/abs/1603.00748), [code1](https://github.com/carpedm20/NAF-tensorflow), [code2](http://bit.ly/2qx2087) 38 | * Embedding large discrete action spaces for RL - [article](https://arxiv.org/pdf/1512.07679.pdf) 39 | * Lecture by A. Seleznev, 5vision (russian) - [video](www.youtube.com/watch?v=j1L2FnanXPo&t=119m45s) 40 | 41 | ## Other 42 | * Learning by imitation - [video](https://www.youtube.com/watch?v=kl_G95uKTHw), [assignment](http://rll.berkeley.edu/deeprlcourse/docs/hw1.pdf)(berkeley cs294) 43 | * Knowledge transfer in RL - [video](https://www.youtube.com/watch?v=Hx4XpVdJOI0)(berkeley cs294) 44 | * Inverse reinforcement learning - [video](https://www.youtube.com/watch?v=J2blDuU3X1I) 45 | * Hierarchical reinforcemnt learning - [pending] 46 | * [Your contribution] 47 | 48 | ## A list of lists 49 | * [awesome_rl](https://github.com/aikorea/awesome-rl/) - a curated list of resources dedicated to reinforcement learning. 
50 | * [junhyukoh's list](https://github.com/junhyukoh/deep-reinforcement-learning-papers) 51 | * [muupan's list](https://github.com/muupan/deep-reinforcement-learning-papers) 52 | * Courses: 53 | * [CS294: deep reinforcement learning](http://rll.berkeley.edu/deeprlcourse/) 54 | * [Silver's RL course](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html) 55 | * [Sutton's book, 2nd edition](http://incompleteideas.net/sutton/book/the-book-2nd.html) 56 | * [Implementations of many basic RL algorithms (raw and/or tensorflow)](https://github.com/dennybritz/reinforcement-learning) 57 | * Reddit: [General ML](https://www.reddit.com/r/MachineLearning/), [RL](https://www.reddit.com/r/reinforcementlearning/), [CS294](https://www.reddit.com/r/berkeleydeeprlcourse/) 58 | * [This great link you could have contributed] 59 | 60 | -------------------------------------------------------------------------------- /week3/expected_value_sarsa.py: -------------------------------------------------------------------------------- 1 | """ 2 | Expected Value SARSA 3 | This file builds upon the same functions as Q-learning agent (qlearning.py). 4 | 5 | [assignment] 6 | The only thing you must implement is the getValue method. 7 | - Recall that V(s) in SARSA is not the maximal but the expected Q-value. 8 | - The expectation should be done under agent's policy (e-greedy). 9 | 10 | 11 | Here's usage example: 12 | >>>from expected_value_sarsa import EVSarsaAgent 13 | 14 | >>>agent = EVSarsaAgent(alpha=0.5,epsilon=0.25,discount=0.99, 15 | getLegalActions = lambda s: actions_from_that_state) 16 | >>>action = agent.getAction(state) 17 | >>>agent.update(state,action, next_state,reward) 18 | >>>agent.epsilon *= 0.99 19 | """ 20 | 21 | import random,math 22 | 23 | import numpy as np 24 | from collections import defaultdict 25 | 26 | class EVSarsaAgent(): 27 | """ 28 | Expected Value SARSA Agent. 29 | 30 | The two main methods are 31 | - self.getAction(state) - returns agent's action in that state 32 | - self.update(state,action,nextState,reward) - returns agent's next action 33 | 34 | Instance variables you have access to 35 | - self.epsilon (exploration prob) 36 | - self.alpha (learning rate) 37 | - self.discount (discount rate aka gamma) 38 | 39 | """ 40 | def __init__(self,alpha,epsilon,discount,getLegalActions): 41 | "We initialize agent and Q-values here." 42 | self.getLegalActions= getLegalActions 43 | self._qValues = defaultdict(lambda:defaultdict(lambda:0)) 44 | self.alpha = alpha 45 | self.epsilon = epsilon 46 | self.discount = discount 47 | 48 | def getQValue(self, state, action): 49 | """ 50 | Returns Q(state,action) 51 | """ 52 | return self._qValues[state][action] 53 | 54 | def setQValue(self,state,action,value): 55 | """ 56 | Sets the Qvalue for [state,action] to the given value 57 | """ 58 | self._qValues[state][action] = value 59 | 60 | #---------------------#start of your code#---------------------# 61 | 62 | def getValue(self, state): 63 | """ 64 | Returns V(s) according to expected value SARSA algorithm 65 | This should be equal to expected action q-value over action probabilities defined 66 | by epsilon-greedy policy with current epsilon. 
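        In other words: V(s) = sum over actions a of pi(a|s) * Q(s,a),
        where pi is the epsilon-greedy policy with the current epsilon.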
67 | """ 68 | 69 | possibleActions = self.getLegalActions(state) 70 | #If there are no legal actions, return 0.0 71 | if len(possibleActions) == 0: 72 | return 0.0 73 | 74 | #You'll need this to estimate action probabilities 75 | epsilon = self.epsilon 76 | 77 | value = 78 | return value 79 | 80 | def getPolicy(self, state): 81 | """ 82 | Compute the best action to take in a state. 83 | 84 | """ 85 | possibleActions = self.getLegalActions(state) 86 | 87 | #If there are no legal actions, return None 88 | if len(possibleActions) == 0: 89 | return None 90 | 91 | best_action = None 92 | 93 | best_action = possibleActions[np.argmax([self.getQValue(state, a) for a in possibleActions])] 94 | return best_action 95 | 96 | def getAction(self, state): 97 | """ 98 | Compute the action to take in the current state, including exploration. 99 | 100 | With probability self.epsilon, we should take a random action. 101 | otherwise - the best policy action (self.getPolicy). 102 | 103 | HINT: You might want to use util.flipCoin(prob) 104 | HINT: To pick randomly from a list, use random.choice(list) 105 | 106 | """ 107 | 108 | # Pick Action 109 | possibleActions = self.getLegalActions(state) 110 | action = None 111 | 112 | #If there are no legal actions, return None 113 | if len(possibleActions) == 0: 114 | return None 115 | 116 | #agent parameters: 117 | epsilon = self.epsilon 118 | 119 | if np.random.random()<=epsilon: 120 | return random.choice(possibleActions) 121 | else: 122 | action = self.getPolicy(state) 123 | return action 124 | 125 | def update(self, state, action, nextState, reward): 126 | """ 127 | You should do your Q-Value update here 128 | 129 | NOTE: You should never call this function, 130 | it will be called on your behalf 131 | 132 | 133 | """ 134 | #agent parameters 135 | gamma = self.discount 136 | learning_rate = self.alpha 137 | 138 | reference_qvalue = reward + gamma * self.getValue(nextState) 139 | updated_qvalue = (1-learning_rate) * self.getQValue(state,action) + learning_rate * reference_qvalue 140 | self.setQValue(state,action,updated_qvalue) 141 | 142 | 143 | #---------------------#end of your code#---------------------# 144 | 145 | 146 | -------------------------------------------------------------------------------- /week2/assignment/qlearningAgents.py: -------------------------------------------------------------------------------- 1 | # qlearningAgents.py 2 | # ------------------ 3 | ## based on http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 4 | 5 | from game import * 6 | from learningAgents import ReinforcementAgent 7 | from featureExtractors import * 8 | 9 | import random,util,math 10 | from collections import defaultdict 11 | 12 | class QLearningAgent(ReinforcementAgent): 13 | """ 14 | Q-Learning Agent 15 | 16 | Instance variables you have access to 17 | - self.epsilon (exploration prob) 18 | - self.alpha (learning rate) 19 | - self.discount (discount rate aka gamma) 20 | 21 | Functions you should use 22 | - self.getLegalActions(state) 23 | which returns legal actions for a state 24 | - self.getQValue(state,action) 25 | which returns Q(state,action) 26 | - self.setQValue(state,action,value) 27 | which sets Q(state,action) := value 28 | 29 | !!!Important!!! 30 | NOTE: please avoid using self._qValues directly to make code cleaner 31 | """ 32 | def __init__(self, **args): 33 | "We initialize agent and Q-values here." 
34 | ReinforcementAgent.__init__(self, **args) 35 | self._qValues = defaultdict(lambda:defaultdict(lambda:0)) 36 | 37 | 38 | def getQValue(self, state, action): 39 | """ 40 | Returns Q(state,action) 41 | """ 42 | return self._qValues[state][action] 43 | 44 | def setQValue(self,state,action,value): 45 | """ 46 | Sets the Qvalue for [state,action] to the given value 47 | """ 48 | self._qValues[state][action] = value 49 | 50 | #---------------------#start of your code#---------------------# 51 | 52 | def getValue(self, state): 53 | """ 54 | Returns max_action Q(state,action) 55 | where the max is over legal actions. 56 | """ 57 | 58 | possibleActions = self.getLegalActions(state) 59 | #If there are no legal actions, return 0.0 60 | if len(possibleActions) == 0: 61 | return 0.0 62 | 63 | "*** YOUR CODE HERE ***" 64 | raise NotImplementedError 65 | 66 | return 0. 67 | 68 | def getPolicy(self, state): 69 | """ 70 | Compute the best action to take in a state. 71 | 72 | """ 73 | possibleActions = self.getLegalActions(state) 74 | 75 | #If there are no legal actions, return None 76 | if len(possibleActions) == 0: 77 | return None 78 | 79 | best_action = None 80 | 81 | "*** YOUR CODE HERE ***" 82 | raise NotImplementedError 83 | 84 | return best_action 85 | 86 | def getAction(self, state): 87 | """ 88 | Compute the action to take in the current state, including exploration. 89 | 90 | With probability self.epsilon, we should take a random action. 91 | otherwise - the best policy action (self.getPolicy). 92 | 93 | HINT: You might want to use util.flipCoin(prob) 94 | HINT: To pick randomly from a list, use random.choice(list) 95 | 96 | """ 97 | 98 | # Pick Action 99 | possibleActions = self.getLegalActions(state) 100 | action = None 101 | 102 | #If there are no legal actions, return None 103 | if len(possibleActions) == 0: 104 | return None 105 | 106 | #agent parameters: 107 | epsilon = self.epsilon 108 | 109 | "*** YOUR CODE HERE ***" 110 | raise NotImplementedError 111 | 112 | return action 113 | 114 | def update(self, state, action, nextState, reward): 115 | """ 116 | You should do your Q-Value update here 117 | 118 | NOTE: You should never call this function, 119 | it will be called on your behalf 120 | 121 | 122 | """ 123 | #agent parameters 124 | gamma = self.discount 125 | learning_rate = self.alpha 126 | 127 | "*** YOUR CODE HERE ***" 128 | raise NotImplementedError 129 | 130 | reference_qvalue = PleaseImplementMe 131 | updated_qvalue = PleaseImplementMe 132 | 133 | self.setQValue(PleaseImplementMe,PleaseImplementMe,updated_qvalue) 134 | 135 | 136 | #---------------------#end of your code#---------------------# 137 | 138 | 139 | 140 | class PacmanQAgent(QLearningAgent): 141 | "Exactly the same as QLearningAgent, but with different default parameters" 142 | 143 | def __init__(self, epsilon=0.05,gamma=0.8,alpha=0.2, numTraining=0, **args): 144 | """ 145 | These default parameters can be changed from the pacman.py command line. 146 | For example, to change the exploration rate, try: 147 | python pacman.py -p PacmanQLearningAgent -a epsilon=0.1 148 | 149 | alpha - learning rate 150 | epsilon - exploration rate 151 | gamma - discount factor 152 | numTraining - number of training episodes, i.e. 
no learning after these many episodes 153 | """ 154 | args['epsilon'] = epsilon 155 | args['gamma'] = gamma 156 | args['alpha'] = alpha 157 | args['numTraining'] = numTraining 158 | self.index = 0 # This is always Pacman 159 | QLearningAgent.__init__(self, **args) 160 | 161 | def getAction(self, state): 162 | """ 163 | Simply calls the getAction method of QLearningAgent and then 164 | informs parent of action for Pacman. Do not change or remove this 165 | method. 166 | """ 167 | action = QLearningAgent.getAction(self,state) 168 | self.doAction(state,action) 169 | return action 170 | 171 | 172 | 173 | class ApproximateQAgent(PacmanQAgent): 174 | pass 175 | -------------------------------------------------------------------------------- /week2/assignment/layout.py: -------------------------------------------------------------------------------- 1 | # layout.py 2 | # --------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | from util import manhattanDistance 10 | from game import Grid 11 | import os 12 | import random 13 | 14 | VISIBILITY_MATRIX_CACHE = {} 15 | 16 | class Layout: 17 | """ 18 | A Layout manages the static information about the game board. 19 | """ 20 | 21 | def __init__(self, layoutText): 22 | self.width = len(layoutText[0]) 23 | self.height= len(layoutText) 24 | self.walls = Grid(self.width, self.height, False) 25 | self.food = Grid(self.width, self.height, False) 26 | self.capsules = [] 27 | self.agentPositions = [] 28 | self.numGhosts = 0 29 | self.processLayoutText(layoutText) 30 | self.layoutText = layoutText 31 | # self.initializeVisibilityMatrix() 32 | 33 | def getNumGhosts(self): 34 | return self.numGhosts 35 | 36 | def initializeVisibilityMatrix(self): 37 | global VISIBILITY_MATRIX_CACHE 38 | if reduce(str.__add__, self.layoutText) not in VISIBILITY_MATRIX_CACHE: 39 | from game import Directions 40 | vecs = [(-0.5,0), (0.5,0),(0,-0.5),(0,0.5)] 41 | dirs = [Directions.NORTH, Directions.SOUTH, Directions.WEST, Directions.EAST] 42 | vis = Grid(self.width, self.height, {Directions.NORTH:set(), Directions.SOUTH:set(), Directions.EAST:set(), Directions.WEST:set(), Directions.STOP:set()}) 43 | for x in range(self.width): 44 | for y in range(self.height): 45 | if self.walls[x][y] == False: 46 | for vec, direction in zip(vecs, dirs): 47 | dx, dy = vec 48 | nextx, nexty = x + dx, y + dy 49 | while (nextx + nexty) != int(nextx) + int(nexty) or not self.walls[int(nextx)][int(nexty)] : 50 | vis[x][y][direction].add((nextx, nexty)) 51 | nextx, nexty = x + dx, y + dy 52 | self.visibility = vis 53 | VISIBILITY_MATRIX_CACHE[reduce(str.__add__, self.layoutText)] = vis 54 | else: 55 | self.visibility = VISIBILITY_MATRIX_CACHE[reduce(str.__add__, self.layoutText)] 56 | 57 | def isWall(self, pos): 58 | x, col = pos 59 | return self.walls[x][col] 60 | 61 | def getRandomLegalPosition(self): 62 | x = random.choice(range(self.width)) 63 | y = random.choice(range(self.height)) 64 | while self.isWall( (x, y) ): 65 | x = random.choice(range(self.width)) 66 | y = random.choice(range(self.height)) 67 | return (x,y) 68 | 69 | def getRandomCorner(self): 70 | poses = [(1,1), (1, self.height - 2), (self.width - 2, 1), (self.width - 2, 
self.height - 2)] 71 | return random.choice(poses) 72 | 73 | def getFurthestCorner(self, pacPos): 74 | poses = [(1,1), (1, self.height - 2), (self.width - 2, 1), (self.width - 2, self.height - 2)] 75 | dist, pos = max([(manhattanDistance(p, pacPos), p) for p in poses]) 76 | return pos 77 | 78 | def isVisibleFrom(self, ghostPos, pacPos, pacDirection): 79 | row, col = [int(x) for x in pacPos] 80 | return ghostPos in self.visibility[row][col][pacDirection] 81 | 82 | def __str__(self): 83 | return "\n".join(self.layoutText) 84 | 85 | def deepCopy(self): 86 | return Layout(self.layoutText[:]) 87 | 88 | def processLayoutText(self, layoutText): 89 | """ 90 | Coordinates are flipped from the input format to the (x,y) convention here 91 | 92 | The shape of the maze. Each character 93 | represents a different type of object. 94 | % - Wall 95 | . - Food 96 | o - Capsule 97 | G - Ghost 98 | P - Pacman 99 | Other characters are ignored. 100 | """ 101 | maxY = self.height - 1 102 | for y in range(self.height): 103 | for x in range(self.width): 104 | layoutChar = layoutText[maxY - y][x] 105 | self.processLayoutChar(x, y, layoutChar) 106 | self.agentPositions.sort() 107 | self.agentPositions = [ ( i == 0, pos) for i, pos in self.agentPositions] 108 | 109 | def processLayoutChar(self, x, y, layoutChar): 110 | if layoutChar == '%': 111 | self.walls[x][y] = True 112 | elif layoutChar == '.': 113 | self.food[x][y] = True 114 | elif layoutChar == 'o': 115 | self.capsules.append((x, y)) 116 | elif layoutChar == 'P': 117 | self.agentPositions.append( (0, (x, y) ) ) 118 | elif layoutChar in ['G']: 119 | self.agentPositions.append( (1, (x, y) ) ) 120 | self.numGhosts += 1 121 | elif layoutChar in ['1', '2', '3', '4']: 122 | self.agentPositions.append( (int(layoutChar), (x,y))) 123 | self.numGhosts += 1 124 | def getLayout(name, back = 2): 125 | if name.endswith('.lay'): 126 | layout = tryToLoad('layouts/' + name) 127 | if layout == None: layout = tryToLoad(name) 128 | else: 129 | layout = tryToLoad('layouts/' + name + '.lay') 130 | if layout == None: layout = tryToLoad(name + '.lay') 131 | if layout == None and back >= 0: 132 | curdir = os.path.abspath('.') 133 | os.chdir('..') 134 | layout = getLayout(name, back -1) 135 | os.chdir(curdir) 136 | return layout 137 | 138 | def tryToLoad(fullname): 139 | if(not os.path.exists(fullname)): return None 140 | f = open(fullname) 141 | try: return Layout([line.strip() for line in f]) 142 | finally: f.close() -------------------------------------------------------------------------------- /week9/bayes.py: -------------------------------------------------------------------------------- 1 | """ 2 | A single-file module that makes your lasagne network into a bayesian neural net. 3 | Originally created by github.com/ferrine , rewritten by github.com/justheuristic for simplicity 4 | 5 | See example in the notebook 6 | """ 7 | 8 | import numpy as np 9 | 10 | from theano import tensor as T 11 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 12 | 13 | import lasagne 14 | from lasagne import init 15 | from lasagne.random import get_rng 16 | 17 | from functools import wraps 18 | 19 | __all__ = ['NormalApproximation','get_var_cost','bbpwrap'] 20 | 21 | 22 | 23 | class NormalApproximation(object): 24 | def __init__(self, mu=0, std=np.exp(-3),seed=None): 25 | """ 26 | Approximation that samples network weights from factorized normal distribution. 
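        In short (a restatement of what __call__ below does): each variational
        parameter tensor W is sampled with the reparameterization trick,
            W = mu + log1p(exp(rho)) * eps,    eps ~ N(0, 1),
        so the posterior std log1p(exp(rho)) stays positive, and the resulting
        KL(q||p) term is accumulated into layer._bbwrap_var_cost.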
27 | 28 | :param mu: prior mean for gaussian weights 29 | :param std: prior std for gaussian weights 30 | :param seed: random seed 31 | """ 32 | self.prior_mu = mu 33 | self.prior_std = std 34 | self.srng = RandomStreams(seed or get_rng().randint(1, 2147462579)) 35 | 36 | def log_normal(self,x, mean, std, eps=0.0): 37 | """computes log-proba of normal distribution""" 38 | std += eps 39 | return - 0.5 * np.log(2 * np.pi) - T.log(T.abs_(std)) - (x - mean) ** 2 / (2 * std ** 2) 40 | 41 | def log_prior(self, weights): 42 | """ 43 | Logarithm of prior probabilities for weights: 44 | log P(weights) aka log P(theta) 45 | """ 46 | return self.log_normal(weights, self.prior_mu, self.prior_std) 47 | 48 | def log_posterior_approx(self,weights, mean, rho): 49 | """ 50 | Logarithm of ELBO on posterior probabilities: 51 | log q(weights|learned mu and rho) aka log q(theta|x) 52 | """ 53 | std = T.log1p(T.exp(rho)) #rho to std 54 | return self.log_normal(weights, mean, std) 55 | 56 | def __call__(self, layer, spec, shape, name=None, **tags): 57 | # case when user uses default init specs 58 | assert tags.get('variational',False) == True, "Please declare param as variational to avoid confusion" 59 | 60 | if not isinstance(spec, dict): 61 | initial_rho = np.log(np.expm1(self.prior_std)) #std to rho 62 | assert np.isfinite(initial_rho),"too small std to initialize correctly. Please pass explicit"\ 63 | " initializer (dict with {'mu':mu_init, 'rho':rho_init})." 64 | spec = {'mu': spec,'rho':init.Constant(initial_rho)} 65 | 66 | 67 | mu_spec,rho_spec = spec['mu'],spec['rho'] 68 | 69 | rho = layer.add_param(rho_spec, shape,name=(name or 'unk')+'.rho', **tags) 70 | mean = layer.add_param(mu_spec, shape,name=(name or 'unk')+'.mu', **tags) 71 | 72 | #Reparameterization trick 73 | e = self.srng.normal(shape, std=1) 74 | W = mean + T.log1p(T.exp(rho)) * e 75 | 76 | #KL divergence KL(q,p) = E_(w~q(w|x)) [log q(w|x) - log P(w)] aka variational cost 77 | q_p = T.sum(self.log_posterior_approx(W, mean, rho) - self.log_prior(W)) 78 | 79 | #accumulate variational cost 80 | layer._bbwrap_var_cost += q_p 81 | return W 82 | 83 | 84 | 85 | def get_var_cost(layer_or_layers,treat_as_input=None): 86 | """ 87 | Returns total variational cost aka KL(q(theta|x)||p(theta)) for all layers in the network 88 | 89 | :param layer_or_layers: top layer(s) of your network, just like with lasagne.layers.get_output 90 | :param treat_as_input: don't accumulate over layers below these layers. 
See same param for lasagne.layers.get_all_layers 91 | 92 | Alternatively, one can manually get weights for one layer via layer.get_var_cost() 93 | """ 94 | cost = 0 95 | for layer in lasagne.layers.get_all_layers(layer_or_layers,treat_as_input): 96 | if hasattr(layer, 'get_var_cost'): #if layer is bayesian or pretends so 97 | cost += layer.get_var_cost() 98 | return cost 99 | 100 | def bbpwrap(approximation=NormalApproximation()): 101 | """ 102 | A decorator that makes arbitrary lasagne layer into a bayesian network layer: 103 | BayesDenseLayer = bbwrap()(DenseLayer) 104 | or more verbosely, 105 | @bbpwrap(NormalApproximation(pstd=0.01)) 106 | BayesDenseLayer(DenseLayer): 107 | pass 108 | 109 | """ 110 | 111 | def decorator(cls): 112 | def add_param_wrap(add_param): 113 | @wraps(add_param) 114 | def wrapped(self, spec, shape, name=None, **tags): 115 | # we should take care about some user specification 116 | # to avoid bbp hook just set tags['variational'] = True 117 | if not tags.get('trainable', True) or tags.get('variational', False): 118 | return add_param(self, spec, shape, name, **tags) 119 | else: 120 | # we declare that params we add next 121 | # are the ones we need to fit the distribution 122 | # they don't need to be regularized, strictly 123 | tags['variational'] = True 124 | tags['regularizable'] = False 125 | param = self.approximation(self, spec, shape, name, **tags) 126 | return param 127 | return wrapped 128 | 129 | def get_var_cost(self): 130 | """ 131 | Returns total variational cost aka KL(q(theta|x)||p(theta)) for this layer. 132 | Alternatively, use function get_var_cost(layer) to get total cost for all layers below this one. 133 | """ 134 | return self._bbwrap_var_cost 135 | 136 | 137 | cls.approximation = approximation 138 | cls._bbwrap_var_cost=0 139 | cls.add_param = add_param_wrap(cls.add_param) 140 | cls.get_var_cost = get_var_cost 141 | return cls 142 | 143 | 144 | return decorator 145 | -------------------------------------------------------------------------------- /week2/assignment/learningAgents.py: -------------------------------------------------------------------------------- 1 | # learningAgents.py 2 | # ----------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | from game import Directions, Agent, Actions 10 | 11 | import random,util,time 12 | 13 | class ValueEstimationAgent(Agent): 14 | """ 15 | Abstract agent which assigns values to (state,action) 16 | Q-Values for an environment. As well as a value to a 17 | state and a policy given respectively by, 18 | 19 | V(s) = max_{a in actions} Q(s,a) 20 | policy(s) = arg_max_{a in actions} Q(s,a) 21 | 22 | Both ValueIterationAgent and QLearningAgent inherit 23 | from this agent. While a ValueIterationAgent has 24 | a model of the environment via a MarkovDecisionProcess 25 | (see mdp.py) that is used to estimate Q-Values before 26 | ever actually acting, the QLearningAgent estimates 27 | Q-Values while acting in the environment. 28 | """ 29 | 30 | def __init__(self, alpha=1.0, epsilon=0.05, gamma=0.8, numTraining = 10): 31 | """ 32 | Sets options, which can be passed in via the Pacman command line using -a alpha=0.5,... 
33 | alpha - learning rate 34 | epsilon - exploration rate 35 | gamma - discount factor 36 | numTraining - number of training episodes, i.e. no learning after these many episodes 37 | """ 38 | self.alpha = float(alpha) 39 | self.epsilon = float(epsilon) 40 | self.discount = float(gamma) 41 | self.numTraining = int(numTraining) 42 | 43 | #################################### 44 | # Override These Functions # 45 | #################################### 46 | def getQValue(self, state, action): 47 | """ 48 | Should return Q(state,action) 49 | """ 50 | util.raiseNotDefined() 51 | 52 | def getValue(self, state): 53 | """ 54 | What is the value of this state under the best action? 55 | Concretely, this is given by 56 | 57 | V(s) = max_{a in actions} Q(s,a) 58 | """ 59 | util.raiseNotDefined() 60 | 61 | def getPolicy(self, state): 62 | """ 63 | What is the best action to take in the state. Note that because 64 | we might want to explore, this might not coincide with getAction 65 | Concretely, this is given by 66 | 67 | policy(s) = arg_max_{a in actions} Q(s,a) 68 | 69 | If many actions achieve the maximal Q-value, 70 | it doesn't matter which is selected. 71 | """ 72 | util.raiseNotDefined() 73 | 74 | def getAction(self, state): 75 | """ 76 | state: can call state.getLegalActions() 77 | Choose an action and return it. 78 | """ 79 | util.raiseNotDefined() 80 | 81 | class ReinforcementAgent(ValueEstimationAgent): 82 | """ 83 | Abstract Reinforcemnt Agent: A ValueEstimationAgent 84 | which estimates Q-Values (as well as policies) from experience 85 | rather than a model 86 | 87 | What you need to know: 88 | - The environment will call 89 | observeTransition(state,action,nextState,deltaReward), 90 | which will call update(state, action, nextState, deltaReward) 91 | which you should override. 92 | - Use self.getLegalActions(state) to know which actions 93 | are available in a state 94 | """ 95 | #################################### 96 | # Override These Functions # 97 | #################################### 98 | 99 | def update(self, state, action, nextState, reward): 100 | """ 101 | This class will call this function, which you write, after 102 | observing a transition and reward 103 | """ 104 | util.raiseNotDefined() 105 | 106 | #################################### 107 | # Read These Functions # 108 | #################################### 109 | 110 | def getLegalActions(self,state): 111 | """ 112 | Get the actions available for a given 113 | state. This is what you should use to 114 | obtain legal actions for a state 115 | """ 116 | return self.actionFn(state) 117 | 118 | def observeTransition(self, state,action,nextState,deltaReward): 119 | """ 120 | Called by environment to inform agent that a transition has 121 | been observed. 
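        (For the tabular agent in qlearningAgents.py, update() is expected to
        apply the standard Q-learning rule -- shown here only as a sketch:
        Q(s,a) <- (1 - alpha) * Q(s,a) + alpha * (deltaReward + gamma * max_a' Q(nextState, a')).)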
This will result in a call to self.update 122 | on the same arguments 123 | 124 | NOTE: Do *not* override or call this function 125 | """ 126 | self.episodeRewards += deltaReward 127 | self.update(state,action,nextState,deltaReward) 128 | 129 | def startEpisode(self): 130 | """ 131 | Called by environment when new episode is starting 132 | """ 133 | self.lastState = None 134 | self.lastAction = None 135 | self.episodeRewards = 0.0 136 | 137 | def stopEpisode(self): 138 | """ 139 | Called by environment when episode is done 140 | """ 141 | if self.episodesSoFar < self.numTraining: 142 | self.accumTrainRewards += self.episodeRewards 143 | else: 144 | self.accumTestRewards += self.episodeRewards 145 | self.episodesSoFar += 1 146 | if self.episodesSoFar >= self.numTraining: 147 | # Take off the training wheels 148 | self.epsilon = 0.0 # no exploration 149 | self.alpha = 0.0 # no learning 150 | 151 | def isInTraining(self): 152 | return self.episodesSoFar < self.numTraining 153 | 154 | def isInTesting(self): 155 | return not self.isInTraining() 156 | 157 | def __init__(self, actionFn = None, numTraining=100, epsilon=0.5, alpha=0.5, gamma=1): 158 | """ 159 | actionFn: Function which takes a state and returns the list of legal actions 160 | 161 | alpha - learning rate 162 | epsilon - exploration rate 163 | gamma - discount factor 164 | numTraining - number of training episodes, i.e. no learning after these many episodes 165 | """ 166 | if actionFn == None: 167 | actionFn = lambda state: state.getLegalActions() 168 | self.actionFn = actionFn 169 | self.episodesSoFar = 0 170 | self.accumTrainRewards = 0.0 171 | self.accumTestRewards = 0.0 172 | self.numTraining = int(numTraining) 173 | self.epsilon = float(epsilon) 174 | self.alpha = float(alpha) 175 | self.discount = float(gamma) 176 | 177 | ################################ 178 | # Controls needed for Crawler # 179 | ################################ 180 | def setEpsilon(self, epsilon): 181 | self.epsilon = epsilon 182 | 183 | def setLearningRate(self, alpha): 184 | self.alpha = alpha 185 | 186 | def setDiscount(self, discount): 187 | self.discount = discount 188 | 189 | def doAction(self,state,action): 190 | """ 191 | Called by inherited class when 192 | an action is taken in a state 193 | """ 194 | self.lastState = state 195 | self.lastAction = action 196 | 197 | ################### 198 | # Pacman Specific # 199 | ################### 200 | def observationFunction(self, state): 201 | """ 202 | This is where we ended up after our last action. 
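        The per-step reward passed on to observeTransition is the change in
        game score, reward = state.getScore() - self.lastState.getScore().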
203 | The simulation should somehow ensure this is called 204 | """ 205 | if not self.lastState is None: 206 | reward = state.getScore() - self.lastState.getScore() 207 | self.observeTransition(self.lastState, self.lastAction, state, reward) 208 | return state 209 | 210 | def registerInitialState(self, state): 211 | self.startEpisode() 212 | if self.episodesSoFar == 0: 213 | print 'Beginning %d episodes of Training' % (self.numTraining) 214 | 215 | def final(self, state): 216 | """ 217 | Called by Pacman game at the terminal state 218 | """ 219 | deltaReward = state.getScore() - self.lastState.getScore() 220 | self.observeTransition(self.lastState, self.lastAction, state, deltaReward) 221 | self.stopEpisode() 222 | 223 | # Make sure we have this var 224 | if not 'episodeStartTime' in self.__dict__: 225 | self.episodeStartTime = time.time() 226 | if not 'lastWindowAccumRewards' in self.__dict__: 227 | self.lastWindowAccumRewards = 0.0 228 | self.lastWindowAccumRewards += state.getScore() 229 | 230 | NUM_EPS_UPDATE = 100 231 | if self.episodesSoFar % NUM_EPS_UPDATE == 0: 232 | print 'Reinforcement Learning Status:' 233 | windowAvg = self.lastWindowAccumRewards / float(NUM_EPS_UPDATE) 234 | if self.episodesSoFar <= self.numTraining: 235 | trainAvg = self.accumTrainRewards / float(self.episodesSoFar) 236 | print '\tCompleted %d out of %d training episodes' % ( 237 | self.episodesSoFar,self.numTraining) 238 | print '\tAverage Rewards over all training: %.2f' % ( 239 | trainAvg) 240 | else: 241 | testAvg = float(self.accumTestRewards) / (self.episodesSoFar - self.numTraining) 242 | print '\tCompleted %d test episodes' % (self.episodesSoFar - self.numTraining) 243 | print '\tAverage Rewards over testing: %.2f' % testAvg 244 | print '\tAverage Rewards for last %d episodes: %.2f' % ( 245 | NUM_EPS_UPDATE,windowAvg) 246 | print '\tEpisode took %.2f seconds' % (time.time() - self.episodeStartTime) 247 | self.lastWindowAccumRewards = 0.0 248 | self.episodeStartTime = time.time() 249 | 250 | if self.episodesSoFar == self.numTraining: 251 | msg = 'Training Done (turning off epsilon and alpha)' 252 | print '%s\n%s' % (msg,'-' * len(msg)) 253 | -------------------------------------------------------------------------------- /week8/8.2_bonus.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Week8 bonus descriptions\n", 8 | "\n", 9 | "Here are some cool mini-projects you can try to dive deeper into the topic.\n", 10 | "\n", 11 | "## More metrics: BLEU (5+ pts)\n", 12 | "\n", 13 | "Pick BLEU or any other relevant metric, e.g. BLEU (e.g. from `nltk.bleu_score`).\n", 14 | "* Train model to maximize BLEU directly\n", 15 | "* How does levenshtein behave when maximizing BLEU and vice versa?\n", 16 | "* Compare this with how they behave when optimizing likelihood. \n", 17 | "\n", 18 | "(use default parameters for bleu: 4-gram, uniform weights)\n", 19 | "\n", 20 | "## Actor-critic (5+++ pts)\n", 21 | "\n", 22 | "While self-critical training provides a large reduction of gradient variance, it has a few drawbacks:\n", 23 | "- It requires a lot of additional computation during training\n", 24 | "- It doesn't adjust V(s) between decoder steps. 
(one value per sequence)\n", 25 | "\n", 26 | "There's a more general way of doing the same thing: learned baselines, also known as __advantage actor-critic__.\n", 27 | "\n", 28 | "There are two main ways to apply that:\n", 29 | "- __naive way__: compute V(s) once per training example.\n", 30 | " - This only requires additional 1-unit linear dense layer that grows out of encoder, estimating V(s)\n", 31 | " - (implement this to get main points)\n", 32 | "- __every step__: compute V(s) on each decoder step\n", 33 | " - Again it's just an 1-unit dense layer (no nonlinearity), but this time it's inside decoder recurrence.\n", 34 | " - (+3 pts additional for this guy)\n", 35 | "\n", 36 | "In both cases, you should train V(s) to minimize squared error $(V(s) - R(s,a))^2$ with R being actual levenshtein.\n", 37 | "You can then use $ A(s,a) = (R(s,a) - const(V(s))) $ for policy gradient.\n", 38 | "\n", 39 | "There's also one particularly interesting approach (+5 additional pts):\n", 40 | "- __combining SCST and actor-critic__:\n", 41 | " - compute baseline $V(s)$ via self-critical sequence training (just like in main assignment)\n", 42 | " - learn correction $ C(s,a_{:t}) = R(s,a) - V(s) $ by minimizing $(R(s,a) - V(s) - C(s,a_{:t}))^2 $\n", 43 | " - use $ A(s,a_{:t}) = R(s,a) - V(s) - const(C(s,a_{:t})) $\n", 44 | "\n", 45 | "\n", 46 | "\n", 47 | "## Implement attention (5+++ pts)\n", 48 | "\n", 49 | "Some seq2seq tasks can benefit from the attention mechanism. In addition to taking the _last_ time-step of encoder hidden state, we can allow decoder to peek on any time-step of his choice.\n", 50 | "\n", 51 | "![img](https://s30.postimg.org/f8um3kt5d/google_seq2seq_attention.gif)\n", 52 | "\n", 53 | "\n", 54 | "#### Recommended steps:\n", 55 | "__1)__ Modify encoder-decoder\n", 56 | "\n", 57 | "Learn to feed the entire encoder into the decoder. You can do so by sending encoder rnn layer directly into decoder (make sure there's no `only_return_final=True` for encoder rnn layer).\n", 58 | "\n", 59 | "```\n", 60 | "class decoder:\n", 61 | " ...\n", 62 | " encoder_rnn_input = InputLayer(encoder.rnn.output_shape, name='encoder rnn input for decoder')\n", 63 | " ...\n", 64 | " \n", 65 | "#decoder Recurrence\n", 66 | "rec = Recurrence(...,\n", 67 | " input_nonsequences = {decoder.encoder_rnn_input: encoder.rnn},\n", 68 | " )\n", 69 | "\n", 70 | "```\n", 71 | "\n", 72 | "For starters, you can take it's last tick (via SliceLayer) inside the decoder step and feed it as input to make sure it works.\n", 73 | "\n", 74 | "__2)__ Implement attention mechanism\n", 75 | "\n", 76 | "Next thing we'll need is to implement the math of attention.\n", 77 | "\n", 78 | "The simplest way to do so is to write a special layer. We gave you a prototype and some tests below.\n", 79 | "\n", 80 | "__3)__ Use attention inside decoder\n", 81 | "\n", 82 | "That's almost it! 
Now use `AttentionLayer` inside the decoder and feed it to back to lstm/gru/rnn (see code demo below).\n", 83 | "\n", 84 | "Train the full network just like you did before attention.\n", 85 | "\n", 86 | "__More points__ will be awwarded for comparing learning results of attention Vs no attention.\n", 87 | "\n", 88 | "__Bonus bonus:__ visualize attention vectors (>= +3 points)\n", 89 | "\n", 90 | "The best way to make sure your attention actually works is to visualize it.\n", 91 | "\n", 92 | "A simple way to do so is to obtain attention vectors from each tick (values __right after softmax__, not the layer outputs) and drawing those as images.\n", 93 | "\n", 94 | "#### step-by-step guide:\n", 95 | "- split AttentionLayer into two layers: _\"from start to softmax\"_ and _\"from softmax to output\"_\n", 96 | "- add outputs of the first layer to recurrence's `tracked_outputs`\n", 97 | "- compile a function that computes them\n", 98 | "- plt.imshow(them)\n", 99 | "\n", 100 | "\n" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "collapsed": true 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "import numpy as np\n", 112 | "import theano,lasagne\n", 113 | "import theano.tensor as T\n", 114 | "from lasagne import init\n", 115 | "from lasagne.layers import *" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "class AttentionLayer(MergeLayer):\n", 125 | " def __init__(self,decoder_h,encoder_rnn):\n", 126 | " #sanity checks\n", 127 | " assert len(decoder_h.output_shape)==2,\"please feed decoder 1 step activation as first param \"\n", 128 | " assert len(encoder_rnn.output_shape)==3, \"please feed full encoder rnn sequence as second param\"\n", 129 | " \n", 130 | " self.decoder_num_units = decoder_h.output_shape[-1]\n", 131 | " self.encoder_num_units = encoder.output_shape[-1]\n", 132 | "\n", 133 | " #Here you should initialize all trainable parameters.\n", 134 | " #\n", 135 | " \n", 136 | " #use this syntax:\n", 137 | " self.add_param(spec=init.Normal(std=0.01), #or other initializer\n", 138 | " shape=,\n", 139 | " name='')\n", 140 | " \n", 141 | " \n", 142 | " MergeLayer.__init__(self,[decoder_h,encoder_rnn],name=\"attention\")\n", 143 | " \n", 144 | " \n", 145 | " def get_output_shape_for(self,input_shapes,**kwargs):\n", 146 | " \"\"\"return matrix of shape [batch_size, encoder num units]\"\"\"\n", 147 | " return (None,self.encoder_num_units)\n", 148 | " \n", 149 | " def get_output_for(self,inputs,**kwargs):\n", 150 | " \"\"\"\n", 151 | " takes (decoder_h, encoder_seq)\n", 152 | " decoder_h has shape [batch_size, decoder num_units]\n", 153 | " encoder_seq has shape [batch_size, sequence_length, encoder num_units]\n", 154 | " \n", 155 | " returns attention output: matrix of shape [batch_size, encoder num units]\n", 156 | " \n", 157 | " please read comments carefully before you start implementing\n", 158 | " \"\"\"\n", 159 | " decoder_h,encoder_seq = inputs\n", 160 | " \n", 161 | " #get symbolic batch-size / seq length. Also don't forget self.decoder_num_units above\n", 162 | " batch_size,seq_length,_ = tuple(encoder_seq.shape)\n", 163 | " \n", 164 | " #here's a recommended step-by-step guide for attention mechanism. 
\n", 165 | " #You are free to ignore it alltogether if you so wish\n", 166 | " \n", 167 | " #we repeat decoder activations to allign with encoder\n", 168 | " decoder_h_repeated = \n", 170 | " \n", 171 | " # ^--shape=[batch,seq_length,decoder_n_units]\n", 172 | " \n", 173 | " encoder_and_decoder_together = \n", 174 | " # ^--shape=[batch,seq_length,enc_n_units+dec_n_units]\n", 175 | " \n", 176 | " #here we flatten the tensor to simplify\n", 177 | " encoder_and_decoder_flat = T.reshape(encoder_and_decoder_together,(-1,encoder_and_decoder_together.shape[-1]))\n", 178 | " # ^--shape=[batch*seq_length,enc_n_units+dec_n_units]\n", 179 | " \n", 180 | " #here you use encoder_and_decoder_flat and some learned weights to predict attention logits\n", 181 | " #don't use softmax yet\n", 182 | " \n", 183 | " attention_logits_flat = \n", 184 | " # ^--shape=[batch*seq_length,1]\n", 185 | " \n", 186 | " \n", 187 | " #here we reshape flat logits back into correct form\n", 188 | " assert attention_logits_flat.ndim==2\n", 189 | " attention_logits = attention_logits_flat.reshape((batch_size,seq_length))\n", 190 | " # ^--shape=[batch,seq_length]\n", 191 | " \n", 192 | " #here we apply softmax :)\n", 193 | " attention = T.nnet.softmax(attention_logits)\n", 194 | " # ^--shape=[batch,seq_length]\n", 195 | " \n", 196 | " #here we compute output\n", 197 | " output = (attention[:,:,None]*encoder_seq).sum(axis=1) #sum over seq_length\n", 198 | " # ^--shape=[batch,enc_n_units]\n", 199 | " \n", 200 | " return output\n" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "#demo code\n", 210 | "\n", 211 | "from numpy.random import randn\n", 212 | "\n", 213 | "dec_h_prev = InputLayer((None,50),T.constant(randn(5,50)),name='decoder h mock')\n", 214 | "\n", 215 | "enc = InputLayer((None,None,32),T.constant(randn(5,20,32)),name='encoder sequence mock')\n", 216 | "\n", 217 | "attention = AttentionLayer(dec_h_prev,enc)\n", 218 | "\n", 219 | "#now you can use attention as additonal input to your decoder\n", 220 | "#LSTMCell(prev_cell,prev_out,input_or_inputs=(usual_input,attention))\n", 221 | "\n", 222 | "\n", 223 | "#sanity check\n", 224 | "demo_output = get_output(attention).eval()\n", 225 | "print 'actual shape:',demo_output.shape\n", 226 | "assert demo_output.shape == (5,32)\n", 227 | "assert np.isfinite(demo_output)\n", 228 | "\n" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": { 235 | "collapsed": true 236 | }, 237 | "outputs": [], 238 | "source": [] 239 | } 240 | ], 241 | "metadata": { 242 | "kernelspec": { 243 | "display_name": "Python 2", 244 | "language": "python", 245 | "name": "python2" 246 | }, 247 | "language_info": { 248 | "codemirror_mode": { 249 | "name": "ipython", 250 | "version": 2 251 | }, 252 | "file_extension": ".py", 253 | "mimetype": "text/x-python", 254 | "name": "python", 255 | "nbconvert_exporter": "python", 256 | "pygments_lexer": "ipython2", 257 | "version": "2.7.13" 258 | } 259 | }, 260 | "nbformat": 4, 261 | "nbformat_minor": 2 262 | } 263 | -------------------------------------------------------------------------------- /week4/Seminar4.0_recap_approx_qlearning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Approximate q-learning\n", 8 | "\n", 9 | "In this notebook you will teach a lasagne neural network to do 
Q-learning." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "__Frameworks__ - we'll accept this homework in any deep learning framework. For example, it translates to TensorFlow almost line-to-line. However, we recommend you to stick to theano/lasagne unless you're certain about your skills in the framework of your choice." 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": { 23 | "collapsed": false 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "%env THEANO_FLAGS='floatX=float32'\n", 28 | "import os\n", 29 | "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\"))==0:\n", 30 | " !bash ../xvfb start\n", 31 | " %env DISPLAY=:1" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "import gym\n", 43 | "import numpy as np, pandas as pd\n", 44 | "import matplotlib.pyplot as plt\n", 45 | "%matplotlib inline" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": { 52 | "collapsed": false, 53 | "scrolled": false 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "env = gym.make(\"CartPole-v0\")\n", 58 | "env.reset()\n", 59 | "n_actions = env.action_space.n\n", 60 | "state_dim = env.observation_space.shape\n", 61 | "\n", 62 | "plt.imshow(env.render(\"rgb_array\"))" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "# Approximate (deep) Q-learning: building the network\n", 70 | "\n", 71 | "In this section we will build and train naive Q-learning with theano/lasagne" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "First step is initializing input variables" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": { 85 | "collapsed": false 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "import theano\n", 90 | "import theano.tensor as T\n", 91 | "\n", 92 | "#create input variables. 
We'll support multiple states at once\n", 93 | "\n", 94 | "\n", 95 | "current_states = T.matrix(\"states[batch,units]\")\n", 96 | "actions = T.ivector(\"action_ids[batch]\")\n", 97 | "rewards = T.vector(\"rewards[batch]\")\n", 98 | "next_states = T.matrix(\"next states[batch,units]\")\n", 99 | "is_end = T.ivector(\"vector[batch] where 1 means that session just ended\")" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": { 106 | "collapsed": false 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "import lasagne\n", 111 | "from lasagne.layers import *\n", 112 | "\n", 113 | "#input layer\n", 114 | "l_states = InputLayer((None,)+state_dim)\n", 115 | "\n", 116 | "\n", 117 | "\n", 118 | "\n", 119 | "\n", 120 | "#output layer\n", 121 | "l_qvalues = DenseLayer(,num_units=n_actions,nonlinearity=None)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "#### Predicting Q-values for `current_states`" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "collapsed": false 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "#get q-values for ALL actions in current_states\n", 140 | "predicted_qvalues = get_output(l_qvalues,{l_states:current_states})" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": { 147 | "collapsed": true 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "#compiling agent's \"GetQValues\" function\n", 152 | "get_qvalues = " 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": { 159 | "collapsed": true 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "#select q-values for chosen actions\n", 164 | "predicted_qvalues_for_actions = predicted_qvalues[T.arange(actions.shape[0]),actions]" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "#### Loss function and `update`\n", 172 | "Here we write a function similar to `agent.update`." 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": { 179 | "collapsed": true 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "#predict q-values for next states\n", 184 | "predicted_next_qvalues = get_output(l_qvalues,{l_states:})\n", 185 | "\n", 186 | "\n", 187 | "#Computing target q-values under \n", 188 | "gamma = 0.99\n", 189 | "target_qvalues_for_actions = \n", 190 | "\n", 191 | "#zero-out q-values at the end\n", 192 | "target_qvalues_for_actions = (1-is_end)*target_qvalues_for_actions\n", 193 | "\n", 194 | "#don't compute gradient over target q-values (consider constant)\n", 195 | "target_qvalues_for_actions = theano.gradient.disconnected_grad(target_qvalues_for_actions)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": { 202 | "collapsed": false 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "\n", 207 | "#mean squared error loss function\n", 208 | "loss = \n" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": { 215 | "collapsed": true 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "#all network weights\n", 220 | "all_weights = get_all_params(l_qvalues,trainable=True)\n", 221 | "\n", 222 | "#network updates. 
Note the small learning rate (for stability)\n", 223 | "updates = lasagne.updates.sgd(loss,all_weights,learning_rate=1e-4)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": { 230 | "collapsed": false 231 | }, 232 | "outputs": [], 233 | "source": [ 234 | "#Training function that resembles agent.update(state,action,reward,next_state) \n", 235 | "#with 1 more argument meaning is_end\n", 236 | "train_step = theano.function([current_states,actions,rewards,next_states,is_end],\n", 237 | " updates=updates)" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "### Playing the game" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": { 251 | "collapsed": false 252 | }, 253 | "outputs": [], 254 | "source": [ 255 | "epsilon = 0.25 #initial epsilon\n", 256 | "\n", 257 | "def generate_session(t_max=1000):\n", 258 | " \"\"\"play env with approximate q-learning agent and train it at the same time\"\"\"\n", 259 | " \n", 260 | " total_reward = 0\n", 261 | " s = env.reset()\n", 262 | " \n", 263 | " for t in range(t_max):\n", 264 | " \n", 265 | " #get action q-values from the network\n", 266 | " q_values = get_qvalues([s])[0] \n", 267 | " \n", 268 | " a = \n", 269 | " \n", 270 | " new_s,r,done,info = env.step(a)\n", 271 | " \n", 272 | " #train agent one step. Note that we use one-element arrays instead of scalars \n", 273 | " #because that's what function accepts.\n", 274 | " train_step([s],[a],[r],[new_s],[done])\n", 275 | " \n", 276 | " total_reward+=r\n", 277 | " \n", 278 | " s = new_s\n", 279 | " if done: break\n", 280 | " \n", 281 | " return total_reward\n", 282 | " " 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": { 289 | "collapsed": false 290 | }, 291 | "outputs": [], 292 | "source": [ 293 | "for i in range(100):\n", 294 | " \n", 295 | " rewards = [generate_session() for _ in range(100)] #generate new sessions\n", 296 | " \n", 297 | " epsilon*=0.95\n", 298 | " \n", 299 | " print (\"mean reward:%.3f\\tepsilon:%.5f\"%(np.mean(rewards),epsilon))\n", 300 | "\n", 301 | " if np.mean(rewards) > 300:\n", 302 | " print (\"You Win!\")\n", 303 | " break\n", 304 | " \n", 305 | " assert epsilon!=0, \"Please explore environment\"" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "### Video" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": { 319 | "collapsed": true 320 | }, 321 | "outputs": [], 322 | "source": [ 323 | "epsilon=0 #Don't forget to reset epsilon back to initial value if you want to go on training" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": { 330 | "collapsed": false 331 | }, 332 | "outputs": [], 333 | "source": [ 334 | "#record sessions\n", 335 | "import gym.wrappers\n", 336 | "env = gym.wrappers.Monitor(env,directory=\"videos\",force=True)\n", 337 | "sessions = [generate_session() for _ in range(100)]\n", 338 | "env.close()\n", 339 | "#unwrap \n", 340 | "env = env.env.env\n", 341 | "#upload to gym\n", 342 | "#gym.upload(\"./videos/\",api_key=\"\") #you'll need me later\n", 343 | "\n", 344 | "#Warning! 
If you keep seeing error that reads something like\"DoubleWrapError\",\n", 345 | "#run env=gym.make(\"CartPole-v0\");env.reset();" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": { 352 | "collapsed": false 353 | }, 354 | "outputs": [], 355 | "source": [ 356 | "#show video\n", 357 | "from IPython.display import HTML\n", 358 | "import os\n", 359 | "\n", 360 | "video_names = list(filter(lambda s:s.endswith(\".mp4\"),os.listdir(\"./videos/\")))\n", 361 | "\n", 362 | "HTML(\"\"\"\n", 363 | "\n", 366 | "\"\"\".format(\"./videos/\"+video_names[-1])) #this may or may not be _last_ video. Try other indices" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": { 373 | "collapsed": true 374 | }, 375 | "outputs": [], 376 | "source": [] 377 | } 378 | ], 379 | "metadata": { 380 | "kernelspec": { 381 | "display_name": "Python 3", 382 | "language": "python", 383 | "name": "python3" 384 | }, 385 | "language_info": { 386 | "codemirror_mode": { 387 | "name": "ipython", 388 | "version": 3 389 | }, 390 | "file_extension": ".py", 391 | "mimetype": "text/x-python", 392 | "name": "python", 393 | "nbconvert_exporter": "python", 394 | "pygments_lexer": "ipython3", 395 | "version": "3.6.0" 396 | } 397 | }, 398 | "nbformat": 4, 399 | "nbformat_minor": 0 400 | } 401 | -------------------------------------------------------------------------------- /week2/assignment/graphicsCrawlerDisplay.py: -------------------------------------------------------------------------------- 1 | # graphicsCrawlerDisplay.py 2 | # ------------------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 
7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | import Tkinter 10 | import qlearningAgents 11 | import time 12 | import threading 13 | import sys 14 | import crawler 15 | #import pendulum 16 | import math 17 | from math import pi as PI 18 | 19 | robotType = 'crawler' 20 | 21 | class Application: 22 | 23 | def sigmoid(self, x): 24 | return 1.0 / (1.0 + 2.0 ** (-x)) 25 | 26 | def incrementSpeed(self, inc): 27 | self.tickTime *= inc 28 | # self.epsilon = min(1.0, self.epsilon) 29 | # self.epsilon = max(0.0,self.epsilon) 30 | # self.learner.setSpeed(self.epsilon) 31 | self.speed_label['text'] = 'Step Delay: %.5f' % (self.tickTime) 32 | 33 | def incrementEpsilon(self, inc): 34 | self.ep += inc 35 | self.epsilon = self.sigmoid(self.ep) 36 | self.learner.setEpsilon(self.epsilon) 37 | self.epsilon_label['text'] = 'Epsilon: %.3f' % (self.epsilon) 38 | 39 | def incrementGamma(self, inc): 40 | self.ga += inc 41 | self.gamma = self.sigmoid(self.ga) 42 | self.learner.setDiscount(self.gamma) 43 | self.gamma_label['text'] = 'Discount: %.3f' % (self.gamma) 44 | 45 | def incrementAlpha(self, inc): 46 | self.al += inc 47 | self.alpha = self.sigmoid(self.al) 48 | self.learner.setLearningRate(self.alpha) 49 | self.alpha_label['text'] = 'Learning Rate: %.3f' % (self.alpha) 50 | 51 | def __initGUI(self, win): 52 | ## Window ## 53 | self.win = win 54 | 55 | ## Initialize Frame ## 56 | win.grid() 57 | self.dec = -.5 58 | self.inc = .5 59 | self.tickTime = 0.1 60 | 61 | ## Epsilon Button + Label ## 62 | self.setupSpeedButtonAndLabel(win) 63 | 64 | self.setupEpsilonButtonAndLabel(win) 65 | 66 | ## Gamma Button + Label ## 67 | self.setUpGammaButtonAndLabel(win) 68 | 69 | ## Alpha Button + Label ## 70 | self.setupAlphaButtonAndLabel(win) 71 | 72 | ## Exit Button ## 73 | #self.exit_button = Tkinter.Button(win,text='Quit', command=self.exit) 74 | #self.exit_button.grid(row=0, column=9) 75 | 76 | ## Simulation Buttons ## 77 | # self.setupSimulationButtons(win) 78 | 79 | ## Canvas ## 80 | self.canvas = Tkinter.Canvas(root, height=200, width=1000) 81 | self.canvas.grid(row=2,columnspan=10) 82 | 83 | def setupAlphaButtonAndLabel(self, win): 84 | self.alpha_minus = Tkinter.Button(win, 85 | text="-",command=(lambda: self.incrementAlpha(self.dec))) 86 | self.alpha_minus.grid(row=1, column=3, padx=10) 87 | 88 | self.alpha = self.sigmoid(self.al) 89 | self.alpha_label = Tkinter.Label(win, text='Learning Rate: %.3f' % (self.alpha)) 90 | self.alpha_label.grid(row=1, column=4) 91 | 92 | self.alpha_plus = Tkinter.Button(win, 93 | text="+",command=(lambda: self.incrementAlpha(self.inc))) 94 | self.alpha_plus.grid(row=1, column=5, padx=10) 95 | 96 | def setUpGammaButtonAndLabel(self, win): 97 | self.gamma_minus = Tkinter.Button(win, 98 | text="-",command=(lambda: self.incrementGamma(self.dec))) 99 | self.gamma_minus.grid(row=1, column=0, padx=10) 100 | 101 | self.gamma = self.sigmoid(self.ga) 102 | self.gamma_label = Tkinter.Label(win, text='Discount: %.3f' % (self.gamma)) 103 | self.gamma_label.grid(row=1, column=1) 104 | 105 | self.gamma_plus = Tkinter.Button(win, 106 | text="+",command=(lambda: self.incrementGamma(self.inc))) 107 | self.gamma_plus.grid(row=1, column=2, padx=10) 108 | 109 | def setupEpsilonButtonAndLabel(self, win): 110 | self.epsilon_minus = Tkinter.Button(win, 111 | text="-",command=(lambda: self.incrementEpsilon(self.dec))) 112 | self.epsilon_minus.grid(row=0, column=3) 113 | 114 | self.epsilon = self.sigmoid(self.ep) 115 | self.epsilon_label = Tkinter.Label(win, 
text='Epsilon: %.3f' % (self.epsilon)) 116 | self.epsilon_label.grid(row=0, column=4) 117 | 118 | self.epsilon_plus = Tkinter.Button(win, 119 | text="+",command=(lambda: self.incrementEpsilon(self.inc))) 120 | self.epsilon_plus.grid(row=0, column=5) 121 | 122 | def setupSpeedButtonAndLabel(self, win): 123 | self.speed_minus = Tkinter.Button(win, 124 | text="-",command=(lambda: self.incrementSpeed(.5))) 125 | self.speed_minus.grid(row=0, column=0) 126 | 127 | self.speed_label = Tkinter.Label(win, text='Step Delay: %.5f' % (self.tickTime)) 128 | self.speed_label.grid(row=0, column=1) 129 | 130 | self.speed_plus = Tkinter.Button(win, 131 | text="+",command=(lambda: self.incrementSpeed(2))) 132 | self.speed_plus.grid(row=0, column=2) 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | def skip5kSteps(self): 141 | self.stepsToSkip = 5000 142 | 143 | def __init__(self, win): 144 | 145 | self.ep = 0 146 | self.ga = 2 147 | self.al = 2 148 | self.stepCount = 0 149 | ## Init Gui 150 | 151 | self.__initGUI(win) 152 | 153 | # Init environment 154 | if robotType == 'crawler': 155 | self.robot = crawler.CrawlingRobot(self.canvas) 156 | self.robotEnvironment = crawler.CrawlingRobotEnvironment(self.robot) 157 | elif robotType == 'pendulum': 158 | self.robot = pendulum.PendulumRobot(self.canvas) 159 | self.robotEnvironment = \ 160 | pendulum.PendulumRobotEnvironment(self.robot) 161 | else: 162 | raise "Unknown RobotType" 163 | 164 | # Init Agent 165 | simulationFn = lambda agent: \ 166 | simulation.SimulationEnvironment(self.robotEnvironment,agent) 167 | actionFn = lambda state: \ 168 | self.robotEnvironment.getPossibleActions(state) 169 | self.learner = qlearningAgents.QLearningAgent(actionFn=actionFn) 170 | 171 | self.learner.setEpsilon(self.epsilon) 172 | self.learner.setLearningRate(self.alpha) 173 | self.learner.setDiscount(self.gamma) 174 | 175 | # Start GUI 176 | self.running = True 177 | self.stopped = False 178 | self.stepsToSkip = 0 179 | self.thread = threading.Thread(target=self.run) 180 | self.thread.start() 181 | 182 | 183 | def exit(self): 184 | self.running = False 185 | for i in range(5): 186 | if not self.stopped: 187 | # print "Waiting for thread to die..." 188 | time.sleep(0.1) 189 | self.win.destroy() 190 | sys.exit(0) 191 | 192 | def step(self): 193 | 194 | self.stepCount += 1 195 | 196 | state = self.robotEnvironment.getCurrentState() 197 | actions = self.robotEnvironment.getPossibleActions(state) 198 | if len(actions) == 0.0: 199 | self.robotEnvironment.reset() 200 | state = self.robotEnvironment.getCurrentState() 201 | actions = self.robotEnvironment.getPossibleActions(state) 202 | print 'Reset!' 
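        # getAction below is expected to be epsilon-greedy (see
        # qlearningAgents.QLearningAgent.getAction): with probability
        # self.epsilon a random legal action is taken, otherwise the
        # greedy action argmax_a Q(state, a).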
203 | action = self.learner.getAction(state) 204 | if action == None: 205 | raise 'None action returned: Code Not Complete' 206 | nextState, reward = self.robotEnvironment.doAction(action) 207 | self.learner.observeTransition(state, action, nextState, reward) 208 | 209 | def animatePolicy(self): 210 | if robotType != 'pendulum': 211 | raise 'Only pendulum can animatePolicy' 212 | 213 | 214 | totWidth = self.canvas.winfo_reqwidth() 215 | totHeight = self.canvas.winfo_reqheight() 216 | 217 | length = 0.48 * min(totWidth, totHeight) 218 | x,y = totWidth-length-30, length+10 219 | 220 | 221 | 222 | angleMin, angleMax = self.robot.getMinAndMaxAngle() 223 | velMin, velMax = self.robot.getMinAndMaxAngleVelocity() 224 | 225 | if not 'animatePolicyBox' in dir(self): 226 | self.canvas.create_line(x,y,x+length,y) 227 | self.canvas.create_line(x+length,y,x+length,y-length) 228 | self.canvas.create_line(x+length,y-length,x,y-length) 229 | self.canvas.create_line(x,y-length,x,y) 230 | self.animatePolicyBox = 1 231 | self.canvas.create_text(x+length/2,y+10,text='angle') 232 | self.canvas.create_text(x-30,y-length/2,text='velocity') 233 | self.canvas.create_text(x-60,y-length/4,text='Blue = kickLeft') 234 | self.canvas.create_text(x-60,y-length/4+20,text='Red = kickRight') 235 | self.canvas.create_text(x-60,y-length/4+40,text='White = doNothing') 236 | 237 | 238 | 239 | angleDelta = (angleMax-angleMin) / 100 240 | velDelta = (velMax-velMin) / 100 241 | for i in range(100): 242 | angle = angleMin + i * angleDelta 243 | 244 | for j in range(100): 245 | vel = velMin + j * velDelta 246 | state = self.robotEnvironment.getState(angle,vel) 247 | max, argMax = None, None 248 | if not self.learner.seenState(state): 249 | argMax = 'unseen' 250 | else: 251 | for action in ('kickLeft','kickRight','doNothing'): 252 | qVal = self.learner.getQValue(state, action) 253 | if max == None or qVal > max: 254 | max, argMax = qVal, action 255 | if argMax != 'unseen': 256 | if argMax == 'kickLeft': 257 | color = 'blue' 258 | elif argMax == 'kickRight': 259 | color = 'red' 260 | elif argMax == 'doNothing': 261 | color = 'white' 262 | dx = length / 100.0 263 | dy = length / 100.0 264 | x0, y0 = x+i*dx, y-j*dy 265 | self.canvas.create_rectangle(x0,y0,x0+dx,y0+dy,fill=color) 266 | 267 | 268 | 269 | 270 | def run(self): 271 | self.stepCount = 0 272 | self.learner.startEpisode() 273 | while True: 274 | minSleep = .01 275 | tm = max(minSleep, self.tickTime) 276 | time.sleep(tm) 277 | self.stepsToSkip = int(tm / self.tickTime) - 1 278 | 279 | if not self.running: 280 | self.stopped = True 281 | return 282 | for i in range(self.stepsToSkip): 283 | self.step() 284 | self.stepsToSkip = 0 285 | self.step() 286 | # self.robot.draw() 287 | self.learner.stopEpisode() 288 | 289 | def start(self): 290 | self.win.mainloop() 291 | 292 | 293 | 294 | 295 | 296 | def run(): 297 | global root 298 | root = Tkinter.Tk() 299 | root.title( 'Crawler GUI' ) 300 | root.resizable( 0, 0 ) 301 | 302 | # root.mainloop() 303 | 304 | 305 | app = Application(root) 306 | def update_gui(): 307 | app.robot.draw(app.stepCount, app.tickTime) 308 | root.after(10, update_gui) 309 | update_gui() 310 | 311 | root.protocol( 'WM_DELETE_WINDOW', app.exit) 312 | app.start() 313 | 314 | -------------------------------------------------------------------------------- /week3.5/Seminar3.5-approx-qlearning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 
7 | "# Approximate q-learning\n", 8 | "\n", 9 | "In this notebook you will teach a lasagne neural network to do Q-learning." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "__Frameworks__ - we'll accept this homework in any deep learning framework. For example, it translates to TensorFlow almost line-to-line. However, we recommend you to stick to theano/lasagne unless you're certain about your skills in the framework of your choice." 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 1, 22 | "metadata": { 23 | "collapsed": false 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "#XVFB will be launched if you run on a server\n", 28 | "import os\n", 29 | "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\"))==0:\n", 30 | " !bash ../xvfb start\n", 31 | " %env DISPLAY=:1" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "import gym\n", 43 | "import numpy as np, pandas as pd\n", 44 | "import matplotlib.pyplot as plt\n", 45 | "%matplotlib inline" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": { 52 | "collapsed": false, 53 | "scrolled": false 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "env = gym.make(\"CartPole-v0\")\n", 58 | "env.reset()\n", 59 | "n_actions = env.action_space.n\n", 60 | "state_dim = env.observation_space.shape\n", 61 | "\n", 62 | "plt.imshow(env.render(\"rgb_array\"))" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "# Approximate (deep) Q-learning: building the network\n", 70 | "\n", 71 | "In this section we will build and train naive Q-learning with theano/lasagne" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "First step is initializing input variables" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": { 85 | "collapsed": false 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "import theano\n", 90 | "import theano.tensor as T\n", 91 | "\n", 92 | "#create input variables. 
We'll support multiple states at once\n", 93 | "\n", 94 | "\n", 95 | "current_states = T.matrix(\"states[batch,units]\")\n", 96 | "actions = T.ivector(\"action_ids[batch]\")\n", 97 | "rewards = T.vector(\"rewards[batch]\")\n", 98 | "next_states = T.matrix(\"next states[batch,units]\")\n", 99 | "is_end = T.ivector(\"vector[batch] where 1 means that session just ended\")" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": { 106 | "collapsed": false 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "import lasagne\n", 111 | "from lasagne.layers import *\n", 112 | "\n", 113 | "#input layer\n", 114 | "l_states = InputLayer((None,)+state_dim)\n", 115 | "\n", 116 | "\n", 117 | "\n", 118 | "\n", 119 | "\n", 120 | "#output layer\n", 121 | "l_qvalues = DenseLayer(,num_units=n_actions,nonlinearity=None)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "#### Predicting Q-values for `current_states`" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "collapsed": false 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "#get q-values for ALL actions in current_states\n", 140 | "predicted_qvalues = get_output(l_qvalues,{l_states:current_states})" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": { 147 | "collapsed": true 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "#compiling agent's \"GetQValues\" function\n", 152 | "get_qvalues = " 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": { 159 | "collapsed": true 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "#select q-values for chosen actions\n", 164 | "predicted_qvalues_for_actions = predicted_qvalues[T.arange(actions.shape[0]),actions]" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "#### Loss function and `update`\n", 172 | "Here we write a function similar to `agent.update`." 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": { 179 | "collapsed": true 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "#predict q-values for next states\n", 184 | "predicted_next_qvalues = get_output(l_qvalues,{l_states:})\n", 185 | "\n", 186 | "\n", 187 | "#Computing target q-values under \n", 188 | "gamma = 0.99\n", 189 | "target_qvalues_for_actions = \n", 190 | "\n", 191 | "#zero-out q-values at the end\n", 192 | "target_qvalues_for_actions = (1-is_end)*target_qvalues_for_actions\n", 193 | "\n", 194 | "#don't compute gradient over target q-values (consider constant)\n", 195 | "target_qvalues_for_actions = theano.gradient.disconnected_grad(target_qvalues_for_actions)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": { 202 | "collapsed": false 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "\n", 207 | "#mean squared error loss function\n", 208 | "loss = \n" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": { 215 | "collapsed": true 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "#all network weights\n", 220 | "all_weights = get_all_params(l_qvalues,trainable=True)\n", 221 | "\n", 222 | "#network updates. 
Note the small learning rate (for stability)\n", 223 | "updates = lasagne.updates.sgd(loss,all_weights,learning_rate=1e-4)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": { 230 | "collapsed": false 231 | }, 232 | "outputs": [], 233 | "source": [ 234 | "#Training function that resembles agent.update(state,action,reward,next_state) \n", 235 | "#with 1 more argument meaning is_end\n", 236 | "train_step = theano.function([current_states,actions,rewards,next_states,is_end],\n", 237 | " updates=updates)" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "### Playing the game" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": { 251 | "collapsed": false 252 | }, 253 | "outputs": [], 254 | "source": [ 255 | "epsilon = 0.25 #initial epsilon\n", 256 | "\n", 257 | "def generate_session(t_max=1000):\n", 258 | " \"\"\"play env with approximate q-learning agent and train it at the same time\"\"\"\n", 259 | " \n", 260 | " total_reward = 0\n", 261 | " s = env.reset()\n", 262 | " \n", 263 | " for t in range(t_max):\n", 264 | " \n", 265 | " #get action q-values from the network\n", 266 | " q_values = get_qvalues([s])[0] \n", 267 | " \n", 268 | " a = \n", 269 | " \n", 270 | " new_s,r,done,info = env.step(a)\n", 271 | " \n", 272 | " #train agent one step. Note that we use one-element arrays instead of scalars \n", 273 | " #because that's what function accepts.\n", 274 | " train_step([s],[a],[r],[new_s],[done])\n", 275 | " \n", 276 | " total_reward+=r\n", 277 | " \n", 278 | " s = new_s\n", 279 | " if done: break\n", 280 | " \n", 281 | " return total_reward\n", 282 | " " 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": { 289 | "collapsed": false 290 | }, 291 | "outputs": [], 292 | "source": [ 293 | "for i in range(100):\n", 294 | " \n", 295 | " rewards = [generate_session() for _ in range(100)] #generate new sessions\n", 296 | " \n", 297 | " epsilon*=0.95\n", 298 | " \n", 299 | " print (\"mean reward:%.3f\\tepsilon:%.5f\"%(np.mean(rewards),epsilon))\n", 300 | "\n", 301 | " if np.mean(rewards) > 300:\n", 302 | " print (\"You Win!\")\n", 303 | " break\n", 304 | " \n", 305 | " assert epsilon!=0, \"Please explore environment\"" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "### Video" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": { 319 | "collapsed": true 320 | }, 321 | "outputs": [], 322 | "source": [ 323 | "epsilon=0 #Don't forget to reset epsilon back to initial value if you want to go on training" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": { 330 | "collapsed": false 331 | }, 332 | "outputs": [], 333 | "source": [ 334 | "#record sessions\n", 335 | "import gym.wrappers\n", 336 | "env = gym.wrappers.Monitor(env,directory=\"videos\",force=True)\n", 337 | "sessions = [generate_session() for _ in range(100)]\n", 338 | "env.close()\n", 339 | "#unwrap \n", 340 | "env = env.env.env\n", 341 | "#upload to gym\n", 342 | "#gym.upload(\"./videos/\",api_key=\"\") #you'll need me later\n", 343 | "\n", 344 | "#Warning! 
If you keep seeing error that reads something like\"DoubleWrapError\",\n", 345 | "#run env=gym.make(\"CartPole-v0\");env.reset();" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": { 352 | "collapsed": false 353 | }, 354 | "outputs": [], 355 | "source": [ 356 | "#show video\n", 357 | "from IPython.display import HTML\n", 358 | "import os\n", 359 | "\n", 360 | "video_names = list(filter(lambda s:s.endswith(\".mp4\"),os.listdir(\"./videos/\")))\n", 361 | "\n", 362 | "HTML(\"\"\"\n", 363 | "\n", 366 | "\"\"\".format(\"./videos/\"+video_names[-1])) #this may or may not be _last_ video. Try other indices" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": { 372 | "collapsed": true 373 | }, 374 | "source": [ 375 | "### Homework\n", 376 | "\n", 377 | "Two paths lie ahead of you, and which one to take is a rightfull choice of yours.\n", 378 | "\n", 379 | "* __[recommended]__ Go deeper. Return to seminar1 and get 99% accuracy on MNIST\n", 380 | "* __[alternative]__ Try approximate expected-value SARSA and other algorithms and compare it with q-learning \n", 381 | " * +3 points for EV-SARSA and comparison to Q-learning\n", 382 | " * +2 per additional algorithm\n", 383 | "* __[alternative hard]__ Pick `````` and solve it, using NN.\n", 384 | " * LunarLander, MountainCar or Breakout (from week1 bonus)\n", 385 | " * LunarLander should get at least +100\n", 386 | " * MountainCar should get at least -200\n", 387 | " * You will need to somehow stabilize learning\n", 388 | " \n" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "metadata": { 395 | "collapsed": true 396 | }, 397 | "outputs": [], 398 | "source": [] 399 | } 400 | ], 401 | "metadata": { 402 | "kernelspec": { 403 | "display_name": "Python [Root]", 404 | "language": "python", 405 | "name": "Python [Root]" 406 | }, 407 | "language_info": { 408 | "codemirror_mode": { 409 | "name": "ipython", 410 | "version": 2 411 | }, 412 | "file_extension": ".py", 413 | "mimetype": "text/x-python", 414 | "name": "python", 415 | "nbconvert_exporter": "python", 416 | "pygments_lexer": "ipython2", 417 | "version": "2.7.12" 418 | } 419 | }, 420 | "nbformat": 4, 421 | "nbformat_minor": 0 422 | } 423 | -------------------------------------------------------------------------------- /week2/assignment/graphicsUtils.py: -------------------------------------------------------------------------------- 1 | # graphicsUtils.py 2 | # ---------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 
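# For the "[alternative] approximate expected-value SARSA" homework option
# listed above, only the bootstrap term of the target changes: instead of
# max_a' Q(s',a'), take the expectation of Q(s',a') under the epsilon-greedy
# policy. A sketch reusing predicted_next_qvalues, rewards and gamma from the
# notebook; epsilon is baked into the graph as a constant here, so use a
# theano shared variable (or recompile) if you want to decay it during training.

import theano.tensor as T

epsilon = 0.25
expected_next_q = ((1.0 - epsilon) * T.max(predicted_next_qvalues, axis=1)
                   + epsilon * T.mean(predicted_next_qvalues, axis=1))
target_qvalues_for_actions = rewards + gamma * expected_next_q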
7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | import sys 10 | import math 11 | import random 12 | import string 13 | import time 14 | import types 15 | import Tkinter 16 | 17 | _Windows = sys.platform == 'win32' # True if on Win95/98/NT 18 | 19 | _root_window = None # The root window for graphics output 20 | _canvas = None # The canvas which holds graphics 21 | _canvas_xs = None # Size of canvas object 22 | _canvas_ys = None 23 | _canvas_x = None # Current position on canvas 24 | _canvas_y = None 25 | _canvas_col = None # Current colour (set to black below) 26 | _canvas_tsize = 12 27 | _canvas_tserifs = 0 28 | 29 | def formatColor(r, g, b): 30 | return '#%02x%02x%02x' % (int(r * 255), int(g * 255), int(b * 255)) 31 | 32 | def colorToVector(color): 33 | return map(lambda x: int(x, 16) / 256.0, [color[1:3], color[3:5], color[5:7]]) 34 | 35 | if _Windows: 36 | _canvas_tfonts = ['times new roman', 'lucida console'] 37 | else: 38 | _canvas_tfonts = ['times', 'lucidasans-24'] 39 | pass # XXX need defaults here 40 | 41 | def sleep(secs): 42 | global _root_window 43 | if _root_window == None: 44 | time.sleep(secs) 45 | else: 46 | _root_window.update_idletasks() 47 | _root_window.after(int(1000 * secs), _root_window.quit) 48 | _root_window.mainloop() 49 | 50 | def begin_graphics(width=640, height=480, color=formatColor(0, 0, 0), title=None): 51 | 52 | global _root_window, _canvas, _canvas_x, _canvas_y, _canvas_xs, _canvas_ys, _bg_color 53 | 54 | # Check for duplicate call 55 | if _root_window is not None: 56 | # Lose the window. 57 | _root_window.destroy() 58 | 59 | # Save the canvas size parameters 60 | _canvas_xs, _canvas_ys = width - 1, height - 1 61 | _canvas_x, _canvas_y = 0, _canvas_ys 62 | _bg_color = color 63 | 64 | # Create the root window 65 | _root_window = Tkinter.Tk() 66 | _root_window.protocol('WM_DELETE_WINDOW', _destroy_window) 67 | _root_window.title(title or 'Graphics Window') 68 | _root_window.resizable(0, 0) 69 | 70 | # Create the canvas object 71 | try: 72 | _canvas = Tkinter.Canvas(_root_window, width=width, height=height) 73 | _canvas.pack() 74 | draw_background() 75 | _canvas.update() 76 | except: 77 | _root_window = None 78 | raise 79 | 80 | # Bind to key-down and key-up events 81 | _root_window.bind( "", _keypress ) 82 | _root_window.bind( "", _keyrelease ) 83 | _root_window.bind( "", _clear_keys ) 84 | _root_window.bind( "", _clear_keys ) 85 | _root_window.bind( "", _leftclick ) 86 | _root_window.bind( "", _rightclick ) 87 | _root_window.bind( "", _rightclick ) 88 | _root_window.bind( "", _ctrl_leftclick) 89 | _clear_keys() 90 | 91 | _leftclick_loc = None 92 | _rightclick_loc = None 93 | _ctrl_leftclick_loc = None 94 | 95 | def _leftclick(event): 96 | global _leftclick_loc 97 | _leftclick_loc = (event.x, event.y) 98 | 99 | def _rightclick(event): 100 | global _rightclick_loc 101 | _rightclick_loc = (event.x, event.y) 102 | 103 | def _ctrl_leftclick(event): 104 | global _ctrl_leftclick_loc 105 | _ctrl_leftclick_loc = (event.x, event.y) 106 | 107 | def wait_for_click(): 108 | while True: 109 | global _leftclick_loc 110 | global _rightclick_loc 111 | global _ctrl_leftclick_loc 112 | if _leftclick_loc != None: 113 | val = _leftclick_loc 114 | _leftclick_loc = None 115 | return val, 'left' 116 | if _rightclick_loc != None: 117 | val = _rightclick_loc 118 | _rightclick_loc = None 119 | return val, 'right' 120 | if _ctrl_leftclick_loc != None: 121 | val = _ctrl_leftclick_loc 122 | _ctrl_leftclick_loc = None 123 | return val, 'ctrl_left' 
124 | sleep(0.05) 125 | 126 | def draw_background(): 127 | corners = [(0,0), (0, _canvas_ys), (_canvas_xs, _canvas_ys), (_canvas_xs, 0)] 128 | polygon(corners, _bg_color, fillColor=_bg_color, filled=True, smoothed=False) 129 | 130 | def _destroy_window(event=None): 131 | sys.exit(0) 132 | # global _root_window 133 | # _root_window.destroy() 134 | # _root_window = None 135 | #print "DESTROY" 136 | 137 | def end_graphics(): 138 | global _root_window, _canvas, _mouse_enabled 139 | try: 140 | try: 141 | sleep(1) 142 | if _root_window != None: 143 | _root_window.destroy() 144 | except SystemExit, e: 145 | print 'Ending graphics raised an exception:', e 146 | finally: 147 | _root_window = None 148 | _canvas = None 149 | _mouse_enabled = 0 150 | _clear_keys() 151 | 152 | def clear_screen(background=None): 153 | global _canvas_x, _canvas_y 154 | _canvas.delete('all') 155 | draw_background() 156 | _canvas_x, _canvas_y = 0, _canvas_ys 157 | 158 | def polygon(coords, outlineColor, fillColor=None, filled=1, smoothed=1, behind=0, width=1): 159 | c = [] 160 | for coord in coords: 161 | c.append(coord[0]) 162 | c.append(coord[1]) 163 | if fillColor == None: fillColor = outlineColor 164 | if filled == 0: fillColor = "" 165 | poly = _canvas.create_polygon(c, outline=outlineColor, fill=fillColor, smooth=smoothed, width=width) 166 | if behind > 0: 167 | _canvas.tag_lower(poly, behind) # Higher should be more visible 168 | return poly 169 | 170 | def square(pos, r, color, filled=1, behind=0): 171 | x, y = pos 172 | coords = [(x - r, y - r), (x + r, y - r), (x + r, y + r), (x - r, y + r)] 173 | return polygon(coords, color, color, filled, 0, behind=behind) 174 | 175 | def circle(pos, r, outlineColor, fillColor, endpoints=None, style='pieslice', width=2): 176 | x, y = pos 177 | x0, x1 = x - r - 1, x + r 178 | y0, y1 = y - r - 1, y + r 179 | if endpoints == None: 180 | e = [0, 359] 181 | else: 182 | e = list(endpoints) 183 | while e[0] > e[1]: e[1] = e[1] + 360 184 | 185 | return _canvas.create_arc(x0, y0, x1, y1, outline=outlineColor, fill=fillColor, 186 | extent=e[1] - e[0], start=e[0], style=style, width=width) 187 | 188 | def image(pos, file="../../blueghost.gif"): 189 | x, y = pos 190 | # img = PhotoImage(file=file) 191 | return _canvas.create_image(x, y, image = Tkinter.PhotoImage(file=file), anchor = Tkinter.NW) 192 | 193 | 194 | def refresh(): 195 | _canvas.update_idletasks() 196 | 197 | def moveCircle(id, pos, r, endpoints=None): 198 | global _canvas_x, _canvas_y 199 | 200 | x, y = pos 201 | # x0, x1 = x - r, x + r + 1 202 | # y0, y1 = y - r, y + r + 1 203 | x0, x1 = x - r - 1, x + r 204 | y0, y1 = y - r - 1, y + r 205 | if endpoints == None: 206 | e = [0, 359] 207 | else: 208 | e = list(endpoints) 209 | while e[0] > e[1]: e[1] = e[1] + 360 210 | 211 | edit(id, ('start', e[0]), ('extent', e[1] - e[0])) 212 | move_to(id, x0, y0) 213 | 214 | def edit(id, *args): 215 | _canvas.itemconfigure(id, **dict(args)) 216 | 217 | def text(pos, color, contents, font='Helvetica', size=12, style='normal', anchor="nw"): 218 | global _canvas_x, _canvas_y 219 | x, y = pos 220 | font = (font, str(size), style) 221 | return _canvas.create_text(x, y, fill=color, text=contents, font=font, anchor=anchor) 222 | 223 | def changeText(id, newText, font=None, size=12, style='normal'): 224 | _canvas.itemconfigure(id, text=newText) 225 | if font != None: 226 | _canvas.itemconfigure(id, font=(font, '-%d' % size, style)) 227 | 228 | def changeColor(id, newColor): 229 | _canvas.itemconfigure(id, fill=newColor) 230 | 231 | def line(here, 
there, color=formatColor(0, 0, 0), width=2): 232 | x0, y0 = here[0], here[1] 233 | x1, y1 = there[0], there[1] 234 | return _canvas.create_line(x0, y0, x1, y1, fill=color, width=width) 235 | 236 | ############################################################################## 237 | ### Keypress handling ######################################################## 238 | ############################################################################## 239 | 240 | # We bind to key-down and key-up events. 241 | 242 | _keysdown = {} 243 | _keyswaiting = {} 244 | # This holds an unprocessed key release. We delay key releases by up to 245 | # one call to keys_pressed() to get round a problem with auto repeat. 246 | _got_release = None 247 | 248 | def _keypress(event): 249 | global _got_release 250 | #remap_arrows(event) 251 | _keysdown[event.keysym] = 1 252 | _keyswaiting[event.keysym] = 1 253 | # print event.char, event.keycode 254 | _got_release = None 255 | 256 | def _keyrelease(event): 257 | global _got_release 258 | #remap_arrows(event) 259 | try: 260 | del _keysdown[event.keysym] 261 | except: 262 | pass 263 | _got_release = 1 264 | 265 | def remap_arrows(event): 266 | # TURN ARROW PRESSES INTO LETTERS (SHOULD BE IN KEYBOARD AGENT) 267 | if event.char in ['a', 's', 'd', 'w']: 268 | return 269 | if event.keycode in [37, 101]: # LEFT ARROW (win / x) 270 | event.char = 'a' 271 | if event.keycode in [38, 99]: # UP ARROW 272 | event.char = 'w' 273 | if event.keycode in [39, 102]: # RIGHT ARROW 274 | event.char = 'd' 275 | if event.keycode in [40, 104]: # DOWN ARROW 276 | event.char = 's' 277 | 278 | def _clear_keys(event=None): 279 | global _keysdown, _got_release, _keyswaiting 280 | _keysdown = {} 281 | _keyswaiting = {} 282 | _got_release = None 283 | 284 | def keys_pressed(d_o_e=Tkinter.tkinter.dooneevent, 285 | d_w=Tkinter.tkinter.DONT_WAIT): 286 | d_o_e(d_w) 287 | if _got_release: 288 | d_o_e(d_w) 289 | return _keysdown.keys() 290 | 291 | def keys_waiting(): 292 | global _keyswaiting 293 | keys = _keyswaiting.keys() 294 | _keyswaiting = {} 295 | return keys 296 | 297 | # Block for a list of keys... 
298 | 299 | def wait_for_keys(): 300 | keys = [] 301 | while keys == []: 302 | keys = keys_pressed() 303 | sleep(0.05) 304 | return keys 305 | 306 | def remove_from_screen(x, 307 | d_o_e=Tkinter.tkinter.dooneevent, 308 | d_w=Tkinter.tkinter.DONT_WAIT): 309 | _canvas.delete(x) 310 | d_o_e(d_w) 311 | 312 | def _adjust_coords(coord_list, x, y): 313 | for i in range(0, len(coord_list), 2): 314 | coord_list[i] = coord_list[i] + x 315 | coord_list[i + 1] = coord_list[i + 1] + y 316 | return coord_list 317 | 318 | def move_to(object, x, y=None, 319 | d_o_e=Tkinter.tkinter.dooneevent, 320 | d_w=Tkinter.tkinter.DONT_WAIT): 321 | if y is None: 322 | try: x, y = x 323 | except: raise 'incomprehensible coordinates' 324 | 325 | horiz = True 326 | newCoords = [] 327 | current_x, current_y = _canvas.coords(object)[0:2] # first point 328 | for coord in _canvas.coords(object): 329 | if horiz: 330 | inc = x - current_x 331 | else: 332 | inc = y - current_y 333 | horiz = not horiz 334 | 335 | newCoords.append(coord + inc) 336 | 337 | _canvas.coords(object, *newCoords) 338 | d_o_e(d_w) 339 | 340 | def move_by(object, x, y=None, 341 | d_o_e=Tkinter.tkinter.dooneevent, 342 | d_w=Tkinter.tkinter.DONT_WAIT): 343 | if y is None: 344 | try: x, y = x 345 | except: raise Exception, 'incomprehensible coordinates' 346 | 347 | horiz = True 348 | newCoords = [] 349 | for coord in _canvas.coords(object): 350 | if horiz: 351 | inc = x 352 | else: 353 | inc = y 354 | horiz = not horiz 355 | 356 | newCoords.append(coord + inc) 357 | 358 | _canvas.coords(object, *newCoords) 359 | d_o_e(d_w) 360 | 361 | def writePostscript(filename): 362 | "Writes the current canvas to a postscript file." 363 | psfile = file(filename, 'w') 364 | psfile.write(_canvas.postscript(pageanchor='sw', 365 | y='0.c', 366 | x='0.c')) 367 | psfile.close() 368 | 369 | ghost_shape = [ 370 | (0, - 0.5), 371 | (0.25, - 0.75), 372 | (0.5, - 0.5), 373 | (0.75, - 0.75), 374 | (0.75, 0.5), 375 | (0.5, 0.75), 376 | (- 0.5, 0.75), 377 | (- 0.75, 0.5), 378 | (- 0.75, - 0.75), 379 | (- 0.5, - 0.5), 380 | (- 0.25, - 0.75) 381 | ] 382 | 383 | if __name__ == '__main__': 384 | begin_graphics() 385 | clear_screen() 386 | ghost_shape = [(x * 10 + 20, y * 10 + 20) for x, y in ghost_shape] 387 | g = polygon(ghost_shape, formatColor(1, 1, 1)) 388 | move_to(g, (50, 50)) 389 | circle((150, 150), 20, formatColor(0.7, 0.3, 0.0), endpoints=[15, - 15]) 390 | sleep(2) -------------------------------------------------------------------------------- /week2/assignment/textGridworldDisplay.py: -------------------------------------------------------------------------------- 1 | # textGridworldDisplay.py 2 | # ----------------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 
7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | import util 10 | 11 | class TextGridworldDisplay: 12 | 13 | def __init__(self, gridworld): 14 | self.gridworld = gridworld 15 | 16 | def start(self): 17 | pass 18 | 19 | def pause(self): 20 | pass 21 | 22 | def displayValues(self, agent, currentState = None, message = None): 23 | if message != None: 24 | print message 25 | values = util.Counter() 26 | policy = {} 27 | states = self.gridworld.getStates() 28 | for state in states: 29 | values[state] = agent.getValue(state) 30 | policy[state] = agent.getPolicy(state) 31 | prettyPrintValues(self.gridworld, values, policy, currentState) 32 | 33 | def displayNullValues(self, agent, currentState = None, message = None): 34 | if message != None: print message 35 | prettyPrintNullValues(self.gridworld, currentState) 36 | 37 | def displayQValues(self, agent, currentState = None, message = None): 38 | if message != None: print message 39 | qValues = util.Counter() 40 | states = self.gridworld.getStates() 41 | for state in states: 42 | for action in self.gridworld.getPossibleActions(state): 43 | qValues[(state, action)] = agent.getQValue(state, action) 44 | prettyPrintQValues(self.gridworld, qValues, currentState) 45 | 46 | 47 | def prettyPrintValues(gridWorld, values, policy=None, currentState = None): 48 | grid = gridWorld.grid 49 | maxLen = 11 50 | newRows = [] 51 | for y in range(grid.height): 52 | newRow = [] 53 | for x in range(grid.width): 54 | state = (x, y) 55 | value = values[state] 56 | action = None 57 | if policy != None and state in policy: 58 | action = policy[state] 59 | actions = gridWorld.getPossibleActions(state) 60 | if action not in actions and 'exit' in actions: 61 | action = 'exit' 62 | valString = None 63 | if action == 'exit': 64 | valString = border('%.2f' % value) 65 | else: 66 | valString = '\n\n%.2f\n\n' % value 67 | valString += ' '*maxLen 68 | if grid[x][y] == 'S': 69 | valString = '\n\nS: %.2f\n\n' % value 70 | valString += ' '*maxLen 71 | if grid[x][y] == '#': 72 | valString = '\n#####\n#####\n#####\n' 73 | valString += ' '*maxLen 74 | pieces = [valString] 75 | text = ("\n".join(pieces)).split('\n') 76 | if currentState == state: 77 | l = len(text[1]) 78 | if l == 0: 79 | text[1] = '*' 80 | else: 81 | text[1] = "|" + ' ' * int((l-1)/2-1) + '*' + ' ' * int((l)/2-1) + "|" 82 | if action == 'east': 83 | text[2] = ' ' + text[2] + ' >' 84 | elif action == 'west': 85 | text[2] = '< ' + text[2] + ' ' 86 | elif action == 'north': 87 | text[0] = ' ' * int(maxLen/2) + '^' +' ' * int(maxLen/2) 88 | elif action == 'south': 89 | text[4] = ' ' * int(maxLen/2) + 'v' +' ' * int(maxLen/2) 90 | newCell = "\n".join(text) 91 | newRow.append(newCell) 92 | newRows.append(newRow) 93 | numCols = grid.width 94 | for rowNum, row in enumerate(newRows): 95 | row.insert(0,"\n\n"+str(rowNum)) 96 | newRows.reverse() 97 | colLabels = [str(colNum) for colNum in range(numCols)] 98 | colLabels.insert(0,' ') 99 | finalRows = [colLabels] + newRows 100 | print indent(finalRows,separateRows=True,delim='|', prefix='|',postfix='|', justify='center',hasHeader=True) 101 | 102 | 103 | def prettyPrintNullValues(gridWorld, currentState = None): 104 | grid = gridWorld.grid 105 | maxLen = 11 106 | newRows = [] 107 | for y in range(grid.height): 108 | newRow = [] 109 | for x in range(grid.width): 110 | state = (x, y) 111 | 112 | # value = values[state] 113 | 114 | action = None 115 | # if policy != None and state in policy: 116 | # action = policy[state] 117 | # 118 | actions = 
gridWorld.getPossibleActions(state) 119 | 120 | if action not in actions and 'exit' in actions: 121 | action = 'exit' 122 | 123 | valString = None 124 | # if action == 'exit': 125 | # valString = border('%.2f' % value) 126 | # else: 127 | # valString = '\n\n%.2f\n\n' % value 128 | # valString += ' '*maxLen 129 | 130 | if grid[x][y] == 'S': 131 | valString = '\n\nS\n\n' 132 | valString += ' '*maxLen 133 | elif grid[x][y] == '#': 134 | valString = '\n#####\n#####\n#####\n' 135 | valString += ' '*maxLen 136 | elif type(grid[x][y]) == float or type(grid[x][y]) == int: 137 | valString = border('%.2f' % float(grid[x][y])) 138 | else: valString = border(' ') 139 | pieces = [valString] 140 | 141 | text = ("\n".join(pieces)).split('\n') 142 | 143 | if currentState == state: 144 | l = len(text[1]) 145 | if l == 0: 146 | text[1] = '*' 147 | else: 148 | text[1] = "|" + ' ' * int((l-1)/2-1) + '*' + ' ' * int((l)/2-1) + "|" 149 | 150 | if action == 'east': 151 | text[2] = ' ' + text[2] + ' >' 152 | elif action == 'west': 153 | text[2] = '< ' + text[2] + ' ' 154 | elif action == 'north': 155 | text[0] = ' ' * int(maxLen/2) + '^' +' ' * int(maxLen/2) 156 | elif action == 'south': 157 | text[4] = ' ' * int(maxLen/2) + 'v' +' ' * int(maxLen/2) 158 | newCell = "\n".join(text) 159 | newRow.append(newCell) 160 | newRows.append(newRow) 161 | numCols = grid.width 162 | for rowNum, row in enumerate(newRows): 163 | row.insert(0,"\n\n"+str(rowNum)) 164 | newRows.reverse() 165 | colLabels = [str(colNum) for colNum in range(numCols)] 166 | colLabels.insert(0,' ') 167 | finalRows = [colLabels] + newRows 168 | print indent(finalRows,separateRows=True,delim='|', prefix='|',postfix='|', justify='center',hasHeader=True) 169 | 170 | def prettyPrintQValues(gridWorld, qValues, currentState=None): 171 | grid = gridWorld.grid 172 | maxLen = 11 173 | newRows = [] 174 | for y in range(grid.height): 175 | newRow = [] 176 | for x in range(grid.width): 177 | state = (x, y) 178 | actions = gridWorld.getPossibleActions(state) 179 | if actions == None or len(actions) == 0: 180 | actions = [None] 181 | bestQ = max([qValues[(state, action)] for action in actions]) 182 | bestActions = [action for action in actions if qValues[(state, action)] == bestQ] 183 | 184 | # display cell 185 | qStrings = dict([(action, "%.2f" % qValues[(state, action)]) for action in actions]) 186 | northString = ('north' in qStrings and qStrings['north']) or ' ' 187 | southString = ('south' in qStrings and qStrings['south']) or ' ' 188 | eastString = ('east' in qStrings and qStrings['east']) or ' ' 189 | westString = ('west' in qStrings and qStrings['west']) or ' ' 190 | exitString = ('exit' in qStrings and qStrings['exit']) or ' ' 191 | 192 | eastLen = len(eastString) 193 | westLen = len(westString) 194 | if eastLen < westLen: 195 | eastString = ' '*(westLen-eastLen)+eastString 196 | if westLen < eastLen: 197 | westString = westString+' '*(eastLen-westLen) 198 | 199 | if 'north' in bestActions: 200 | northString = '/'+northString+'\\' 201 | if 'south' in bestActions: 202 | southString = '\\'+southString+'/' 203 | if 'east' in bestActions: 204 | eastString = ''+eastString+'>' 205 | else: 206 | eastString = ''+eastString+' ' 207 | if 'west' in bestActions: 208 | westString = '<'+westString+'' 209 | else: 210 | westString = ' '+westString+'' 211 | if 'exit' in bestActions: 212 | exitString = '[ '+exitString+' ]' 213 | 214 | 215 | ewString = westString + " " + eastString 216 | if state == currentState: 217 | ewString = westString + " * " + eastString 218 | if state 
== gridWorld.getStartState(): 219 | ewString = westString + " S " + eastString 220 | if state == currentState and state == gridWorld.getStartState(): 221 | ewString = westString + " S:* " + eastString 222 | 223 | text = [northString, "\n"+exitString, ewString, ' '*maxLen+"\n", southString] 224 | 225 | if grid[x][y] == '#': 226 | text = ['', '\n#####\n#####\n#####', ''] 227 | 228 | newCell = "\n".join(text) 229 | newRow.append(newCell) 230 | newRows.append(newRow) 231 | numCols = grid.width 232 | for rowNum, row in enumerate(newRows): 233 | row.insert(0,"\n\n\n"+str(rowNum)) 234 | newRows.reverse() 235 | colLabels = [str(colNum) for colNum in range(numCols)] 236 | colLabels.insert(0,' ') 237 | finalRows = [colLabels] + newRows 238 | 239 | print indent(finalRows,separateRows=True,delim='|',prefix='|',postfix='|', justify='center',hasHeader=True) 240 | 241 | def border(text): 242 | length = len(text) 243 | pieces = ['-' * (length+2), '|'+' ' * (length+2)+'|', ' | '+text+' | ', '|'+' ' * (length+2)+'|','-' * (length+2)] 244 | return '\n'.join(pieces) 245 | 246 | # INDENTING CODE 247 | 248 | # Indenting code based on a post from George Sakkis 249 | # (http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/267662) 250 | 251 | import cStringIO,operator 252 | 253 | def indent(rows, hasHeader=False, headerChar='-', delim=' | ', justify='left', 254 | separateRows=False, prefix='', postfix='', wrapfunc=lambda x:x): 255 | """Indents a table by column. 256 | - rows: A sequence of sequences of items, one sequence per row. 257 | - hasHeader: True if the first row consists of the columns' names. 258 | - headerChar: Character to be used for the row separator line 259 | (if hasHeader==True or separateRows==True). 260 | - delim: The column delimiter. 261 | - justify: Determines how are data justified in their column. 262 | Valid values are 'left','right' and 'center'. 263 | - separateRows: True if rows are to be separated by a line 264 | of 'headerChar's. 265 | - prefix: A string prepended to each printed row. 266 | - postfix: A string appended to each printed row. 
267 | - wrapfunc: A function f(text) for wrapping text; each element in 268 | the table is first wrapped by this function.""" 269 | # closure for breaking logical rows to physical, using wrapfunc 270 | def rowWrapper(row): 271 | newRows = [wrapfunc(item).split('\n') for item in row] 272 | return [[substr or '' for substr in item] for item in map(None,*newRows)] 273 | # break each logical row into one or more physical ones 274 | logicalRows = [rowWrapper(row) for row in rows] 275 | # columns of physical rows 276 | columns = map(None,*reduce(operator.add,logicalRows)) 277 | # get the maximum of each column by the string length of its items 278 | maxWidths = [max([len(str(item)) for item in column]) for column in columns] 279 | rowSeparator = headerChar * (len(prefix) + len(postfix) + sum(maxWidths) + \ 280 | len(delim)*(len(maxWidths)-1)) 281 | # select the appropriate justify method 282 | justify = {'center':str.center, 'right':str.rjust, 'left':str.ljust}[justify.lower()] 283 | output=cStringIO.StringIO() 284 | if separateRows: print >> output, rowSeparator 285 | for physicalRows in logicalRows: 286 | for row in physicalRows: 287 | print >> output, \ 288 | prefix \ 289 | + delim.join([justify(str(item),width) for (item,width) in zip(row,maxWidths)]) \ 290 | + postfix 291 | if separateRows or hasHeader: print >> output, rowSeparator; hasHeader=False 292 | return output.getvalue() 293 | 294 | import math 295 | def wrap_always(text, width): 296 | """A simple word-wrap function that wraps text on exactly width characters. 297 | It doesn't split the text in words.""" 298 | return '\n'.join([ text[width*i:width*(i+1)] \ 299 | for i in xrange(int(math.ceil(1.*len(text)/width))) ]) 300 | 301 | 302 | # TEST OF DISPLAY CODE 303 | 304 | if __name__ == '__main__': 305 | import gridworld, util 306 | 307 | grid = gridworld.getCliffGrid3() 308 | print grid.getStates() 309 | 310 | policy = dict([(state,'east') for state in grid.getStates()]) 311 | values = util.Counter(dict([(state,1000.23) for state in grid.getStates()])) 312 | prettyPrintValues(grid, values, policy, currentState = (0,0)) 313 | 314 | stateCrossActions = [[(state, action) for action in grid.getPossibleActions(state)] for state in grid.getStates()] 315 | qStates = reduce(lambda x,y: x+y, stateCrossActions, []) 316 | qValues = util.Counter(dict([((state, action), 10.5) for state, action in qStates])) 317 | qValues = util.Counter(dict([((state, action), 10.5) for state, action in reduce(lambda x,y: x+y, stateCrossActions, [])])) 318 | prettyPrintQValues(grid, qValues, currentState = (0,0)) 319 | -------------------------------------------------------------------------------- /week2/assignment/graphicsGridworldDisplay.py: -------------------------------------------------------------------------------- 1 | # graphicsGridworldDisplay.py 2 | # --------------------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 
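# The indent() helper documented above (end of textGridworldDisplay.py) is a
# general ASCII table formatter used by the prettyPrint* functions. A minimal
# call with made-up rows, passing the same options those functions use; it
# assumes the assignment's util module is importable, since
# textGridworldDisplay imports it at module level.

from textGridworldDisplay import indent

rows = [[' ', '0', '1'],
        ['1', '0.72', '1.00'],
        ['0', '0.00', '0.43']]
print(indent(rows, hasHeader=True, separateRows=True,
             delim='|', prefix='|', postfix='|', justify='center'))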
7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | import util 10 | from graphicsUtils import * 11 | 12 | class GraphicsGridworldDisplay: 13 | 14 | def __init__(self, gridworld, size=120, speed=1.0): 15 | self.gridworld = gridworld 16 | self.size = size 17 | self.speed = speed 18 | 19 | def start(self): 20 | setup(self.gridworld, size=self.size) 21 | 22 | def pause(self): 23 | wait_for_keys() 24 | 25 | def displayValues(self, agent, currentState = None, message = 'Agent Values'): 26 | values = util.Counter() 27 | policy = {} 28 | states = self.gridworld.getStates() 29 | for state in states: 30 | values[state] = agent.getValue(state) 31 | policy[state] = agent.getPolicy(state) 32 | drawValues(self.gridworld, values, policy, currentState, message) 33 | sleep(0.05 / self.speed) 34 | 35 | def displayNullValues(self, currentState = None, message = ''): 36 | values = util.Counter() 37 | #policy = {} 38 | states = self.gridworld.getStates() 39 | for state in states: 40 | values[state] = 0.0 41 | #policy[state] = agent.getPolicy(state) 42 | drawNullValues(self.gridworld, currentState,'') 43 | # drawValues(self.gridworld, values, policy, currentState, message) 44 | sleep(0.05 / self.speed) 45 | 46 | def displayQValues(self, agent, currentState = None, message = 'Agent Q-Values'): 47 | qValues = util.Counter() 48 | states = self.gridworld.getStates() 49 | for state in states: 50 | for action in self.gridworld.getPossibleActions(state): 51 | qValues[(state, action)] = agent.getQValue(state, action) 52 | drawQValues(self.gridworld, qValues, currentState, message) 53 | sleep(0.05 / self.speed) 54 | 55 | BACKGROUND_COLOR = formatColor(0,0,0) 56 | EDGE_COLOR = formatColor(1,1,1) 57 | OBSTACLE_COLOR = formatColor(0.5,0.5,0.5) 58 | TEXT_COLOR = formatColor(1,1,1) 59 | MUTED_TEXT_COLOR = formatColor(0.7,0.7,0.7) 60 | LOCATION_COLOR = formatColor(0,0,1) 61 | 62 | WINDOW_SIZE = -1 63 | GRID_SIZE = -1 64 | GRID_HEIGHT = -1 65 | MARGIN = -1 66 | 67 | def setup(gridworld, title = "Gridworld Display", size = 120): 68 | global GRID_SIZE, MARGIN, SCREEN_WIDTH, SCREEN_HEIGHT, GRID_HEIGHT 69 | grid = gridworld.grid 70 | WINDOW_SIZE = size 71 | GRID_SIZE = size 72 | GRID_HEIGHT = grid.height 73 | MARGIN = GRID_SIZE * 0.75 74 | screen_width = (grid.width - 1) * GRID_SIZE + MARGIN * 2 75 | screen_height = (grid.height - 0.5) * GRID_SIZE + MARGIN * 2 76 | 77 | begin_graphics(screen_width, 78 | screen_height, 79 | BACKGROUND_COLOR, title=title) 80 | 81 | def drawNullValues(gridworld, currentState = None, message = ''): 82 | grid = gridworld.grid 83 | blank() 84 | for x in range(grid.width): 85 | for y in range(grid.height): 86 | state = (x, y) 87 | gridType = grid[x][y] 88 | isExit = (str(gridType) != gridType) 89 | isCurrent = (currentState == state) 90 | if gridType == '#': 91 | drawSquare(x, y, 0, 0, 0, None, None, True, False, isCurrent) 92 | else: 93 | drawNullSquare(gridworld.grid, x, y, False, isExit, isCurrent) 94 | pos = to_screen(((grid.width - 1.0) / 2.0, - 0.8)) 95 | text( pos, TEXT_COLOR, message, "Courier", -32, "bold", "c") 96 | 97 | 98 | def drawValues(gridworld, values, policy, currentState = None, message = 'State Values'): 99 | grid = gridworld.grid 100 | blank() 101 | valueList = [values[state] for state in gridworld.getStates()] + [0.0] 102 | minValue = min(valueList) 103 | maxValue = max(valueList) 104 | for x in range(grid.width): 105 | for y in range(grid.height): 106 | state = (x, y) 107 | gridType = grid[x][y] 108 | isExit = (str(gridType) != gridType) 109 | 
isCurrent = (currentState == state) 110 | if gridType == '#': 111 | drawSquare(x, y, 0, 0, 0, None, None, True, False, isCurrent) 112 | else: 113 | value = values[state] 114 | action = None 115 | if policy != None and state in policy: 116 | action = policy[state] 117 | actions = gridworld.getPossibleActions(state) 118 | if action not in actions and 'exit' in actions: 119 | action = 'exit' 120 | valString = '%.2f' % value 121 | drawSquare(x, y, value, minValue, maxValue, valString, action, False, isExit, isCurrent) 122 | pos = to_screen(((grid.width - 1.0) / 2.0, - 0.8)) 123 | text( pos, TEXT_COLOR, message, "Courier", -32, "bold", "c") 124 | 125 | def drawQValues(gridworld, qValues, currentState = None, message = 'State-Action Q-Values'): 126 | grid = gridworld.grid 127 | blank() 128 | stateCrossActions = [[(state, action) for action in gridworld.getPossibleActions(state)] for state in gridworld.getStates()] 129 | qStates = reduce(lambda x,y: x+y, stateCrossActions, []) 130 | qValueList = [qValues[(state, action)] for state, action in qStates] + [0.0] 131 | minValue = min(qValueList) 132 | maxValue = max(qValueList) 133 | for x in range(grid.width): 134 | for y in range(grid.height): 135 | state = (x, y) 136 | gridType = grid[x][y] 137 | isExit = (str(gridType) != gridType) 138 | isCurrent = (currentState == state) 139 | actions = gridworld.getPossibleActions(state) 140 | if actions == None or len(actions) == 0: 141 | actions = [None] 142 | bestQ = max([qValues[(state, action)] for action in actions]) 143 | bestActions = [action for action in actions if qValues[(state, action)] == bestQ] 144 | 145 | q = util.Counter() 146 | valStrings = {} 147 | for action in actions: 148 | v = qValues[(state, action)] 149 | q[action] += v 150 | valStrings[action] = '%.2f' % v 151 | if gridType == '#': 152 | drawSquare(x, y, 0, 0, 0, None, None, True, False, isCurrent) 153 | elif isExit: 154 | action = 'exit' 155 | value = q[action] 156 | valString = '%.2f' % value 157 | drawSquare(x, y, value, minValue, maxValue, valString, action, False, isExit, isCurrent) 158 | else: 159 | drawSquareQ(x, y, q, minValue, maxValue, valStrings, actions, isCurrent) 160 | pos = to_screen(((grid.width - 1.0) / 2.0, - 0.8)) 161 | text( pos, TEXT_COLOR, message, "Courier", -32, "bold", "c") 162 | 163 | 164 | def blank(): 165 | clear_screen() 166 | 167 | def drawNullSquare(grid,x, y, isObstacle, isTerminal, isCurrent): 168 | 169 | square_color = getColor(0, -1, 1) 170 | 171 | if isObstacle: 172 | square_color = OBSTACLE_COLOR 173 | 174 | (screen_x, screen_y) = to_screen((x, y)) 175 | square( (screen_x, screen_y), 176 | 0.5* GRID_SIZE, 177 | color = square_color, 178 | filled = 1, 179 | width = 1) 180 | 181 | square( (screen_x, screen_y), 182 | 0.5* GRID_SIZE, 183 | color = EDGE_COLOR, 184 | filled = 0, 185 | width = 3) 186 | 187 | if isTerminal and not isObstacle: 188 | square( (screen_x, screen_y), 189 | 0.4* GRID_SIZE, 190 | color = EDGE_COLOR, 191 | filled = 0, 192 | width = 2) 193 | text( (screen_x, screen_y), 194 | TEXT_COLOR, 195 | str(grid[x][y]), 196 | "Courier", -24, "bold", "c") 197 | 198 | 199 | text_color = TEXT_COLOR 200 | 201 | if not isObstacle and isCurrent: 202 | circle( (screen_x, screen_y), 0.1*GRID_SIZE, LOCATION_COLOR, fillColor=LOCATION_COLOR ) 203 | 204 | # if not isObstacle: 205 | # text( (screen_x, screen_y), text_color, valStr, "Courier", 24, "bold", "c") 206 | 207 | def drawSquare(x, y, val, min, max, valStr, action, isObstacle, isTerminal, isCurrent): 208 | 209 | square_color = getColor(val, min, max) 
210 | 211 | if isObstacle: 212 | square_color = OBSTACLE_COLOR 213 | 214 | (screen_x, screen_y) = to_screen((x, y)) 215 | square( (screen_x, screen_y), 216 | 0.5* GRID_SIZE, 217 | color = square_color, 218 | filled = 1, 219 | width = 1) 220 | square( (screen_x, screen_y), 221 | 0.5* GRID_SIZE, 222 | color = EDGE_COLOR, 223 | filled = 0, 224 | width = 3) 225 | if isTerminal and not isObstacle: 226 | square( (screen_x, screen_y), 227 | 0.4* GRID_SIZE, 228 | color = EDGE_COLOR, 229 | filled = 0, 230 | width = 2) 231 | 232 | 233 | if action == 'north': 234 | polygon( [(screen_x, screen_y - 0.45*GRID_SIZE), (screen_x+0.05*GRID_SIZE, screen_y-0.40*GRID_SIZE), (screen_x-0.05*GRID_SIZE, screen_y-0.40*GRID_SIZE)], EDGE_COLOR, filled = 1, smoothed = False) 235 | if action == 'south': 236 | polygon( [(screen_x, screen_y + 0.45*GRID_SIZE), (screen_x+0.05*GRID_SIZE, screen_y+0.40*GRID_SIZE), (screen_x-0.05*GRID_SIZE, screen_y+0.40*GRID_SIZE)], EDGE_COLOR, filled = 1, smoothed = False) 237 | if action == 'west': 238 | polygon( [(screen_x-0.45*GRID_SIZE, screen_y), (screen_x-0.4*GRID_SIZE, screen_y+0.05*GRID_SIZE), (screen_x-0.4*GRID_SIZE, screen_y-0.05*GRID_SIZE)], EDGE_COLOR, filled = 1, smoothed = False) 239 | if action == 'east': 240 | polygon( [(screen_x+0.45*GRID_SIZE, screen_y), (screen_x+0.4*GRID_SIZE, screen_y+0.05*GRID_SIZE), (screen_x+0.4*GRID_SIZE, screen_y-0.05*GRID_SIZE)], EDGE_COLOR, filled = 1, smoothed = False) 241 | 242 | 243 | text_color = TEXT_COLOR 244 | 245 | if not isObstacle and isCurrent: 246 | circle( (screen_x, screen_y), 0.1*GRID_SIZE, outlineColor=LOCATION_COLOR, fillColor=LOCATION_COLOR ) 247 | 248 | if not isObstacle: 249 | text( (screen_x, screen_y), text_color, valStr, "Courier", -30, "bold", "c") 250 | 251 | 252 | def drawSquareQ(x, y, qVals, minVal, maxVal, valStrs, bestActions, isCurrent): 253 | 254 | (screen_x, screen_y) = to_screen((x, y)) 255 | 256 | center = (screen_x, screen_y) 257 | nw = (screen_x-0.5*GRID_SIZE, screen_y-0.5*GRID_SIZE) 258 | ne = (screen_x+0.5*GRID_SIZE, screen_y-0.5*GRID_SIZE) 259 | se = (screen_x+0.5*GRID_SIZE, screen_y+0.5*GRID_SIZE) 260 | sw = (screen_x-0.5*GRID_SIZE, screen_y+0.5*GRID_SIZE) 261 | n = (screen_x, screen_y-0.5*GRID_SIZE+5) 262 | s = (screen_x, screen_y+0.5*GRID_SIZE-5) 263 | w = (screen_x-0.5*GRID_SIZE+5, screen_y) 264 | e = (screen_x+0.5*GRID_SIZE-5, screen_y) 265 | 266 | actions = qVals.keys() 267 | for action in actions: 268 | 269 | wedge_color = getColor(qVals[action], minVal, maxVal) 270 | 271 | if action == 'north': 272 | polygon( (center, nw, ne), wedge_color, filled = 1, smoothed = False) 273 | #text(n, text_color, valStr, "Courier", 8, "bold", "n") 274 | if action == 'south': 275 | polygon( (center, sw, se), wedge_color, filled = 1, smoothed = False) 276 | #text(s, text_color, valStr, "Courier", 8, "bold", "s") 277 | if action == 'east': 278 | polygon( (center, ne, se), wedge_color, filled = 1, smoothed = False) 279 | #text(e, text_color, valStr, "Courier", 8, "bold", "e") 280 | if action == 'west': 281 | polygon( (center, nw, sw), wedge_color, filled = 1, smoothed = False) 282 | #text(w, text_color, valStr, "Courier", 8, "bold", "w") 283 | 284 | square( (screen_x, screen_y), 285 | 0.5* GRID_SIZE, 286 | color = EDGE_COLOR, 287 | filled = 0, 288 | width = 3) 289 | line(ne, sw, color = EDGE_COLOR) 290 | line(nw, se, color = EDGE_COLOR) 291 | 292 | if isCurrent: 293 | circle( (screen_x, screen_y), 0.1*GRID_SIZE, LOCATION_COLOR, fillColor=LOCATION_COLOR ) 294 | 295 | for action in actions: 296 | text_color = TEXT_COLOR 297 
| if qVals[action] < max(qVals.values()): text_color = MUTED_TEXT_COLOR 298 | valStr = "" 299 | if action in valStrs: 300 | valStr = valStrs[action] 301 | h = -20 302 | if action == 'north': 303 | #polygon( (center, nw, ne), wedge_color, filled = 1, smooth = 0) 304 | text(n, text_color, valStr, "Courier", h, "bold", "n") 305 | if action == 'south': 306 | #polygon( (center, sw, se), wedge_color, filled = 1, smooth = 0) 307 | text(s, text_color, valStr, "Courier", h, "bold", "s") 308 | if action == 'east': 309 | #polygon( (center, ne, se), wedge_color, filled = 1, smooth = 0) 310 | text(e, text_color, valStr, "Courier", h, "bold", "e") 311 | if action == 'west': 312 | #polygon( (center, nw, sw), wedge_color, filled = 1, smooth = 0) 313 | text(w, text_color, valStr, "Courier", h, "bold", "w") 314 | 315 | 316 | def getColor(val, minVal, max): 317 | r, g = 0.0, 0.0 318 | if val < 0 and minVal < 0: 319 | r = val * 0.65 / minVal 320 | if val > 0 and max > 0: 321 | g = val * 0.65 / max 322 | return formatColor(r,g,0.0) 323 | 324 | 325 | def square(pos, size, color, filled, width): 326 | x, y = pos 327 | dx, dy = size, size 328 | return polygon([(x - dx, y - dy), (x - dx, y + dy), (x + dx, y + dy), (x + dx, y - dy)], outlineColor=color, fillColor=color, filled=filled, width=width, smoothed=False) 329 | 330 | 331 | def to_screen(point): 332 | ( gamex, gamey ) = point 333 | x = gamex*GRID_SIZE + MARGIN 334 | y = (GRID_HEIGHT - gamey - 1)*GRID_SIZE + MARGIN 335 | return ( x, y ) 336 | 337 | def to_grid(point): 338 | (x, y) = point 339 | x = int ((y - MARGIN + GRID_SIZE * 0.5) / GRID_SIZE) 340 | y = int ((x - MARGIN + GRID_SIZE * 0.5) / GRID_SIZE) 341 | print point, "-->", (x, y) 342 | return (x, y) 343 | --------------------------------------------------------------------------------