├── week2 ├── assignment │ ├── run_crawler.sh │ ├── layouts │ │ ├── trappedClassic.lay │ │ ├── minimaxClassic.lay │ │ ├── smallGrid.lay │ │ ├── mediumGrid.lay │ │ ├── testClassic.lay │ │ ├── capsuleClassic.lay │ │ ├── smallClassic.lay │ │ ├── contestClassic.lay │ │ ├── openClassic.lay │ │ ├── mediumClassic.lay │ │ ├── trickyClassic.lay │ │ └── originalClassic.lay │ ├── run_grid.sh │ ├── how2run │ ├── run_pacman.sh │ ├── environment.py │ ├── pacmanAgents.py │ ├── analysis.py │ ├── textDisplay.py │ ├── mdp.py │ ├── keyboardAgents.py │ ├── ghostAgents.py │ ├── featureExtractors.py │ ├── qlearningAgents.py │ ├── layout.py │ ├── learningAgents.py │ ├── graphicsCrawlerDisplay.py │ ├── graphicsUtils.py │ ├── textGridworldDisplay.py │ └── graphicsGridworldDisplay.py ├── homework_tips.md ├── alternative │ └── qlearning.py └── README.md ├── week9 ├── all_states.npy ├── action_rewards.npy ├── README.md └── bayes.py ├── docker ├── run_jupyter.sh ├── README.md └── Dockerfile ├── xvfb ├── Amazon GPU howto.md ├── week3 ├── README.md ├── sarsa.py ├── qlearning.py └── expected_value_sarsa.py ├── .gitignore ├── LICENSE.md ├── week0 └── README.md ├── week1 ├── README.md └── breakout.py ├── Dockerfile ├── week6.5 └── README.md ├── week6 └── README.md ├── week7 ├── rockpaperscissors.py └── README.md ├── youtube_dl_lectures.sh ├── week8 ├── README.md └── 8.2_bonus.ipynb ├── week3.5 ├── mnist.py ├── README.md ├── fix_my_nn.ipynb └── Seminar3.5-approx-qlearning.ipynb ├── week5 └── README.md ├── week4 ├── README.md └── Seminar4.0_recap_approx_qlearning.ipynb └── yet_another_week └── README.md /week2/assignment/run_crawler.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python crawler.py 3 | 4 | -------------------------------------------------------------------------------- /week9/all_states.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Practical_RL/master/week9/all_states.npy -------------------------------------------------------------------------------- /week2/assignment/layouts/trappedClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%% 2 | % P G% 3 | %G%%%%%% 4 | %.... 
% 5 | %%%%%%%% 6 | -------------------------------------------------------------------------------- /week2/assignment/run_grid.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python gridworld.py -a q -k 100 -n 0 -g BookGrid -e 0.5 3 | -------------------------------------------------------------------------------- /week9/action_rewards.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/Practical_RL/master/week9/action_rewards.npy -------------------------------------------------------------------------------- /docker/run_jupyter.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | jupyter notebook --no-browser --allow-root --ip 0.0.0.0 3 | 4 | -------------------------------------------------------------------------------- /week2/assignment/layouts/minimaxClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%% 2 | %.P G% 3 | % %.%G%%% 4 | %G %%% 5 | %%%%%%%%% 6 | -------------------------------------------------------------------------------- /week2/assignment/layouts/smallGrid.lay: -------------------------------------------------------------------------------- 1 | %%%%%%% 2 | % P % 3 | % %%% % 4 | % %. % 5 | % %%% % 6 | %. G % 7 | %%%%%%% 8 | -------------------------------------------------------------------------------- /week2/assignment/layouts/mediumGrid.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%% 2 | %P % 3 | % .% . % 4 | % % % 5 | % .% . % 6 | % G% 7 | %%%%%%%% 8 | -------------------------------------------------------------------------------- /week2/assignment/layouts/testClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%% 2 | % . % 3 | %.G.% 4 | % . % 5 | %. .% 6 | % % 7 | % .% 8 | % % 9 | %P .% 10 | %%%%% 11 | -------------------------------------------------------------------------------- /week2/assignment/layouts/capsuleClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%% 2 | %G. G ....% 3 | %.% % %%%%%% %.%%.% 4 | %.%o% % o% %.o%.% 5 | %.%%%.% %%% %..%.% 6 | %..... P %..%G% 7 | %%%%%%%%%%%%%%%%%%%% 8 | -------------------------------------------------------------------------------- /week2/assignment/layouts/smallClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%% 2 | %......%G G%......% 3 | %.%%...%% %%...%%.% 4 | %.%o.%........%.o%.% 5 | %.%%.%.%%%%%%.%.%%.% 6 | %........P.........% 7 | %%%%%%%%%%%%%%%%%%%% 8 | -------------------------------------------------------------------------------- /week2/assignment/layouts/contestClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%% 2 | %o...%........%...o% 3 | %.%%.%.%%..%%.%.%%.% 4 | %...... 
G GG%......% 5 | %.%.%%.%% %%%.%%.%.% 6 | %.%....% ooo%.%..%.% 7 | %.%.%%.% %% %.%.%%.% 8 | %o%......P....%....% 9 | %%%%%%%%%%%%%%%%%%%% 10 | -------------------------------------------------------------------------------- /week2/assignment/how2run: -------------------------------------------------------------------------------- 1 | python pacman.py -p PacmanQAgent -x 5000 -n 5010 -l smallGrid 2 | python pacman.py -p PacmanQAgent -x 10000 -n 10010 -l mediumGrid 3 | python pacman.py -p PacmanQAgent -x 100 -n 110 -l mediumClassic 4 | python gridworld.py -a q -k 50 -n 0 -g BridgeGrid -e 1 5 | python crawler.py 6 | 7 | -------------------------------------------------------------------------------- /week2/assignment/layouts/openClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%%%%%%% 2 | %.. P .... .... % 3 | %.. ... ... ... ... % 4 | %.. ... ... ... ... % 5 | %.. .... .... G % 6 | %.. ... ... ... ... % 7 | %.. ... ... ... ... % 8 | %.. .... .... o% 9 | %%%%%%%%%%%%%%%%%%%%%%%%% 10 | -------------------------------------------------------------------------------- /week2/assignment/layouts/mediumClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%% 2 | %o...%........%....% 3 | %.%%.%.%%%%%%.%.%%.% 4 | %.%..............%.% 5 | %.%.%%.%% %%.%%.%.% 6 | %......%G G%......% 7 | %.%.%%.%%%%%%.%%.%.% 8 | %.%..............%.% 9 | %.%%.%.%%%%%%.%.%%.% 10 | %....%...P....%...o% 11 | %%%%%%%%%%%%%%%%%%%% 12 | -------------------------------------------------------------------------------- /week2/assignment/run_pacman.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python pacman.py -p PacmanQAgent -x 1000 -n 1010 -l smallGrid 3 | 4 | # -x is the amount of training epochs, -n is the total amount of epochs. 5 | # hence, if you want to spend 1337 epochs training and then play 42 for evalution, you will need 6 | # python pacman.py -p PacmanQAgent -x 1337 -n 1379 -l smallGrid 7 | -------------------------------------------------------------------------------- /week2/assignment/layouts/trickyClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%% 2 | %o...%........%...o% 3 | %.%%.%.%%..%%.%.%%.% 4 | %.%.....%..%.....%.% 5 | %.%.%%.%% %%.%%.%.% 6 | %...... GGGG%.%....% 7 | %.%....%%%%%%.%..%.% 8 | %.%....% oo%.%..%.% 9 | %.%....% %%%%.%..%.% 10 | %.%...........%..%.% 11 | %.%%.%.%%%%%%.%.%%.% 12 | %o...%...P....%...o% 13 | %%%%%%%%%%%%%%%%%%%% 14 | -------------------------------------------------------------------------------- /xvfb: -------------------------------------------------------------------------------- 1 | #taken from https://gist.github.com/jterrace/2911875 2 | XVFB=/usr/bin/Xvfb 3 | XVFBARGS=":1 -screen 0 1024x768x24 -ac +extension GLX +render -noreset" 4 | PIDFILE=./xvfb.pid 5 | case "$1" in 6 | start) 7 | echo -n "Starting virtual X frame buffer: Xvfb" 8 | start-stop-daemon --start --quiet --pidfile $PIDFILE --make-pidfile --background --exec $XVFB -- $XVFBARGS 9 | echo "." 10 | ;; 11 | stop) 12 | echo -n "Stopping virtual X frame buffer: Xvfb" 13 | start-stop-daemon --stop --quiet --pidfile $PIDFILE 14 | echo "." 
15 | ;; 16 | restart) 17 | $0 stop 18 | $0 start 19 | ;; 20 | *) 21 | echo "Usage: /etc/init.d/xvfb {start|stop|restart}" 22 | exit 1 23 | esac 24 | 25 | exit 0 26 | -------------------------------------------------------------------------------- /Amazon GPU howto.md: -------------------------------------------------------------------------------- 1 | # How to set up GPU on EC2 instance 2 | 3 | ## Create EC2 instance 4 | 5 | Use `p2.xlarge` instance type and `ami-e00a8180` AMI image. [Details](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html) 6 | 7 | Open ports `22` (ssh) and `80` (http) on your freshly created instance, 8 | you create a [security group](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-network-security.html) 9 | and attach it your instance to get ports open 10 | 11 | ## Launch notebook 12 | 13 | Instance you have created contains all you need: fresh versions of theano, lasagne, CUDA driver and cuDNN, 14 | just lunch ipython and get hands dirty: 15 | 16 | ```bash 17 | $ sudo su 18 | $ export THEANO_FLAGS='cuda.root=/usr/local/cuda,device=gpu,floatX=float32' 19 | $ export PATH=/usr/local/cuda-8.0/bin${PATH:+:${PATH}} 20 | $ jupyter notebook 21 | ``` 22 | 23 | -------------------------------------------------------------------------------- /week2/assignment/layouts/originalClassic.lay: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | %............%%............% 3 | %.%%%%.%%%%%.%%.%%%%%.%%%%.% 4 | %o%%%%.%%%%%.%%.%%%%%.%%%%o% 5 | %.%%%%.%%%%%.%%.%%%%%.%%%%.% 6 | %..........................% 7 | %.%%%%.%%.%%%%%%%%.%%.%%%%.% 8 | %.%%%%.%%.%%%%%%%%.%%.%%%%.% 9 | %......%%....%%....%%......% 10 | %%%%%%.%%%%% %% %%%%%.%%%%%% 11 | %%%%%%.%%%%% %% %%%%%.%%%%%% 12 | %%%%%%.% %.%%%%%% 13 | %%%%%%.% %%%% %%%% %.%%%%%% 14 | % . %G GG G% . % 15 | %%%%%%.% %%%%%%%%%% %.%%%%%% 16 | %%%%%%.% %.%%%%%% 17 | %%%%%%.% %%%%%%%%%% %.%%%%%% 18 | %............%%............% 19 | %.%%%%.%%%%%.%%.%%%%%.%%%%.% 20 | %.%%%%.%%%%%.%%.%%%%%.%%%%.% 21 | %o..%%....... .......%%..o% 22 | %%%.%%.%%.%%%%%%%%.%%.%%.%%% 23 | %%%.%%.%%.%%%%%%%%.%%.%%.%%% 24 | %......%%....%%....%%......% 25 | %.%%%%%%%%%%.%%.%%%%%%%%%%.% 26 | %.............P............% 27 | %%%%%%%%%%%%%%%%%%%%%%%%%%%% 28 | -------------------------------------------------------------------------------- /week3/README.md: -------------------------------------------------------------------------------- 1 | ## Materials 2 | * [__Lecture slides__](https://docviewer.yandex.ru/?url=ya-disk-public%3A%2F%2FG3IXcG62RwNUGSSos%2BuGhtgXNfsBjP9RxUtUfgCffIk%3D%3A%2Flecture3.pdf&name=lecture3.pdf&c=58a61e4fdc8b) 3 | * [Video 1](https://www.youtube.com/watch?v=PnHCvfgC_ZA) by D.Silver (english) (same as last week) 4 | * [Video 2](https://www.youtube.com/watch?v=0g4j2k_Ggc4&t=43s) by D. Silver (english) 5 | * Our [lecture](https://yadi.sk/i/I7XcP6vU3ExNrT), [seminar](https://yadi.sk/i/XbqNQmjm3ExNsq) (russian) 6 | * Blog post on q-learning Vs SARSA - [url](https://studywolf.wordpress.com/2013/07/01/reinforcement-learning-sarsa-vs-q-learning/) 7 | 8 | ## More materials 9 | * Eligibility traces from Sutton's book - [url](http://incompleteideas.net/sutton/book/ebook/node72.html) 10 | 11 | 12 | ## Homework description 13 | 14 | Go to [the notebook](https://github.com/yandexdataschool/Practical_RL/blob/master/week3/homework3.ipynb) and follow instructions from there. 15 | 16 | You will have to modify a few .py files in the meantime (e.g. sarsa.py). 
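If the Q-learning vs SARSA comparison from the materials above feels abstract while you edit those files, here is a minimal illustrative sketch of the three tabular update rules (the dict-based Q-table and the `td_update` signature below are assumptions for illustration only, not the interface used in the assignment files). The methods differ only in which estimate of the next state's value they bootstrap from:

```python
from collections import defaultdict
import numpy as np

Q = defaultdict(float)  # hypothetical Q-table: Q[(state, action)] -> value estimate

def td_update(s, a, r, s_next, next_actions, a_next=None,
              alpha=0.5, gamma=0.99, eps=0.1, method="q_learning"):
    """One TD(0) update; `next_actions` lists the actions available in s_next."""
    if not next_actions:                    # terminal state: nothing to bootstrap from
        v_next = 0.0
    else:
        q_next = np.array([Q[(s_next, b)] for b in next_actions])
        if method == "q_learning":          # off-policy: value of the greedy action in s_next
            v_next = q_next.max()
        elif method == "sarsa":             # on-policy: value of the action actually taken in s_next
            v_next = Q[(s_next, a_next)]
        else:                               # expected value sarsa: expectation over the eps-greedy policy
            probs = np.full(len(next_actions), eps / len(next_actions))
            probs[q_next.argmax()] += 1.0 - eps
            v_next = float(probs @ q_next)
    Q[(s, a)] += alpha * (r + gamma * v_next - Q[(s, a)])
```

In short, SARSA learns the value of the epsilon-greedy policy it actually follows, while Q-learning learns the value of the greedy policy regardless of how it explores; expected value SARSA replaces SARSA's sampled next action with an expectation over the behaviour policy.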
17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # node and NPM 2 | npm-debug.log 3 | node_modules 4 | ..bfg-report 5 | 6 | # swap files 7 | *~ 8 | *.swp 9 | 10 | 11 | 12 | env.sh 13 | # Byte-compiled / optimized / DLL files 14 | __pycache__/ 15 | *.py[cod] 16 | 17 | # C extensions 18 | *.so 19 | 20 | # Distribution / packaging 21 | .Python 22 | env/ 23 | bin/ 24 | build/ 25 | develop-eggs/ 26 | dist/ 27 | eggs/ 28 | lib/ 29 | lib64/ 30 | parts/ 31 | sdist/ 32 | var/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg/ 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | 49 | # Translations 50 | *.mo 51 | 52 | # Mr Developer 53 | .mr.developer.cfg 54 | .project 55 | .pydevproject 56 | .idea 57 | .ipynb_checkpoints 58 | 59 | # Rope 60 | .ropeproject 61 | 62 | # Django stuff: 63 | *.log 64 | *.pot 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | docs/tmp* 69 | 70 | # OS X garbage 71 | .DS_Store 72 | 73 | # Debian things 74 | debian/reproducible-experiment-platform 75 | debian/files 76 | *.substvars 77 | *.debhelper.log 78 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Yandex School of Data Analysis and contributors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /week9/README.md: -------------------------------------------------------------------------------- 1 | [this week is still largely under construction] 2 | ## Exploration and exploitation 3 | * [__main__] David Silver lecture on exploration and expoitation - [video](https://www.youtube.com/watch?v=sGuiWX07sKw) 4 | * Alternative lecture by J. 
Schulman - [video](https://www.youtube.com/watch?v=SfCa1HQMkuw) 5 | * Our lecture (russian) - [slides](https://yadi.sk/i/JAeItALT3JmvCL), [video](https://yadi.sk/i/bVHmu9gt3Hi9Ym) 6 | * Our lecture on exploration with bayesian neural networks - [slides](https://yadi.sk/i/OANpkyFn3Jmv4J) 7 | 8 | ## More materials 9 | * "Deep" version: variational information maximizing exploration - [video](https://www.youtube.com/watch?v=sRIjxxjVrnY) 10 | * Same topics in russian - [video](https://yadi.sk/i/_2_0yqeW3HDbcn) 11 | * Lecture covering intrinsically motivated reinforcement learning - https://www.youtube.com/watch?v=aJI_9SoBDaQ 12 | * [Slides](https://yadi.sk/i/8sx42nau3HEYKg) 13 | * Same topics in russian - https://www.youtube.com/watch?v=WCE9hhPbCmc 14 | * Note: UCB-1 is not for bernoulli rewards, but for arbitrary r in [0,1], so you can just scale any reward to [0,1] to obtain a peace of mind. It's derived directly from Hoeffding's inequality. 15 | 16 | ## Seminar 17 | In this seminar, you'll be implementing Bayesian UCB and Thompson Sampling for a contextual bandit based on Bayesian Neural Networks. Everything's in the notebook! 18 | -------------------------------------------------------------------------------- /week0/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Materials 4 | * [__Lecture slides__](https://docviewer.yandex.ru/?url=ya-disk-public%3A%2F%2FG3IXcG62RwNUGSSos%2BuGhtgXNfsBjP9RxUtUfgCffIk%3D%3A%2Flecture0.pdf&name=lecture0.pdf&c=58a61e7da325) 5 | * __[main]__ Video-intro by David Silver (english) - https://www.youtube.com/watch?v=2pWv7GOvuf0 6 | * Optional lecture by David Silver (english) - https://www.youtube.com/watch?v=lfHX2hHRMVQ 7 | * Intro lecture (russian) - https://yadi.sk/i/bMo0qa-x3DoqkS 8 | * Intro seminar (russian) - https://yadi.sk/i/IBq2MjoS3DoqkY 9 | * Deep learning course (if you want to learn in parallel) - https://github.com/yandexdataschool/HSE_deeplearning 10 | 11 | ## Metaheuristics for optimization 12 | * __[recommended]__ - awesome openai post about evolution strategies - [blog post](https://blog.openai.com/evolution-strategies/), [article](https://arxiv.org/abs/1703.03864) 13 | * Guide to genetic algorithms (english) - https://www.youtube.com/watch?v=ejxfTy4lI6I 14 | * Another guide to genetic algorithm (english) - https://www.youtube.com/watch?v=zwYV11a__HQ 15 | * PDF on Differential evolution (english) - http://jvanderw.une.edu.au/DE_1.pdf 16 | * Video on Ant Colony Algorithm (english) - https://www.youtube.com/watch?v=D58nLNLkb0I 17 | * Longer video on Ant Colony Algorithm (english) - https://www.youtube.com/watch?v=xpyKmjJuqhk 18 | 19 | 20 | ## Homework description 21 | * Go to the [notebook](https://github.com/yandexdataschool/Practical_RL/blob/master/week0/frozenlake.ipynb) 22 | * You can find homework and bonus assignment descriptions at the end of that notebook. 23 | -------------------------------------------------------------------------------- /week2/assignment/environment.py: -------------------------------------------------------------------------------- 1 | # environment.py 2 | # -------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 
7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | #!/usr/bin/python 10 | 11 | class Environment: 12 | 13 | def getCurrentState(self): 14 | """ 15 | Returns the current state of the environment 16 | """ 17 | abstract 18 | 19 | def getPossibleActions(self, state): 20 | """ 21 | Returns possible actions the agent 22 | can take in the given state. Can 23 | return the empty list if we are in 24 | a terminal state. 25 | """ 26 | abstract 27 | 28 | def doAction(self, action): 29 | """ 30 | Performs the given action in the current 31 | environment state and updates the environment. 32 | 33 | Returns a (reward, nextState) pair 34 | """ 35 | abstract 36 | 37 | def reset(self): 38 | """ 39 | Resets the current state to the start state 40 | """ 41 | abstract 42 | 43 | def isTerminal(self): 44 | """ 45 | Has the environment entered a terminal 46 | state? This means there are no successors 47 | """ 48 | state = self.getCurrentState() 49 | actions = self.getPossibleActions(state) 50 | return len(actions) == 0 51 | -------------------------------------------------------------------------------- /week1/README.md: -------------------------------------------------------------------------------- 1 | ## Materials: 2 | * [__Lecture slides__](https://docviewer.yandex.ru/?url=ya-disk-public%3A%2F%2FG3IXcG62RwNUGSSos%2BuGhtgXNfsBjP9RxUtUfgCffIk%3D%3A%2Flecture1.pdf&name=lecture1.pdf&c=58a61ec9256c) 3 | * Lecture and seminar videos (russian) - [lecture](https://yadi.sk/i/5yf_4oGI3EDJhJ), [seminar](https://yadi.sk/i/dPsWYMK13EDJj7) _only covering crossentropy method_ 4 | * [__main__] Lecture by J. Schulman with crossentropy method explained (english) - https://www.youtube.com/watch?v=aUrX-rP_ss4&list=PLCTc_C7itk-GaAMxmlChrkPnGKtjz8hv1 5 | * [__main__] Sutton's definitive guide to monte-carlo methods - http://incompleteideas.net/sutton/book/ebook/node50.html 6 | * Article about CEM in general - https://people.smp.uq.edu.au/DirkKroese/ps/eormsCE.pdf 7 | * Article about CEM for optimization - https://people.smp.uq.edu.au/DirkKroese/ps/CEopt.pdf 8 | * Article about CEM in reinforcement learning - http://www.aaai.org/Papers/ICML/2003/ICML03-068.pdf 9 | 10 | ## Homework description 11 | * Just follow the [notebook](https://github.com/yandexdataschool/Practical_RL/blob/master/week1/crossentropy_method.ipynb) 12 | * During the `CartPole-v0` section (and similar envs), a window will pop up, displaying some game state. The window won't respond to direct input and instead changes each time you call env.render(). Don't force-close this process, just ignore it until you complete the notebook. 13 | * __important__ the current version of gym force-stops the environment after 200 steps even if you don't use env.monitor. 14 | * This may ruin CEM on MountainCar. To avoid this, use gym.make("MountainCar-v0").env 15 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | To simplify the installation process, you can deploy a container (~virtual machine) with all dependencies pre-installed. 2 | 3 | _tl;dr [dockerhub url](https://hub.docker.com/r/justheuristic/practical_rl/)_ 4 | 5 | ## Install Docker 6 | 7 | We recommend you use either native docker (recommended for linux) or kitematic (recommended for windows).
8 | * Installing [kitematic](https://kitematic.com/), a simple interface to docker (all platforms) 9 | * Pure docker: Guide for [windows](https://docs.docker.com/docker-for-windows/), [linux](https://docs.docker.com/engine/installation/), or [macOS](https://docs.docker.com/docker-for-mac/). 10 | 11 | Below are the instructions for both approaches. 12 | 13 | ## Kitematic 14 | Find justheuristic/practical_rl in the search menu. Download and launch the container. 15 | 16 | Click on the "web preview" screen in the top-right __or__ go to settings, ports and find the port at which your jupyter is located, usually 32***. 17 | 18 | ## Native 19 | `docker run -it -v <path-to-your-notebooks>:/notebooks -p <port>:8888 justheuristic/practical_rl sh ../run_jupyter.sh` 20 | 21 | `docker run -it -v /Users/mittov/Documents/shad/semester4/:/notebooks -p 8888:8888 justheuristic/practical_rl sh ../run_jupyter.sh` 22 | 23 | ## Manual 24 | Build the container 25 | 26 | `$ docker build -t rl .` 27 | 28 | 29 | Run it 30 | 31 | `$ docker run --rm -it -v <path-to-your-notebooks>:/notebooks -p <port>:8888 rl` 32 | 33 | examples: 34 | 35 | `$ docker run --rm -it -v /Users/mittov/Documents/shad/semester4/:/notebooks -p 8888:8888 rl` 36 | 37 | Copy the token from the console and open 38 | http://localhost:8888/?token=<token> 39 | -------------------------------------------------------------------------------- /week1/breakout.py: -------------------------------------------------------------------------------- 1 | """Auxiliary files for those who wanted to solve breakout with CEM or policy gradient""" 2 | import numpy as np 3 | import gym 4 | from scipy.misc import imresize 5 | from gym.core import Wrapper 6 | from gym.spaces.box import Box 7 | 8 | def make_breakout(): 9 | """creates breakout env with all preprocessing done for you""" 10 | return PreprocessAtari(gym.make("BreakoutDeterministic-v0")) 11 | 12 | class PreprocessAtari(Wrapper): 13 | def __init__(self,env,height=64,width=64, 14 | crop=lambda img: img[34:34+160],n_frames=4): 15 | """A gym wrapper that reshapes, crops and scales image into the desired shapes""" 16 | super(PreprocessAtari, self).__init__(env) 17 | self.img_size = (height,width) 18 | self.crop=crop 19 | self.observation_space = Box(0.0, 1.0, [n_frames,height,width]) 20 | self.framebuffer = np.zeros([n_frames,height,width]) 21 | def reset(self): 22 | """resets breakout, returns initial frames""" 23 | self.framebuffer = np.zeros_like(self.framebuffer) 24 | self.update_buffer(self.env.reset()) 25 | return self.framebuffer 26 | def step(self,action): 27 | """plays breakout for 1 step, returns 4-frame buffer""" 28 | new_img,r,done,info = self.env.step(action) 29 | self.update_buffer(new_img) 30 | return self.framebuffer,r,done,info 31 | 32 | ###image processing### 33 | 34 | def update_buffer(self,img): 35 | img = self.preproc_image(img) 36 | self.framebuffer = np.vstack([img[None], self.framebuffer[:-1]]) 37 | 38 | def preproc_image(self, img): 39 | """what happens to the observation""" 40 | img = self.crop(img) 41 | img = imresize(img, self.img_size).mean(-1) 42 | img = img.astype('float32')/255.
43 | return img 44 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM andrewosh/binder-base 2 | MAINTAINER Alexander Panin 3 | USER root 4 | 5 | RUN echo "deb http://archive.ubuntu.com/ubuntu trusty-backports main restricted universe multiverse" >> /etc/apt/sources.list 6 | RUN apt-get -qq update 7 | 8 | RUN apt-get install -y gcc g++ wget unzip 9 | RUN apt-get install -y libopenblas-dev liblapack-dev libsdl2-dev libboost-all-dev 10 | RUN apt-get install -y cmake zlib1g-dev libjpeg-dev 11 | RUN apt-get install -y xvfb libav-tools xorg-dev python-opengl 12 | RUN apt-get -y install swig3.0 13 | RUN ln -s /usr/bin/swig3.0 /usr/bin/swig 14 | 15 | 16 | USER main 17 | 18 | RUN pip install --upgrade pip 19 | RUN pip install --upgrade sklearn tqdm 20 | RUN pip install --upgrade gym[all] 21 | RUN pip install --upgrade https://github.com/Theano/Theano/archive/master.zip 22 | RUN pip install --upgrade https://github.com/Lasagne/Lasagne/archive/master.zip 23 | RUN pip install --upgrade https://github.com/yandexdataschool/AgentNet/archive/master.zip 24 | #RUN pip install --upgrade tensorflow 25 | RUN pip install --upgrade keras 26 | RUN pip install gym_pull 27 | RUN pip install ppaquette-gym-doom 28 | 29 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade pip 30 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade sklearn tqdm 31 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade gym[all] 32 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade https://github.com/Theano/Theano/archive/master.zip 33 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade https://github.com/Lasagne/Lasagne/archive/master.zip 34 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade https://github.com/yandexdataschool/AgentNet/archive/master.zip 35 | #RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade tensorflow 36 | RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade keras 37 | #TODO py3 doom once it's no longer broken 38 | -------------------------------------------------------------------------------- /week2/assignment/pacmanAgents.py: -------------------------------------------------------------------------------- 1 | # pacmanAgents.py 2 | # --------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 
7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | from pacman import Directions 10 | from game import Agent 11 | import random 12 | import game 13 | import util 14 | 15 | class LeftTurnAgent(game.Agent): 16 | "An agent that turns left at every opportunity" 17 | 18 | def getAction(self, state): 19 | legal = state.getLegalPacmanActions() 20 | current = state.getPacmanState().configuration.direction 21 | if current == Directions.STOP: current = Directions.NORTH 22 | left = Directions.LEFT[current] 23 | if left in legal: return left 24 | if current in legal: return current 25 | if Directions.RIGHT[current] in legal: return Directions.RIGHT[current] 26 | if Directions.LEFT[left] in legal: return Directions.LEFT[left] 27 | return Directions.STOP 28 | 29 | class GreedyAgent(Agent): 30 | def __init__(self, evalFn="scoreEvaluation"): 31 | self.evaluationFunction = util.lookup(evalFn, globals()) 32 | assert self.evaluationFunction != None 33 | 34 | def getAction(self, state): 35 | # Generate candidate actions 36 | legal = state.getLegalPacmanActions() 37 | if Directions.STOP in legal: legal.remove(Directions.STOP) 38 | 39 | successors = [(state.generateSuccessor(0, action), action) for action in legal] 40 | scored = [(self.evaluationFunction(state), action) for state, action in successors] 41 | bestScore = max(scored)[0] 42 | bestActions = [pair[1] for pair in scored if pair[0] == bestScore] 43 | return random.choice(bestActions) 44 | 45 | def scoreEvaluation(state): 46 | return state.getScore() -------------------------------------------------------------------------------- /week2/assignment/analysis.py: -------------------------------------------------------------------------------- 1 | # analysis.py 2 | # ----------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | ###################### 10 | # ANALYSIS QUESTIONS # 11 | ###################### 12 | 13 | # Change these default values to obtain the specified policies through 14 | # value iteration. 
15 | 16 | def question2a(): 17 | answerDiscount = 0.9 18 | answerNoise = 0.2 19 | answerLivingReward = 0.0 20 | return answerDiscount, answerNoise, answerLivingReward 21 | # If not possible, return 'NOT POSSIBLE' 22 | 23 | def question2b(): 24 | answerDiscount = 0.9 25 | answerNoise = 0.2 26 | answerLivingReward = 0.0 27 | return answerDiscount, answerNoise, answerLivingReward 28 | # If not possible, return 'NOT POSSIBLE' 29 | 30 | def question2c(): 31 | answerDiscount = 0.9 32 | answerNoise = 0.2 33 | answerLivingReward = 0.0 34 | return answerDiscount, answerNoise, answerLivingReward 35 | # If not possible, return 'NOT POSSIBLE' 36 | 37 | def question2d(): 38 | answerDiscount = 0.9 39 | answerNoise = 0.2 40 | answerLivingReward = 0.0 41 | return answerDiscount, answerNoise, answerLivingReward 42 | # If not possible, return 'NOT POSSIBLE' 43 | 44 | def question2e(): 45 | answerDiscount = 0.9 46 | answerNoise = 0.2 47 | answerLivingReward = 0.0 48 | return answerDiscount, answerNoise, answerLivingReward 49 | # If not possible, return 'NOT POSSIBLE' 50 | 51 | if __name__ == '__main__': 52 | print 'Answers to analysis questions:' 53 | import analysis 54 | for q in [q for q in dir(analysis) if q.startswith('question')]: 55 | response = getattr(analysis, q)() 56 | print ' Question %s:\t%s' % (q, str(response)) 57 | -------------------------------------------------------------------------------- /week2/assignment/textDisplay.py: -------------------------------------------------------------------------------- 1 | # textDisplay.py 2 | # -------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 
7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | import pacman, time 10 | 11 | DRAW_EVERY = 1 12 | SLEEP_TIME = 0 # This can be overwritten by __init__ 13 | DISPLAY_MOVES = False 14 | QUIET = False # Supresses output 15 | 16 | class NullGraphics: 17 | def initialize(self, state, isBlue = False): 18 | pass 19 | 20 | def update(self, state): 21 | pass 22 | 23 | def pause(self): 24 | time.sleep(SLEEP_TIME) 25 | 26 | def draw(self, state): 27 | print state 28 | 29 | def finish(self): 30 | pass 31 | 32 | class PacmanGraphics: 33 | def __init__(self, speed=None): 34 | if speed != None: 35 | global SLEEP_TIME 36 | SLEEP_TIME = speed 37 | 38 | def initialize(self, state, isBlue = False): 39 | self.draw(state) 40 | self.pause() 41 | self.turn = 0 42 | self.agentCounter = 0 43 | 44 | def update(self, state): 45 | numAgents = len(state.agentStates) 46 | self.agentCounter = (self.agentCounter + 1) % numAgents 47 | if self.agentCounter == 0: 48 | self.turn += 1 49 | if DISPLAY_MOVES: 50 | ghosts = [pacman.nearestPoint(state.getGhostPosition(i)) for i in range(1, numAgents)] 51 | print "%4d) P: %-8s" % (self.turn, str(pacman.nearestPoint(state.getPacmanPosition()))),'| Score: %-5d' % state.score,'| Ghosts:', ghosts 52 | if self.turn % DRAW_EVERY == 0: 53 | self.draw(state) 54 | self.pause() 55 | if state._win or state._lose: 56 | self.draw(state) 57 | 58 | def pause(self): 59 | time.sleep(SLEEP_TIME) 60 | 61 | def draw(self, state): 62 | print state 63 | 64 | def finish(self): 65 | pass 66 | -------------------------------------------------------------------------------- /week2/assignment/mdp.py: -------------------------------------------------------------------------------- 1 | # mdp.py 2 | # ------ 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | import random 10 | 11 | class MarkovDecisionProcess: 12 | 13 | def getStates(self): 14 | """ 15 | Return a list of all states in the MDP. 16 | Not generally possible for large MDPs. 17 | """ 18 | abstract 19 | 20 | def getStartState(self): 21 | """ 22 | Return the start state of the MDP. 23 | """ 24 | abstract 25 | 26 | def getPossibleActions(self, state): 27 | """ 28 | Return list of possible actions from 'state'. 29 | """ 30 | abstract 31 | 32 | def getTransitionStatesAndProbs(self, state, action): 33 | """ 34 | Returns list of (nextState, prob) pairs 35 | representing the states reachable 36 | from 'state' by taking 'action' along 37 | with their transition probabilities. 38 | 39 | Note that in Q-Learning and reinforcment 40 | learning in general, we do not know these 41 | probabilities nor do we directly model them. 42 | """ 43 | abstract 44 | 45 | def getReward(self, state, action, nextState): 46 | """ 47 | Get the reward for the state, action, nextState transition. 48 | 49 | Not available in reinforcement learning. 50 | """ 51 | abstract 52 | 53 | def isTerminal(self, state): 54 | """ 55 | Returns true if the current state is a terminal state. By convention, 56 | a terminal state has zero future rewards. Sometimes the terminal state(s) 57 | may have no possible actions. 
It is also common to think of the terminal 58 | state as having a self-loop action 'pass' with zero reward; the formulations 59 | are equivalent. 60 | """ 61 | abstract 62 | 63 | 64 | -------------------------------------------------------------------------------- /week6.5/README.md: -------------------------------------------------------------------------------- 1 | ## Materials 2 | * [Slides](https://yadi.sk/i/-Iqdhg483GDyoN) 3 | * CS231 lecture on RNNs - https://www.youtube.com/watch?v=iX5V1WpxxkY 4 | * Our [lecture](https://yadi.sk/i/XHmT5hO53GcCKV), [seminar](https://yadi.sk/i/19twHESN3GcGKQ) 5 | * [alternative] Brief lecture on RNN by nervana - https://www.youtube.com/watch?v=Ukgii7Yd_cU 6 | * [alternative] More detailed lecture by Y. Bengio - https://www.youtube.com/watch?v=xK-bzjIQkmM 7 | * Great reading by Karpathy - http://karpathy.github.io/2015/05/21/rnn-effectiveness/ 8 | * LSTM explained in detail by colah - http://colah.github.io/posts/2015-08-Understanding-LSTMs/ 9 | 10 | ## More materials 11 | * Seq2seq lecture - https://www.youtube.com/watch?v=G5RY_SUJih4 12 | * "Awesome rnn" entry point - https://github.com/kjw0612/awesome-rnn 13 | * OpenAI research on sentiment analysis that sheds some light on what's inside LSTM language model. 14 | 15 | # Homework description 16 | 17 | You guessed, two options 18 | 19 | ### Lasagne option 20 | 21 | Follow the [first notebook](https://github.com/yandexdataschool/Practical_RL/blob/master/week6.5/char_rnn.ipynb) and implement a simple character-level RNN with pure lasagne. The homework part __(4 points)__ is at the very end of that notebook. 22 | 23 | Proceed with [seq2seq](https://github.com/yandexdataschool/Practical_RL/blob/master/week6.5/seq2seq.ipynb) notebook for the second part of homework assignment __(6 points)__. 24 | 25 | ### Alternative 26 | 27 | In this assignment, you will need to implement two things __(pts are same)__: 28 | * A generative RNN model for one of datasets below or for your custom dataset (anything from clickbait to pokemon names) 29 | * A conditional generative model for either [formula]->[common_name] task for molecules dataset below or image captioning [or similar custom dataset]. 30 | 31 | Some helper materials: 32 | * CS231 rnn [assignment](http://cs231n.github.io/assignments2016/assignment3/) 33 | * "Deep models for text and sequences" section of [this course](https://www.udacity.com/course/deep-learning--ud730) 34 | 35 | 36 | ### Datasets 37 | - Names: https://github.com/yandexdataschool/HSE_deeplearning/blob/master/week4/names 38 | - Molecules: https://yadi.sk/d/sYZnG5hK33ktL4 39 | - Questions: https://yadi.sk/d/Dn68_NFx3GBSc8 40 | 41 | -------------------------------------------------------------------------------- /week6/README.md: -------------------------------------------------------------------------------- 1 | ## Materials 2 | * [Slides](https://docviewer.yandex.ru/?url=ya-disk-public%3A%2F%2FG3IXcG62RwNUGSSos%2BuGhtgXNfsBjP9RxUtUfgCffIk%3D%3A%2Flecture6.pdf&name=lecture6.pdf&c=58c876c4863a) 3 | * Video lecture by D. Silver - https://www.youtube.com/watch?v=KHZVXao4qXs 4 | * Our [lecture](https://yadi.sk/i/I3M09HKQ3GKBiP), [seminar](https://yadi.sk/i/8f9NX_E73GKBkT) 5 | * Alternative lecture by J. Schulman part 1 - https://www.youtube.com/watch?v=BB-BhTn6DCM 6 | * Alternative lecture by J. 
Schulman part 2 - https://www.youtube.com/watch?v=Wnl-Qh2UHGg 7 | 8 | 9 | ## More materials 10 | * Generalizing log-derivative trick - http://blog.shakirm.com/2015/11/machine-learning-trick-of-the-day-5-log-derivative-trick/ 11 | * Combining policy gradient and q-learning - https://arxiv.org/abs/1611.01626 12 | * Bayesian perspective on why reparameterization & logderivative tricks matter (Vetrov's take) - https://www.sdsj.ru/slides/Vetrov.pdf 13 | 14 | 15 | ## Homework 16 | 17 | As usual, "lasagne way" and "other way" 18 | 19 | #### Lasagne way 20 | 21 | First go to the Seminar6.0 notebook and implement a vanilla REINFORCE algorithm from scratch. Follow up by playing with advantage actor-critic in Seminar 6.1 - just follow the steps you'll find in the notebook. 22 | 23 | #### Other way 24 | 25 | This week's task is to implement REINFORCE on any continuous state space env (simplest being CartPole-v0) and advantage actor-critic on LunarLander-v2. 26 | 27 | You will find some helpful materials here: 28 | * Tensorflow similar assignment: [cs294 assignment 4](https://github.com/berkeleydeeprlcourse/homework/blob/master/hw4/homework.md) 29 | 30 | 31 | _[copy-pasted section]_ 32 | 33 | We recommend you upload your results to OpenAI gym and fit your solution in a notebook (ipython/torch/r) unless your framework is incompatible with that. In the latter case, please supply us with some notes on what code lies where. 34 | 35 | Again, we recommend you read the lasagne/agentnet assignments briefly to get a grasp of what parameters to start from. 36 | 37 | Bonus assignments remain exactly the same as in the first track. 38 | 39 | Blindly copy-pasting code from any publicly available demos will result in us interrogating you about every significant line of code to make sure you at least understand (and regret) what you copy-pasted. 40 | 41 | 42 | -------------------------------------------------------------------------------- /week7/rockpaperscissors.py: -------------------------------------------------------------------------------- 1 | """ 2 | Toy game for explaining how to work with POMDPs 3 | """ 4 | import gym 5 | from gym import spaces 6 | from gym.utils import seeding 7 | import numpy as np 8 | 9 | class RockPaperScissors(gym.Env): 10 | """ 11 | Rock-paper-scissors game against an imperfect adversary. 12 | Your opponent operates in sequences of 3-7 actions. 13 | There are 5 such pre-defined sequences. 14 | Once the enemy finishes its current sequence, it picks the next one at random from the 5 pre-defined sequences. 15 | 16 | Your observation is the enemy's last turn: 17 | - [1,0,0] for rock 18 | - [0,1,0] for paper 19 | - [0,0,1] for scissors 20 | 21 | This game is a toy environment to play with recurrent networks in RL.
22 | """ 23 | #codes of rock, papes and scissors respectively 24 | codes = np.eye(3) 25 | 26 | #list of possible sequences 27 | sequences = ( 28 | (0,1,2,0,1,2), 29 | (1,0,0,1,1), 30 | (2,2,2), 31 | (2,2,1,1,0,0), 32 | (0,0,1,2,1,0,0) 33 | ) 34 | #reward for [i-th] action against [j-th] enemy reaction 35 | reward = ( 36 | # r p s 37 | ( 0, -1, 1), #r 38 | ( 1, 0,-1), #p 39 | (-1, 1, 0), #s 40 | ) 41 | 42 | def __init__(self): 43 | self.action_space = spaces.Discrete(3) 44 | self.observation_space = spaces.Box(0,1,3) 45 | self.reset() 46 | 47 | def get_observation(self): 48 | return self.codes[self.current_sequence[self.current_position]] 49 | 50 | def new_sequence(self): 51 | self.current_sequence = np.random.choice(self.sequences) 52 | self.current_position = 0 53 | 54 | ###public methods 55 | def reset(self): 56 | self.new_sequence() 57 | return self.get_observation() 58 | 59 | def step(self, action): 60 | assert self.action_space.contains(action) 61 | 62 | self.current_position+=1 63 | if self.current_position >= len(self.current_sequence): 64 | self.new_sequence() 65 | 66 | enemy_action = self.current_sequence[self.current_position] 67 | reward = self.reward[action][enemy_action] 68 | return self.get_observation(), reward, False, {} 69 | 70 | def render(*args,**kwargs): 71 | return 0 72 | 73 | -------------------------------------------------------------------------------- /youtube_dl_lectures.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #this script utilizes https://github.com/rg3/youtube-dl/ to download lecture materials in the respective folders. 3 | #you can install youtube-dl via `pip install --upgrade youtube-dl` if you don't have it already. 4 | 5 | #WARNING! the full script downloads gigabytes of mp4! 
6 | 7 | #week0 8 | youtube-dl https://www.youtube.com/watch?v=2pWv7GOvuf0 --output week0/Lecture1_Silver.mp4 9 | youtube-dl https://www.youtube.com/watch?v=lfHX2hHRMVQ --output week0/Lecture2_Silver_optional.mp4 10 | 11 | #week1 12 | youtube-dl https://www.youtube.com/watch?v=aUrX-rP_ss4 --output week1/Lecture_Schulman.mp4 13 | 14 | #week2 15 | youtube-dl https://www.youtube.com/watch?v=PnHCvfgC_ZA --output week2/Lecture_Silver.mp4 16 | youtube-dl https://www.youtube.com/watch?v=ifma8G7LegE --output week2/Alternative_lecture_Abbeel.mp4 17 | youtube-dl https://www.youtube.com/watch?v=IL3gVyJMmhg --output week2/Alternative_lecture_Schulman.mp4 18 | 19 | #week3 20 | youtube-dl https://www.youtube.com/watch?v=0g4j2k_Ggc4 --output week3/Lecture_Silver.mp4 21 | 22 | #week3.5 23 | youtube-dl https://www.youtube.com/watch?v=uXt8qF2Zzfo --output week3.5/Lecture_basics.mp4 24 | youtube-dl https://www.youtube.com/watch?v=FmpDIaiMIeA --output week3.5/Lecture_convnets.mp4 25 | youtube-dl https://www.youtube.com/watch?v=OU8I1oJ9HhI --output week3.5/Tutorial_theano.mp4 26 | 27 | #week4 28 | youtube-dl https://www.youtube.com/watch?v=UoPei5o4fps --output week4/Lecture_Silver.mp4 29 | 30 | #week5 31 | youtube-dl https://www.youtube.com/watch?v=h1-pj4Y9-kM --output week5/Lecture_Schulman.mp4 32 | 33 | #week6 34 | youtube-dl https://www.youtube.com/watch?v=KHZVXao4qXs --output week6/Lecture_Silver.mp4 35 | youtube-dl https://www.youtube.com/watch?v=BB-BhTn6DCM --output week6/Alternative_lecture_Schulman_part1.mp4 36 | youtube-dl https://www.youtube.com/watch?v=Wnl-Qh2UHGg --output week6/Alternative_lecture_Schulman_part2.mp4 37 | 38 | #week6.5 39 | youtube-dl https://www.youtube.com/watch?v=iX5V1WpxxkY --output week6.5/Lecture_cs231.mp4 40 | youtube-dl https://www.youtube.com/watch?v=Ukgii7Yd_cU --output week6.5/Alternative_lecture_nervana.mp4 41 | youtube-dl https://www.youtube.com/watch?v=xK-bzjIQkmM --output week6.5/Alternative_lecture_Bengio.mp4 42 | youtube-dl https://www.youtube.com/watch?v=G5RY_SUJih4 --output week6.5/Bonus_lecture_seq2seq.mp4 43 | 44 | 45 | -------------------------------------------------------------------------------- /week2/homework_tips.md: -------------------------------------------------------------------------------- 1 | ### __Pacman features__ 2 | 3 | Try to solve larger grids for pacman setup. 4 | * python pacman.py -p PacmanQAgent -x N_TRAIN_GAMES -n N_TOTAL_GAMES -l __mediumGrid__ 5 | * python pacman.py -p PacmanQAgent -x N_TRAIN_GAMES -n N_TOTAL_GAMES -l __mediumClassic__ 6 | 7 | Even if you adjust N_TRAIN_GAMES to 10^5 and N_TOTAL_GAMES to 10^5+100 (100 last games are for test), pacman won't solve those environments 8 | 9 | The problem with those environments is that they have a large amount of unique states. 
However, you can devise a smaller environment state by choosing different observation parameters, e.g.: 10 | * distance and direction to nearest ghost 11 | * where is nearest food 12 | * 'center of mass' of all food points (and variance, and whatever) 13 | * is there a wall in each direction 14 | * and anything else you see fit 15 | 16 | Here's how to get this information from [state](https://github.com/yandexdataschool/Practical_RL/blob/master/week2/assignment/pacman.py#L49), 17 | * Get pacman position: [state.getPacmanPosition()](https://github.com/yandexdataschool/Practical_RL/blob/master/week2/assignment/pacman.py#L128) 18 | * Is there a wall at (x,y)?: [state.hasWall(x,y)](https://github.com/yandexdataschool/Practical_RL/blob/master/week2/assignment/pacman.py#L189) 19 | * Get ghost positions: [state.getGhostPositions()](https://github.com/yandexdataschool/Practical_RL/blob/master/week2/assignment/pacman.py#L144) 20 | * Get all food positions: [state.getCapsules()](https://github.com/yandexdataschool/Practical_RL/blob/master/week2/assignment/pacman.py#L153) 21 | 22 | You can call those methods anywhere you see state. 23 | * e.g. in [agent.getValue(state)](https://github.com/yandexdataschool/Practical_RL/blob/master/week2/assignment/qlearningAgents.py#L52) 24 | * Defining a function that extracts all features and calling it in [getQValue](https://github.com/yandexdataschool/Practical_RL/blob/master/week2/assignment/qlearningAgents.py#L38) and [setQValue](https://github.com/yandexdataschool/Practical_RL/blob/master/week2/assignment/qlearningAgents.py#L44) is probably enough. 25 | * You can also change agent parameters. The simplest way is to hard-code them in [PacmanQAgent](https://github.com/yandexdataschool/Practical_RL/blob/master/week2/assignment/qlearningAgents.py#L140) 26 | 27 | Also, don't forget to optimize ```learning_rate```, ```discount``` and ```epsilon``` params of model, this may also help to solve this env. 28 | -------------------------------------------------------------------------------- /week8/README.md: -------------------------------------------------------------------------------- 1 | ## Materials 2 | * [Slides](https://yadi.sk/i/7TkZUDkt3GoPXE) 3 | * Our [lecture](https://yadi.sk/i/-U5w4NpJ3H5TWD), [seminar](https://yadi.sk/i/W3N7-6is3H5TWN) 4 | * The only relevant video-lecture we could find - [video](https://www.youtube.com/watch?v=2tKNpzUvDc4 ) 5 | * Will hopefully record our lecture in english soon! 6 | * Self-critical sequence traning [original article](https://arxiv.org/abs/1612.00563) 7 | 8 | 9 | ## More materials 10 | * An [awesome post](http://distill.pub/2016/augmented-rnns/) explaining attention and long-term memory models. 11 | * [BLEU](http://www.aclweb.org/anthology/P02-1040.pdf) and [CIDEr](https://arxiv.org/pdf/1411.5726.pdf) articles. 12 | * Image captioning 13 | * MSCOCO captioning [challenge](http://mscoco.org/dataset/#captions-challenge2015) 14 | * Captioning baseline [notebook](https://github.com/yandexdataschool/HSE_deeplearning/blob/master/week7/captioning_solution_ars.ipynb) 15 | * Other articles on reinforcement learning for natural language: 16 | * [task-oriented conversation system](https://arxiv.org/abs/1703.07055) 17 | * [generating dialogues](https://arxiv.org/abs/1606.01541) 18 | * [sequential adversarial networks](https://arxiv.org/abs/1609.05473) (a.k.a. 
SeqGAN) 19 | * A large overview for machine translation (touching on RL, including RL failures) - [article](https://arxiv.org/abs/1609.08144) 20 | * How _not_ to evaluate conversation models - [article](https://arxiv.org/abs/1603.08023) 21 | * Overview of other non-games applications ("that article again") - https://arxiv.org/abs/1701.07274 22 | 23 | ## Homework 24 | 25 | Homework assignment is described in the [main notebook](https://github.com/yandexdataschool/Practical_RL/blob/master/week8/8.1_translation_scst.ipynb). 26 | 27 | It's kinda lengthy, but fear not, that's mostly due to formatting. 28 | 29 | __Other frameworks__: as usual, your task remains the same as in the main track: 30 | - Implement or borrow seq2seq model for the same translation task 31 | * Neat tenworflow [repo](https://github.com/cmusphinx/g2p-seq2seq) 32 | * __Important__ - this repo uses simplified phoneme dict - make sure you change preprocessing phase to meaningfully compare results. 33 | - Implement self-critical sequence training ( = basic policy gradient with a special baseline, see notebook) 34 | - Beat the baseline (main notebook: step6) 35 | 36 | Even if you decide to use custom frameworks, it is highly recommended that you reuse evaluation code (e.g. min Levenshtein) from the main notebook to avoid confusion. 37 | 38 | -------------------------------------------------------------------------------- /week3.5/mnist.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import time 4 | 5 | import numpy as np 6 | 7 | __doc__="""taken from https://github.com/Lasagne/Lasagne/blob/master/examples/mnist.py""" 8 | 9 | def load_dataset(): 10 | # We first define a download function, supporting both Python 2 and 3. 11 | if sys.version_info[0] == 2: 12 | from urllib import urlretrieve 13 | else: 14 | from urllib.request import urlretrieve 15 | 16 | def download(filename, source='http://yann.lecun.com/exdb/mnist/'): 17 | print("Downloading %s" % filename) 18 | urlretrieve(source + filename, filename) 19 | 20 | # We then define functions for loading MNIST images and labels. 21 | # For convenience, they also download the requested files if needed. 22 | import gzip 23 | 24 | def load_mnist_images(filename): 25 | if not os.path.exists(filename): 26 | download(filename) 27 | # Read the inputs in Yann LeCun's binary format. 28 | with gzip.open(filename, 'rb') as f: 29 | data = np.frombuffer(f.read(), np.uint8, offset=16) 30 | # The inputs are vectors now, we reshape them to monochrome 2D images, 31 | # following the shape convention: (examples, channels, rows, columns) 32 | data = data.reshape(-1, 1, 28, 28) 33 | # The inputs come as bytes, we convert them to float32 in range [0,1]. 34 | # (Actually to range [0, 255/256], for compatibility to the version 35 | # provided at http://deeplearning.net/data/mnist/mnist.pkl.gz.) 36 | return data / np.float32(256) 37 | 38 | def load_mnist_labels(filename): 39 | if not os.path.exists(filename): 40 | download(filename) 41 | # Read the labels in Yann LeCun's binary format. 42 | with gzip.open(filename, 'rb') as f: 43 | data = np.frombuffer(f.read(), np.uint8, offset=8) 44 | # The labels are vectors of integers now, that's exactly what we want. 45 | return data 46 | 47 | # We can now download and read the training and test set images and labels. 
48 | X_train = load_mnist_images('train-images-idx3-ubyte.gz') 49 | y_train = load_mnist_labels('train-labels-idx1-ubyte.gz') 50 | X_test = load_mnist_images('t10k-images-idx3-ubyte.gz') 51 | y_test = load_mnist_labels('t10k-labels-idx1-ubyte.gz') 52 | 53 | # We reserve the last 10000 training examples for validation. 54 | X_train, X_val = X_train[:-10000], X_train[-10000:] 55 | y_train, y_val = y_train[:-10000], y_train[-10000:] 56 | 57 | # We just return all the arrays in order, as expected in main(). 58 | # (It doesn't matter how we do this as long as we can read them again.) 59 | return X_train, y_train, X_val, y_val, X_test, y_test 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /week3.5/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Materials 3 | * [__Lecture slides__](https://yadi.sk/i/yAO2AJ3M3EKP8g) 4 | * Lecture on deep learning (russian) - https://www.youtube.com/watch?v=8008XQzoUEs 5 | * Seminar on theano (russian) - https://yadi.sk/i/54STsEBVpubkn 6 | * Intro to neural nets and backprop (english) - https://www.youtube.com/watch?v=uXt8qF2Zzfo 7 | * Intro to convnets (english) - https://www.youtube.com/watch?v=FmpDIaiMIeA 8 | * Theano tutorial from Lamblin (english) - https://www.youtube.com/watch?v=OU8I1oJ9HhI 9 | 10 | ## Bonus materials 11 | * Karpathy's course on deep learning (english) - http://cs231n.github.io/ 12 | * Nuts and Bolts of deep learning by Andrew Ng (english) - https://www.youtube.com/watch?v=F1ka6a13S9I 13 | * Deep learning demystified - https://www.youtube.com/watch?v=Q9Z20HCPnww 14 | * Karpathy's lecture on deep learning for computer vision - https://www.youtube.com/watch?v=u6aEYuemt0M 15 | * Our humble DL course: [HSE'autumn16](https://github.com/yandexdataschool/HSE_deeplearning), [Skoltech/YSDA'spring16](https://github.com/ddtm/dl-course/) courses on deep learning (english). 16 | * Srsly, just google `"deep learning %s"%s for s in what_you_want_to_know`. 17 | 18 | ## Homework 19 | 20 | If you are already familiar with lasagne or you are super-good with tensorflow/pytorch/similar, pick one of the _alternative_ options. Otherwise we highly recommend the first one as we'll need convolutional networks soon enough. 21 | 22 | * [__recommended__](https://github.com/yandexdataschool/Practical_RL/blob/master/week3.5/Seminar3.5-en-mnist.ipynb) go to Seminar3.5-*-mnist.ipynb and follow the instructions (ends with lasagne MNIST classifier) 23 | 24 | 25 | * [__alternative task__](https://github.com/yandexdataschool/Practical_RL/blob/master/week3.5/Seminar3.5-approx-qlearning.ipynb) go to Seminar3.5-approx-q-learning.ipynb and follow the instructions (ends with simple NN for q-learning) 26 | 27 | * [__alternative frameworks__] 28 | The equivalent of recommended track would be 29 | * [tensorflow] learning through this [google course](https://www.udacity.com/course/deep-learning--ud730) from start till "Convolutional neural networks" (inclusive). 30 | * [manual/other] surviving past assignment2 of [cs231](http://cs231n.github.io/) 31 | 32 | * [__alternative task and frameworks__] 33 | Implement the simple q-learning network that solves `CartPole-v0`. You're not required to implement experience replay / any advanced stuff, just set sgd learning rate to a small enough number (10^-4) and pray that trains smoothly. 
34 | 35 | Here's a convenient translation to tensorflow: [notebook](https://github.com/yandexdataschool/Practical_RL/blob/master/week3.5/Seminar3.5-approx-qlearning-tf.ipynb) 36 | 37 | Agent can maintain low reward for long enough, but it should at least show some progress by the end of the default loop. 38 | 39 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | LABEL maintainer "Alexander Panin , Dmitry Mittov " 3 | 4 | 5 | RUN echo "deb http://archive.ubuntu.com/ubuntu trusty-backports main restricted universe multiverse" >> /etc/apt/sources.list && \ 6 | apt-get -qq update && \ 7 | apt-get install -y cmake \ 8 | wget \ 9 | unzip \ 10 | git \ 11 | zlib1g-dev \ 12 | libjpeg-dev \ 13 | xvfb \ 14 | libav-tools \ 15 | xorg-dev \ 16 | python-opengl \ 17 | swig3.0 \ 18 | python-dev \ 19 | python3-dev \ 20 | python-pip \ 21 | python3-pip \ 22 | libopenblas-dev \ 23 | liblapack-dev \ 24 | libsdl2-dev \ 25 | libboost-all-dev \ 26 | gcc \ 27 | g++ && \ 28 | ln -s /usr/bin/swig3.0 /usr/bin/swig 29 | 30 | RUN pip install --upgrade pip \ 31 | scipy \ 32 | numpy && \ 33 | pip install --upgrade sklearn \ 34 | jupyter \ 35 | tqdm \ 36 | gym[all] \ 37 | matplotlib \ 38 | seaborn && \ 39 | pip install --upgrade https://github.com/Theano/Theano/archive/master.zip \ 40 | https://github.com/Lasagne/Lasagne/archive/master.zip \ 41 | https://github.com/yandexdataschool/AgentNet/archive/master.zip \ 42 | tensorflow \ 43 | keras 44 | 45 | RUN pip install --upgrade gym_pull ppaquette-gym-doom 46 | 47 | 48 | RUN pip3 install --upgrade pip \ 49 | scipy \ 50 | numpy && \ 51 | pip3 install --upgrade sklearn \ 52 | jupyter \ 53 | tqdm \ 54 | gym[all] \ 55 | matplotlib \ 56 | seaborn && \ 57 | pip3 install --upgrade https://github.com/Theano/Theano/archive/master.zip \ 58 | https://github.com/Lasagne/Lasagne/archive/master.zip \ 59 | https://github.com/yandexdataschool/AgentNet/archive/master.zip \ 60 | tensorflow \ 61 | keras && \ 62 | python3 -m ipykernel.kernelspec 63 | 64 | 65 | EXPOSE 8888 66 | VOLUME /notebooks 67 | WORKDIR /notebooks 68 | 69 | COPY run_jupyter.sh / 70 | CMD ["/run_jupyter.sh"] 71 | -------------------------------------------------------------------------------- /week7/README.md: -------------------------------------------------------------------------------- 1 | # Materials 2 | [lecture slides](https://yadi.sk/d/RGx8BUCr3Gq6DC) 3 | 4 | _Links on all articles mentioned during the lecture could be found in "References" at the very end of the presentation slides. All other interesing links which contribute to the topic of POMDP are presented below_ 5 | 6 | ## Basics 7 | * Our [lecture](https://yadi.sk/i/pMdw-_uI3Gke7Z) and [seminar](https://yadi.sk/i/s1EEuEVd3Gke8k) (russian) 8 | * A lecture on basics by Andrew NG (english, LQ) - [video](https://www.youtube.com/watch?v=yCqPMD6coO8) 9 | * A lecture on lecture by 5vision (russian) - [video](https://www.youtube.com/watch?v=_dkaynuKUFE) 10 | * _[alternative]_ Chalkboard-style 2-part lecture by B. Ravindran. - [part1](https://www.youtube.com/watch?v=9G_KevA8DFY), [part2](https://www.youtube.com/watch?v=dMOUp7YzUpQ) 11 | * _[alternative]_ Yet another mini-lecture touching on POMDP by S.S. 
Baveja - [video](https://www.youtube.com/watch?v=SE56KgF7aVc) 12 | 13 | ## POMDP Learning 14 | * DRQN lecture by Fritz448 (russian) - [video](https://www.youtube.com/watch?v=bE5DIJvZexc) 15 | * [Data-efficient learning in continuous POMDPs](https://arxiv.org/abs/1602.02523v1) 16 | * [Managing wind farms with bayesian POMDP](http://ascelibrary.org/doi/abs/10.1061/(ASCE)CP.1943-5487.0000390) 17 | * [Bayesian learning and decision-making in dynamic environments](http://www.jmlr.org/papers/volume12/ross11a/ross11a.pdf) 18 | 19 | ## POMDP Planning 20 | * [Introduction to planning in POMDP, ch.: 6](https://www.amazon.com/Decision-Making-Under-Uncertainty-Application/dp/0262029251) 21 | * [Bayes filters in robotics, ch.: 3, 4](https://docs.ufpr.br/~danielsantos/ProbabilisticRobotics.pdf) 22 | * SOTA in scalable approximate __offline__ planning: [SARSOP](http://www.roboticsproceedings.org/rss04/p9.pdf) and [PLEASE](http://www.aaai.org/ocs/index.php/SOCS/SOCS15/paper/viewFile/10686/10627), which is built on top of the former 23 | * SOTA in scalable approximate __online__ planning: [DESPOT](https://arxiv.org/pdf/1609.03250v1.pdf) 24 | * Not SOTA, but a very useful and enlightening __online__ planning approach: [POMCP](https://papers.nips.cc/paper/4031-monte-carlo-planning-in-large-pomdps.pdf) 25 | * [Implementations of SARSOP, DESPOT and MCVI in C++](http://bigbird.comp.nus.edu.sg/pmwiki/farm/appl/) 26 | * Recent approaches combining POMDP planning with learning on top of neural networks: [Predictron](https://openreview.net/pdf?id=BkJsCIcgl), [histogram filter](https://openreview.net/pdf?id=ByvJuTigl) and [QMDP-Net](https://arxiv.org/pdf/1703.06692.pdf) 27 | 28 | 29 | 30 | 31 | --- 32 | 33 | # Homework 34 | 35 | We have a detailed description of the entire lab in the [homework notebook](https://github.com/yandexdataschool/Practical_RL/blob/master/week7/7.3_homework.ipynb) 36 | 37 | The homework is platform- and framework-independent, so pick whichever tools suit you best, but mind how much you will have to implement yourself if you go for non-standard ones. 38 | -------------------------------------------------------------------------------- /week2/assignment/keyboardAgents.py: -------------------------------------------------------------------------------- 1 | # keyboardAgents.py 2 | # ----------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | from game import Agent 10 | from game import Directions 11 | import random 12 | 13 | class KeyboardAgent(Agent): 14 | """ 15 | An agent controlled by the keyboard. 16 | """ 17 | # NOTE: Arrow keys also work.
18 | WEST_KEY = 'a' 19 | EAST_KEY = 'd' 20 | NORTH_KEY = 'w' 21 | SOUTH_KEY = 's' 22 | STOP_KEY = 'q' 23 | 24 | def __init__( self, index = 0 ): 25 | 26 | self.lastMove = Directions.STOP 27 | self.index = index 28 | self.keys = [] 29 | 30 | def getAction( self, state): 31 | from graphicsUtils import keys_waiting 32 | from graphicsUtils import keys_pressed 33 | keys = keys_waiting() + keys_pressed() 34 | if keys != []: 35 | self.keys = keys 36 | 37 | legal = state.getLegalActions(self.index) 38 | move = self.getMove(legal) 39 | 40 | if move == Directions.STOP: 41 | # Try to move in the same direction as before 42 | if self.lastMove in legal: 43 | move = self.lastMove 44 | 45 | if (self.STOP_KEY in self.keys) and Directions.STOP in legal: move = Directions.STOP 46 | 47 | if move not in legal: 48 | move = random.choice(legal) 49 | 50 | self.lastMove = move 51 | return move 52 | 53 | def getMove(self, legal): 54 | move = Directions.STOP 55 | if (self.WEST_KEY in self.keys or 'Left' in self.keys) and Directions.WEST in legal: move = Directions.WEST 56 | if (self.EAST_KEY in self.keys or 'Right' in self.keys) and Directions.EAST in legal: move = Directions.EAST 57 | if (self.NORTH_KEY in self.keys or 'Up' in self.keys) and Directions.NORTH in legal: move = Directions.NORTH 58 | if (self.SOUTH_KEY in self.keys or 'Down' in self.keys) and Directions.SOUTH in legal: move = Directions.SOUTH 59 | return move 60 | 61 | class KeyboardAgent2(KeyboardAgent): 62 | """ 63 | A second agent controlled by the keyboard. 64 | """ 65 | # NOTE: Arrow keys also work. 66 | WEST_KEY = 'j' 67 | EAST_KEY = "l" 68 | NORTH_KEY = 'i' 69 | SOUTH_KEY = 'k' 70 | STOP_KEY = 'u' 71 | 72 | def getMove(self, legal): 73 | move = Directions.STOP 74 | if (self.WEST_KEY in self.keys) and Directions.WEST in legal: move = Directions.WEST 75 | if (self.EAST_KEY in self.keys) and Directions.EAST in legal: move = Directions.EAST 76 | if (self.NORTH_KEY in self.keys) and Directions.NORTH in legal: move = Directions.NORTH 77 | if (self.SOUTH_KEY in self.keys) and Directions.SOUTH in legal: move = Directions.SOUTH 78 | return move 79 | 80 | 81 | -------------------------------------------------------------------------------- /week2/assignment/ghostAgents.py: -------------------------------------------------------------------------------- 1 | # ghostAgents.py 2 | # -------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | from game import Agent 10 | from game import Actions 11 | from game import Directions 12 | import random 13 | from util import manhattanDistance 14 | import util 15 | 16 | class GhostAgent( Agent ): 17 | def __init__( self, index ): 18 | self.index = index 19 | 20 | def getAction( self, state ): 21 | dist = self.getDistribution(state) 22 | if len(dist) == 0: 23 | return Directions.STOP 24 | else: 25 | return util.chooseFromDistribution( dist ) 26 | 27 | def getDistribution(self, state): 28 | "Returns a Counter encoding a distribution over actions from the provided state." 29 | util.raiseNotDefined() 30 | 31 | class RandomGhost( GhostAgent ): 32 | "A ghost that chooses a legal action uniformly at random." 
33 | def getDistribution( self, state ): 34 | dist = util.Counter() 35 | for a in state.getLegalActions( self.index ): dist[a] = 1.0 36 | dist.normalize() 37 | return dist 38 | 39 | class DirectionalGhost( GhostAgent ): 40 | "A ghost that prefers to rush Pacman, or flee when scared." 41 | def __init__( self, index, prob_attack=0.8, prob_scaredFlee=0.8 ): 42 | self.index = index 43 | self.prob_attack = prob_attack 44 | self.prob_scaredFlee = prob_scaredFlee 45 | 46 | def getDistribution( self, state ): 47 | # Read variables from state 48 | ghostState = state.getGhostState( self.index ) 49 | legalActions = state.getLegalActions( self.index ) 50 | pos = state.getGhostPosition( self.index ) 51 | isScared = ghostState.scaredTimer > 0 52 | 53 | speed = 1 54 | if isScared: speed = 0.5 55 | 56 | actionVectors = [Actions.directionToVector( a, speed ) for a in legalActions] 57 | newPositions = [( pos[0]+a[0], pos[1]+a[1] ) for a in actionVectors] 58 | pacmanPosition = state.getPacmanPosition() 59 | 60 | # Select best actions given the state 61 | distancesToPacman = [manhattanDistance( pos, pacmanPosition ) for pos in newPositions] 62 | if isScared: 63 | bestScore = max( distancesToPacman ) 64 | bestProb = self.prob_scaredFlee 65 | else: 66 | bestScore = min( distancesToPacman ) 67 | bestProb = self.prob_attack 68 | bestActions = [action for action, distance in zip( legalActions, distancesToPacman ) if distance == bestScore] 69 | 70 | # Construct distribution 71 | dist = util.Counter() 72 | for a in bestActions: dist[a] = bestProb / len(bestActions) 73 | for a in legalActions: dist[a] += ( 1-bestProb ) / len(legalActions) 74 | dist.normalize() 75 | return dist 76 | -------------------------------------------------------------------------------- /week5/README.md: -------------------------------------------------------------------------------- 1 | ## Materials 2 | * Slides [here](https://yadi.sk/i/P02qoHng3G7oMt) 3 | * Video lecture (esp. second half) by J. 
Schulman - https://www.youtube.com/watch?v=h1-pj4Y9-kM 4 | * Our [lecture](https://yadi.sk/i/yBO0q4mI3GAxYd), [seminar](https://yadi.sk/i/oWC2M5803GAyFB) (russian) 5 | * Article on dueling DQN - https://arxiv.org/pdf/1511.06581.pdf 6 | * Article on double DQN - https://arxiv.org/abs/1509.06461 7 | * Article on prioritized experience replay - https://arxiv.org/abs/1511.05952 8 | * Video on asynchronous methods (Mnih) - https://www.youtube.com/watch?v=9sx1_u2qVhQ 9 | * Article on bootstrapped DQN - https://papers.nips.cc/paper/6501-deep-exploration-via-bootstrapped-dqn.pdf, [summary](http://pemami4911.github.io/paper-summaries/2016/08/16/Deep-exploration.html) 10 | 11 | 12 | ## More materials 13 | * [recommended] An overview of deep reinforcement learning - https://arxiv.org/pdf/1701.07274v1.pdf 14 | * Reinforcement learning architectures list - https://github.com/5vision/deep-reinforcement-learning-networks 15 | * Building a deep q-network from ~scratch (blog) - https://jaromiru.com/2016/09/27/lets-make-a-dqn-theory/ 16 | * Another guide to DQN from ~scratch (blog) - https://rubenfiszel.github.io/posts/rl4j/2016-08-24-Reinforcement-Learning-and-DQN.html 17 | * Article on asynchronous methods in deep RL - https://arxiv.org/abs/1602.01783 18 | * Successor representations for reinforcement learning - [article](https://arxiv.org/abs/1606.02396), [video](https://www.youtube.com/watch?v=kNqXCn7K-BM&feature=youtu.be) 19 | * [recap] Slides on basic DQN, including target networks - https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf 20 | 21 | 22 | ## Homework 23 | 24 | As usual, there is a "lasagne way" and an "other way". 25 | 26 | #### Lasagne way 27 | 28 | Basically, go to [the notebook](https://github.com/yandexdataschool/Practical_RL/blob/master/week5/Seminar5_deep_rl.ipynb) and follow what's inside. 29 | 30 | #### Other way 31 | 32 | This week's task is to implement (and hopefully compare) target networks, double DQN and/or dueling DQN, and to train them on Atari Breakout. 33 | 34 | * Tensorflow template: [cs294 assignment 3](https://github.com/berkeleydeeprlcourse/homework/tree/master/hw3) 35 | 36 | Implementing prioritized experience replay, bootstrapped DQN or any other cool stuff yields bonus points. You can also choose a different environment if you have issues with Breakout, but don't pick anything too complicated. E.g. your DQN will likely _fail_ on Montezuma's Revenge unless you do weird stuff with the reward function. 37 | 38 | We recommend uploading your results to OpenAI gym and fitting your solution in a notebook (ipython/torch/r) unless your framework is incompatible with that. In the latter case, please supply us with some notes on what code lies where. 39 | 40 | Again, we recommend skimming the lasagne/agentnet assignments to get a grasp of what parameters to start from. 41 | 42 | Bonus assignments remain exactly the same as in the first track. 43 | 44 | Blindly copy-pasting code from any publicly available demos will result in us interrogating you about every significant line of code to make sure you at least understand (and regret) what you copy-pasted. 45 | 46 | 47 | -------------------------------------------------------------------------------- /week2/assignment/featureExtractors.py: -------------------------------------------------------------------------------- 1 | # featureExtractors.py 2 | # -------------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes.
The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | "Feature extractors for Pacman game states" 10 | 11 | from game import Directions, Actions 12 | import util 13 | 14 | class FeatureExtractor: 15 | def getFeatures(self, state, action): 16 | """ 17 | Returns a dict from features to counts 18 | Usually, the count will just be 1.0 for 19 | indicator functions. 20 | """ 21 | util.raiseNotDefined() 22 | 23 | class IdentityExtractor(FeatureExtractor): 24 | def getFeatures(self, state, action): 25 | feats = util.Counter() 26 | feats[(state,action)] = 1.0 27 | return feats 28 | 29 | def closestFood(pos, food, walls): 30 | """ 31 | closestFood -- this is similar to the function that we have 32 | worked on in the search project; here its all in one place 33 | """ 34 | fringe = [(pos[0], pos[1], 0)] 35 | expanded = set() 36 | while fringe: 37 | pos_x, pos_y, dist = fringe.pop(0) 38 | if (pos_x, pos_y) in expanded: 39 | continue 40 | expanded.add((pos_x, pos_y)) 41 | # if we find a food at this location then exit 42 | if food[pos_x][pos_y]: 43 | return dist 44 | # otherwise spread out from the location to its neighbours 45 | nbrs = Actions.getLegalNeighbors((pos_x, pos_y), walls) 46 | for nbr_x, nbr_y in nbrs: 47 | fringe.append((nbr_x, nbr_y, dist+1)) 48 | # no food found 49 | return None 50 | 51 | class SimpleExtractor(FeatureExtractor): 52 | """ 53 | Returns simple features for a basic reflex Pacman: 54 | - whether food will be eaten 55 | - how far away the next food is 56 | - whether a ghost collision is imminent 57 | - whether a ghost is one step away 58 | """ 59 | 60 | def getFeatures(self, state, action): 61 | # extract the grid of food and wall locations and get the ghost locations 62 | food = state.getFood() 63 | walls = state.getWalls() 64 | ghosts = state.getGhostPositions() 65 | 66 | features = util.Counter() 67 | 68 | features["bias"] = 1.0 69 | 70 | # compute the location of pacman after he takes the action 71 | x, y = state.getPacmanPosition() 72 | dx, dy = Actions.directionToVector(action) 73 | next_x, next_y = int(x + dx), int(y + dy) 74 | 75 | # count the number of ghosts 1-step away 76 | features["#-of-ghosts-1-step-away"] = sum((next_x, next_y) in Actions.getLegalNeighbors(g, walls) for g in ghosts) 77 | 78 | # if there is no danger of ghosts then add the food feature 79 | if not features["#-of-ghosts-1-step-away"] and food[next_x][next_y]: 80 | features["eats-food"] = 1.0 81 | 82 | dist = closestFood((next_x, next_y), food, walls) 83 | if dist is not None: 84 | # make the distance a number less than one otherwise the update 85 | # will diverge wildly 86 | features["closest-food"] = float(dist) / (walls.width * walls.height) 87 | features.divideAll(10.0) 88 | return features -------------------------------------------------------------------------------- /week2/alternative/qlearning.py: -------------------------------------------------------------------------------- 1 | # qlearningAgents.py 2 | # ------------------ 3 | ## based on http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 4 | 5 | import random,math 6 | 7 | import numpy as np 8 | from collections import defaultdict 9 | 10 | class QLearningAgent(): 11 | """ 12 | Q-Learning Agent 13 | 14 | Instance variables you have access to 15 | - self.epsilon (exploration prob) 16 | - self.alpha (learning rate) 17 | - self.discount (discount rate aka 
gamma) 18 | 19 | Functions you should use 20 | - self.getLegalActions(state) 21 | which returns legal actions for a state 22 | - self.getQValue(state,action) 23 | which returns Q(state,action) 24 | - self.setQValue(state,action,value) 25 | which sets Q(state,action) := value 26 | 27 | !!!Important!!! 28 | NOTE: please avoid using self._qValues directly to make code cleaner 29 | """ 30 | def __init__(self,alpha,epsilon,discount,getLegalActions): 31 | "We initialize agent and Q-values here." 32 | self.getLegalActions= getLegalActions 33 | self._qValues = defaultdict(lambda:defaultdict(lambda:0)) 34 | self.alpha = alpha 35 | self.epsilon = epsilon 36 | self.discount = discount 37 | 38 | def getQValue(self, state, action): 39 | """ 40 | Returns Q(state,action) 41 | """ 42 | return self._qValues[state][action] 43 | 44 | def setQValue(self,state,action,value): 45 | """ 46 | Sets the Qvalue for [state,action] to the given value 47 | """ 48 | self._qValues[state][action] = value 49 | 50 | #---------------------#start of your code#---------------------# 51 | 52 | def getValue(self, state): 53 | """ 54 | Returns max_action Q(state,action) 55 | where the max is over legal actions. 56 | """ 57 | 58 | possibleActions = self.getLegalActions(state) 59 | #If there are no legal actions, return 0.0 60 | if len(possibleActions) == 0: 61 | return 0.0 62 | 63 | "*** YOUR CODE HERE ***" 64 | return 65 | 66 | def getPolicy(self, state): 67 | """ 68 | Compute the best action to take in a state. 69 | 70 | """ 71 | possibleActions = self.getLegalActions(state) 72 | 73 | #If there are no legal actions, return None 74 | if len(possibleActions) == 0: 75 | return None 76 | 77 | best_action = None 78 | 79 | "*** YOUR CODE HERE ***" 80 | best_action = 81 | return best_action 82 | 83 | def getAction(self, state): 84 | """ 85 | Compute the action to take in the current state, including exploration. 86 | 87 | With probability self.epsilon, we should take a random action. 88 | otherwise - the best policy action (self.getPolicy). 89 | 90 | HINT: You might want to use util.flipCoin(prob) 91 | HINT: To pick randomly from a list, use random.choice(list) 92 | 93 | """ 94 | 95 | # Pick Action 96 | possibleActions = self.getLegalActions(state) 97 | action = None 98 | 99 | #If there are no legal actions, return None 100 | if len(possibleActions) == 0: 101 | return None 102 | 103 | #agent parameters: 104 | epsilon = self.epsilon 105 | 106 | "*** YOUR CODE HERE ***" 107 | 108 | return 109 | 110 | def update(self, state, action, nextState, reward): 111 | """ 112 | You should do your Q-Value update here 113 | 114 | NOTE: You should never call this function, 115 | it will be called on your behalf 116 | 117 | 118 | """ 119 | #agent parameters 120 | gamma = self.discount 121 | learning_rate = self.alpha 122 | 123 | "*** YOUR CODE HERE ***" 124 | reference_qvalue = 125 | 126 | updated_qvalue = (1-learning_rate) * self.getQValue(state,action) + learning_rate * reference_qvalue 127 | self.setQValue(state,action,updated_qvalue) 128 | 129 | 130 | #---------------------#end of your code#---------------------# 131 | 132 | 133 | -------------------------------------------------------------------------------- /week3/sarsa.py: -------------------------------------------------------------------------------- 1 | """ 2 | Expected Value SARSA 3 | This file builds upon the same functions as Q-learning agent (qlearning.py). 4 | 5 | [assignment] 6 | The only thing you must implement is the getValue method. 
7 | - Recall that V(s) in SARSA is not the maximal but the expected Q-value. 8 | - The expectation should be done under agent's policy (e-greedy). 9 | 10 | 11 | Here's usage example: 12 | >>>from sarsa import SarsaAgent 13 | 14 | >>>agent = SarsaAgent(alpha=0.1,epsilon=0.25,discount=0.99, 15 | getLegalActions = lambda s: actions_from_that_state) 16 | >>>action = agent.getAction(state) 17 | >>>agent.update(state,action, next_state,reward) 18 | >>>agent.epsilon *= 0.99 19 | """ 20 | import random,math 21 | 22 | import numpy as np 23 | from collections import defaultdict 24 | 25 | class SarsaAgent(): 26 | """ 27 | Classical SARSA agent. 28 | 29 | The two main methods are 30 | - self.getAction(state) - returns agent's action in that state 31 | - self.update(state,action,reward,nextState,nextAction) - returns agent's next action 32 | 33 | Instance variables you have access to 34 | - self.epsilon (exploration prob) 35 | - self.alpha (learning rate) 36 | - self.discount (discount rate aka gamma) 37 | 38 | """ 39 | def __init__(self,alpha,epsilon,discount,getLegalActions): 40 | "We initialize agent and Q-values here." 41 | self.getLegalActions= getLegalActions 42 | self._qValues = defaultdict(lambda:defaultdict(lambda:0)) 43 | self.alpha = alpha 44 | self.epsilon = epsilon 45 | self.discount = discount 46 | 47 | def getQValue(self, state, action): 48 | """ 49 | Returns Q(state,action) 50 | """ 51 | return self._qValues[state][action] 52 | 53 | def setQValue(self,state,action,value): 54 | """ 55 | Sets the Qvalue for [state,action] to the given value 56 | """ 57 | self._qValues[state][action] = value 58 | 59 | #---------------------#start of your code#---------------------# 60 | 61 | def getPolicy(self, state): 62 | """ 63 | Compute the best action to take in a state. 64 | 65 | """ 66 | possibleActions = self.getLegalActions(state) 67 | 68 | #If there are no legal actions, return None 69 | if len(possibleActions) == 0: 70 | return None 71 | 72 | best_action = None 73 | 74 | "*** this code works exactly as Q-learning ***" 75 | best_action = possibleActions[np.argmax([self.getQValue(state, a) for a in possibleActions])] 76 | return best_action 77 | 78 | def getAction(self, state): 79 | """ 80 | Compute the action to take in the current state, including exploration. 81 | 82 | With probability self.epsilon, we should take a random action. 83 | otherwise - the best policy action (self.getPolicy). 
84 | 85 | HINT: You might want to use util.flipCoin(prob) 86 | HINT: To pick randomly from a list, use random.choice(list) 87 | 88 | """ 89 | 90 | # Pick Action 91 | possibleActions = self.getLegalActions(state) 92 | action = None 93 | 94 | #If there are no legal actions, return None 95 | if len(possibleActions) == 0: 96 | return None 97 | 98 | #agent parameters: 99 | epsilon = self.epsilon 100 | 101 | "*** Epsilon-greedy strategy exactly as Q-learning ***" 102 | if np.random.random()<=epsilon: 103 | return random.choice(possibleActions) 104 | else: 105 | action = self.getPolicy(state) 106 | return action 107 | 108 | def update(self, state, action, nextState,nextAction, reward): 109 | """ 110 | You should do your Q-Value update here 111 | 112 | NOTE: You should never call this function, 113 | it will be called on your behalf 114 | 115 | 116 | """ 117 | #agent parameters 118 | gamma = self.discount 119 | learning_rate = self.alpha 120 | 121 | "*** YOUR CODE HERE ***" 122 | reference_qvalue = 123 | 124 | updated_qvalue = (1-learning_rate) * self.getQValue(state,action) + learning_rate * reference_qvalue 125 | 126 | self.setQValue(state,action,updated_qvalue) 127 | 128 | 129 | #---------------------#end of your code#---------------------# 130 | 131 | 132 | -------------------------------------------------------------------------------- /week4/README.md: -------------------------------------------------------------------------------- 1 | ## Materials 2 | * [__lecture slides__](https://docviewer.yandex.ru/?url=ya-disk-public%3A%2F%2FG3IXcG62RwNUGSSos%2BuGhtgXNfsBjP9RxUtUfgCffIk%3D%3A%2Flecture4.pdf&name=lecture4.pdf&c=58b0d2eb4e0f) 3 | * David Silver lecture - https://www.youtube.com/watch?v=UoPei5o4fps&t=3s 4 | * More practical and less theoretical lecture from MIT 6.S191 - https://www.youtube.com/watch?v=xWe58WGWmlk 5 | * Our [lecture](https://yadi.sk/i/AHDU2p_j3FT3nr), [seminar](https://yadi.sk/i/EeUeheri3FT3ra) (russian) 6 | * Understanding approximate q-learning - https://danieltakeshi.github.io/2016/10/31/going-deeper-into-reinforcement-learning-understanding-q-learning-and-linear-function-approximation/ 7 | * Karpathy's post on approximate RL - http://karpathy.github.io/2016/05/31/rl/ 8 | 9 | ## More materials 10 | * __[recommended]__ How to _actually_ do deep reinforcement learning by J. Schulman - http://rll.berkeley.edu/deeprlcourse/docs/nuts-and-bolts.pdf 11 | * interactive demos in your browser: [demo1](http://cs.stanford.edu/people/karpathy/convnetjs/demo/rldemo.html)(karpathy), [demo2](http://janhuenermann.com/projects/learning-to-drive)(Hünermann) 12 | * A guide to deep RL from ~scratch (nervana blog) - https://www.nervanasys.com/demystifying-deep-reinforcement-learning/ 13 | 14 | 15 | ## Homework 16 | 17 | From now on, we introduce an alternative homework track that's not tied to lasagne/agentnet/rllab/any_other_framework. In that track, you'll be tasked with similar problems, but they will not be tied to jupyter notebooks with lasagne networks. 18 | 19 | You can choose whichever track you want, but unless you're expertly familiar with your framework, we recommend you to start by completing the task in lasagne and only then reproduce your solution in your chosen framework. 20 | 21 | 22 | #### Recommended path 23 | 24 | * Step 1 - go to [Seminar4.1](https://github.com/yandexdataschool/Practical_RL/blob/master/week4/Seminar4.1_experience_replay.ipynb), complete it and make sure it reaches the desired reward on Acrobot-v1. 
Then go to the homework section (at the end) and follow the instructions from there. 25 | * Tip - for your network to work properly on Acrobot-v1, please either use non-saturating nonlinearities (elu/leaky_relu/softplus), or normalize observations, or initialize with smaller weights. Otherwise, e.g. sigmoid may get saturated and fail to learn anything. 26 | * Step 2 - go to [Seminar4.2](https://github.com/yandexdataschool/Practical_RL/blob/master/week4/Seminar4.2_conv_agent.ipynb) and make it beat DoomBasic. 27 | 28 | Doom environments are powered by VizDoom (via doom_py), which may require separate installation. If you're using the [docker container](https://github.com/yandexdataschool/Practical_RL/blob/master/docker) or running in binder, the dependency should already be installed. 29 | 30 | To install doom envs manually, follow the instructions at the top of the [Seminar4.2](https://github.com/yandexdataschool/Practical_RL/blob/master/week4/Seminar4.2_conv_agent.ipynb) notebook. 31 | 32 | For example, on python2, ubuntu 14, stardate 2017.02.27, the following was enough: 33 | ``` 34 | apt-get install -y gcc g++ wget unzip libsdl2-dev libboost-all-dev 35 | pip install gym_pull 36 | pip install ppaquette-gym-doom 37 | ``` 38 | 39 | For macOS (OS X), install brew and then 40 | ``` 41 | brew install boost boost-python sdl2 cmake 42 | pip install ppaquette-gym-doom 43 | ``` 44 | 45 | If it simply won't install, pick `BreakoutDeterministic-v0` and try to get an average reward >= +10. 46 | 47 | 48 | #### Alternative frameworks 49 | 50 | The task is to implement approximate Q-learning with experience replay and show that it works on `Acrobot-v1`, `LunarLander-v2` and `ppaquette/DoomBasic-v0` (or other versions of those environments); a minimal replay-buffer sketch is given at the end of this page. 51 | 52 | If you use tensorflow, there's a very convenient [notebook](https://github.com/yandexdataschool/Practical_RL/blob/master/week4/Seminar4.0_recap_approx_qlearning-tf.ipynb) for you to start from (by [Scitator](https://github.com/Scitator)) 53 | 54 | We do, however, recommend skimming the lasagne/agentnet assignments to get a grasp of what parameters to start from. 55 | 56 | You're also encouraged to fit your solution in a notebook (ipython/torch/r) unless your framework is incompatible with that. In the latter case, please supply us with some notes on what code lies where. 57 | 58 | Bonus assignments remain exactly the same as in the first track. 59 | 60 | Blindly copy-pasting code from any publicly available demos will result in us interrogating you about every significant line of code to make sure you at least understand (and regret) what you copy-pasted.
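For the alternative-framework track, the replay buffer itself is framework-agnostic. Below is a minimal sketch of one possible buffer (the class name, capacity and batch size are illustrative choices, not course code); any framework can consume the sampled numpy batches.

```python
# Minimal uniform experience replay buffer (illustration only).
import random
from collections import deque

import numpy as np

class ReplayBuffer:
    def __init__(self, capacity=50000):
        # old transitions are dropped automatically once capacity is reached
        self.storage = deque(maxlen=capacity)

    def __len__(self):
        return len(self.storage)

    def add(self, state, action, reward, next_state, done):
        self.storage.append((state, action, reward, next_state, done))

    def sample(self, batch_size=32):
        batch = random.sample(self.storage, batch_size)
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        return states, actions, rewards, next_states, dones

# typical interaction loop: store every transition, then train on a random batch
# buffer.add(s, a, r, next_s, done)
# if len(buffer) > 1000:
#     s_b, a_b, r_b, ns_b, d_b = buffer.sample(32)
#     ... one gradient step on the TD error of this batch ...
```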
61 | 62 | 63 | -------------------------------------------------------------------------------- /week3.5/fix_my_nn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from lasagne.layers import *\n", 12 | "from lasagne.nonlinearities import *\n", 13 | "from lasagne import init" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": { 20 | "collapsed": false 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "nn = InputLayer([None,3,100,100])\n", 25 | "\n", 26 | "nn = Conv2DLayer(nn,num_filters=512, filter_size=(3,3),\n", 27 | " W = init.Constant(0))\n", 28 | "\n", 29 | "nn = Conv2DLayer(nn,num_filters=128,filter_size=(3,3),\n", 30 | " W = init.Constant(0))\n", 31 | "\n", 32 | "nn = Conv2DLayer(nn,num_filters=32,filter_size=(3,3),\n", 33 | " W = init.Constant(0))\n", 34 | "\n", 35 | "nn = Pool2DLayer(nn,pool_size=(6,6),mode='max')\n", 36 | "\n", 37 | "nn = Conv2DLayer(nn,num_filters=8,filter_size=(10,10),\n", 38 | " W = init.Normal(std=0.01))\n", 39 | "\n", 40 | "nn = Conv2DLayer(nn,num_filters=8,filter_size=(10,10),\n", 41 | " W = init.Normal(std=0.01))\n", 42 | "\n", 43 | "nn = Pool2DLayer(nn,pool_size=(3,3),mode='max')\n", 44 | "\n", 45 | "nn = DenseLayer(nn,512,nonlinearity=softmax)\n", 46 | "\n", 47 | "nn = DropoutLayer(nn,p=0.5)\n", 48 | "\n", 49 | "nn = DenseLayer(nn,512,nonlinearity=softmax)\n", 50 | "\n", 51 | "nn = DenseLayer(nn,10,nonlinearity=sigmoid)\n", 52 | "\n", 53 | "nn = DropoutLayer(nn,p=0.5)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "```\n", 61 | "\n", 62 | "```\n", 63 | "\n", 64 | "```\n", 65 | "\n", 66 | "```\n", 67 | "\n", 68 | "```\n", 69 | "\n", 70 | "```\n", 71 | "\n", 72 | "```\n", 73 | "\n", 74 | "```\n", 75 | "\n", 76 | "```\n", 77 | "\n", 78 | "```\n", 79 | "\n", 80 | "```\n", 81 | "\n", 82 | "```\n", 83 | "\n", 84 | "```\n", 85 | "\n", 86 | "```\n", 87 | "\n", 88 | "```\n", 89 | "\n", 90 | "```\n", 91 | "\n", 92 | "```\n", 93 | "\n", 94 | "```\n", 95 | "\n", 96 | "```\n", 97 | "\n", 98 | "```\n", 99 | "\n", 100 | "```\n", 101 | "\n", 102 | "```\n", 103 | "\n", 104 | "```\n", 105 | "\n", 106 | "```\n", 107 | "\n", 108 | "```\n", 109 | "\n", 110 | "```\n", 111 | "\n", 112 | "```\n", 113 | "\n", 114 | "```\n", 115 | "\n", 116 | "```\n", 117 | "\n", 118 | "```\n", 119 | "\n", 120 | "\n", 121 | "# Book of grudges\n", 122 | "* zero init for weights will cause symmetry effect\n", 123 | "* Too many filters for first 3x3 convolution - will lead to enormous matrix while there's just not enough relevant combinations of 3x3 images (overkill).\n", 124 | "* Usually the further you go, the more filters you need.\n", 125 | "* large filters (10x10 is generally a bad pactice, and you definitely need more than 10 of them\n", 126 | "* the second of 10x10 convolution gets 8x6x6 image as input, so it's technically unable to perform such convolution.\n", 127 | "* Softmax nonlinearity effectively makes only 1 or a few neurons from the entire layer to \"fire\", rendering 512-neuron layer almost useless. Softmax at the output layer is okay though\n", 128 | "* Dropout after probability prediciton is just lame. A few random classes get probability of 0, so your probabilities no longer sum to 1 and crossentropy goes -inf." 
129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "collapsed": true 136 | }, 137 | "outputs": [], 138 | "source": [] 139 | } 140 | ], 141 | "metadata": { 142 | "kernelspec": { 143 | "display_name": "Python [Root]", 144 | "language": "python", 145 | "name": "Python [Root]" 146 | }, 147 | "language_info": { 148 | "codemirror_mode": { 149 | "name": "ipython", 150 | "version": 2 151 | }, 152 | "file_extension": ".py", 153 | "mimetype": "text/x-python", 154 | "name": "python", 155 | "nbconvert_exporter": "python", 156 | "pygments_lexer": "ipython2", 157 | "version": "2.7.12" 158 | } 159 | }, 160 | "nbformat": 4, 161 | "nbformat_minor": 0 162 | } 163 | -------------------------------------------------------------------------------- /week2/README.md: -------------------------------------------------------------------------------- 1 | ## Materials 2 | * [__Lecture slides__](https://docviewer.yandex.ru/?url=ya-disk-public%3A%2F%2FG3IXcG62RwNUGSSos%2BuGhtgXNfsBjP9RxUtUfgCffIk%3D%3A%2Flecture2.pdf&name=lecture2.pdf&c=58a61e22b9fb) 3 | * Our [lecture](https://yadi.sk/i/cVawsPkK3EtGJj),[seminar](https://yadi.sk/i/dQmolwOy3EtGNK) (russian) 4 | * [__main__] Lecture by David Silver (english): https://www.youtube.com/watch?v=PnHCvfgC_ZA 5 | * Alternative lecture by Pieter Abbeel (english): https://www.youtube.com/watch?v=ifma8G7LegE 6 | * Alternative lecture by John Schulmann (english): https://www.youtube.com/watch?v=IL3gVyJMmhg 7 | 8 | ## Bonus materials 9 | * Policy improvement theorems from Sutton book - http://webdocs.cs.ualberta.ca/~sutton/book/ebook/node42.html 10 | * Lecture II by Dan Klein (english): https://www.youtube.com/watch?v=jUoZg513cdE 11 | * Qlearning guide from Habr (russian): https://habrahabr.ru/post/308094/ 12 | * A great turorial/assignment on value-based methods from CS294 - https://github.com/berkeleydeeprlcourse/homework/blob/master/hw2/HW2.ipynb 13 | 14 | ## Homework description: 15 | 16 | For ease of access, we have 2 versions of the same homework. They feature the same algorithmic part but a bit different examples. 17 | 18 | You can pick whichever one you prefer but mind the technical limitations. If you have a python2 on a local machine (NOT in docker), even if it's on Windows, we recommend the ./assignment one. 19 | 20 | ## ./assignment 21 | _this assignment borrows code from awesome [cs188](http://ai.berkeley.edu/project_overview.html)_ 22 | This homework assignment works on __python2 only__. If you stick to py3, consider alternative homework. Or just install it for this homework alone and remove afterwards. 23 | 24 | This homework also requires some physical display (e.g. laptop monitor). It won't work on binder VM / headless server. Please run it on laptop or consider ./alternative 25 | 26 | ### Part I (5 points) 27 | * Go to ./assignment, edit [__qlearningagents.py__](https://github.com/yandexdataschool/Practical_RL/blob/master/week2/assignment/qlearningAgents.py) (see instructions inside) 28 | * Make sure you can tune agent to beat ./run_crawler.sh 29 | * on windows, just run `python crawler.py` from cmd in the project directory 30 | * other ./run* files are mostly for your amusement. 
31 | * ./run_pacman.sh will need more epochs to converge, see [comments](https://github.com/yandexdataschool/Practical_RL/blob/master/week2/assignment/run_pacman.sh) 32 | * on windows, just type `python pacman.py -p PacmanQAgent -x 2000 -n 2010 -l smallGrid` in cmd from the assignment dir 33 | (YSDA/HSE) Please submit only the qlearningAgents.py file and include a brief text report as comments in it. 34 | 35 | ### Part II (5+ points) 36 | _Please make a separate copy of qlearningAgents.py for this assignment_ 37 | 38 | The default tabular q-learning requires an unrealistic amount of experience to learn anything useful on pacman tasks. This is mostly due to the extremely large state space, combining the positions of pacman, the ghosts and all dots. 39 | 40 | To speed up training you will need to implement a preprocessor that extracts new discrete features from the state space. You can design these features to account only for the most important stuff around pacman. This time, it's okay to use environment-specific duct tape :) 41 | 42 | Please read the tips on how to solve this [__here__](https://github.com/yandexdataschool/Practical_RL/blob/master/week2/homework_tips.md). Also, if you find some state spaces that work amazingly well on pacman, feel free to propose a Pull Request with advice 43 | 44 | (HSE/YSDA) Please send us 45 | * The alternative qlearningAgents.py file (and any other files you modified) 46 | * A short description of what you did there 47 | * How to run it. Usually something like `python pacman.py -p PacmanQAgent -x SOMETHING -n SOMETHING -l __mediumClassic__ -SOMETHING SOMETHING ...` 48 | * The end of the train/test log (or even the whole log), including at least the last iteration of learning and the final statistics (especially winrate) 49 | 50 | To get 5 points, your algorithm should solve __mediumGrid__ more than 50% of the time. Creative features and outstanding performance on __mediumClassic__ yield bonus points! 51 | 52 | ## ./alternative 53 | Alternative homework description: 54 | * Go to [the notebook](https://github.com/yandexdataschool/Practical_RL/blob/master/week2/alternative/homework.ipynb) 55 | * The assignment is described there. 56 | * If you use binder/server, see week1 for an example of how to run CartPole and other envs. 57 | 58 | 59 | ### Grading (alternative) 60 | * 5 points for implementing q-learning and testing on taxi 61 | * 5 points for solving CartPole-v0 62 | * bonus tasks listed inside 63 | -------------------------------------------------------------------------------- /week3/qlearning.py: -------------------------------------------------------------------------------- 1 | """ 2 | Q-learning 3 | This file contains the same q-learning agent you implemented in the previous assignment. 4 | The only difference is that it doesn't need any other files with it, so you can use it as a standalone module.
5 | 6 | Here's an example: 7 | >>>from qlearning import QLearningAgent 8 | 9 | >>>agent = QLearningAgent(alpha=0.5,epsilon=0.25,discount=0.99, 10 | getLegalActions = lambda s: actions_from_that_state) 11 | >>>action = agent.getAction(state) 12 | >>>agent.update(state,action, next_state,reward) 13 | >>>agent.epsilon *= 0.99 14 | """ 15 | 16 | import random,math 17 | 18 | import numpy as np 19 | from collections import defaultdict 20 | 21 | class QLearningAgent(): 22 | """ 23 | Q-Learning Agent 24 | 25 | The two main methods are 26 | - self.getAction(state) - returns agent's action in that state 27 | - self.update(state,action,nextState,reward) - returns agent's next action 28 | 29 | Functions you should use 30 | - self.getLegalActions(state) 31 | which returns legal actions for a state 32 | - self.getQValue(state,action) 33 | which returns Q(state,action) 34 | - self.setQValue(state,action,value) 35 | which sets Q(state,action) := value 36 | 37 | !!!Important!!! 38 | NOTE: please avoid using self._qValues directly to make code cleaner 39 | """ 40 | def __init__(self,alpha,epsilon,discount,getLegalActions): 41 | "We initialize agent and Q-values here." 42 | self.getLegalActions= getLegalActions 43 | self._qValues = defaultdict(lambda:defaultdict(lambda:0)) 44 | self.alpha = alpha 45 | self.epsilon = epsilon 46 | self.discount = discount 47 | 48 | def getQValue(self, state, action): 49 | """ 50 | Returns Q(state,action) 51 | """ 52 | return self._qValues[state][action] 53 | 54 | def setQValue(self,state,action,value): 55 | """ 56 | Sets the Qvalue for [state,action] to the given value 57 | """ 58 | self._qValues[state][action] = value 59 | 60 | #---------------------#start of your code#---------------------# 61 | 62 | def getValue(self, state): 63 | """ 64 | Returns max_action Q(state,action) 65 | where the max is over legal actions. 66 | """ 67 | 68 | possibleActions = self.getLegalActions(state) 69 | #If there are no legal actions, return 0.0 70 | if len(possibleActions) == 0: 71 | return 0.0 72 | 73 | "*** YOUR CODE HERE ***" 74 | return max([self.getQValue(state, a) for a in possibleActions]) 75 | 76 | def getPolicy(self, state): 77 | """ 78 | Compute the best action to take in a state. 79 | 80 | """ 81 | possibleActions = self.getLegalActions(state) 82 | 83 | #If there are no legal actions, return None 84 | if len(possibleActions) == 0: 85 | return None 86 | 87 | best_action = None 88 | 89 | best_action = possibleActions[np.argmax([self.getQValue(state, a) for a in possibleActions])] 90 | return best_action 91 | 92 | def getAction(self, state): 93 | """ 94 | Compute the action to take in the current state, including exploration. 95 | 96 | With probability self.epsilon, we should take a random action. 97 | otherwise - the best policy action (self.getPolicy). 
98 | 99 | HINT: You might want to use util.flipCoin(prob) 100 | HINT: To pick randomly from a list, use random.choice(list) 101 | 102 | """ 103 | 104 | # Pick Action 105 | possibleActions = self.getLegalActions(state) 106 | action = None 107 | 108 | #If there are no legal actions, return None 109 | if len(possibleActions) == 0: 110 | return None 111 | 112 | #agent parameters: 113 | epsilon = self.epsilon 114 | 115 | if np.random.random()<=epsilon: 116 | return random.choice(possibleActions) 117 | else: 118 | action = self.getPolicy(state) 119 | return action 120 | 121 | def update(self, state, action, nextState, reward): 122 | """ 123 | You should do your Q-Value update here 124 | 125 | NOTE: You should never call this function, 126 | it will be called on your behalf 127 | 128 | 129 | """ 130 | #agent parameters 131 | gamma = self.discount 132 | learning_rate = self.alpha 133 | 134 | reference_qvalue = reward + gamma * self.getValue(nextState) 135 | updated_qvalue = (1-learning_rate) * self.getQValue(state,action) + learning_rate * reference_qvalue 136 | self.setQValue(state,action,updated_qvalue) 137 | 138 | 139 | #---------------------#end of your code#---------------------# 140 | 141 | 142 | -------------------------------------------------------------------------------- /yet_another_week/README.md: -------------------------------------------------------------------------------- 1 | In this week you can find several sections covering advanced topics in RL, along with less advanced topics that we couldn't squeeze into the main track 2 | 3 | ## Advanced policy gradient methods 4 | This section covers some steroids for policy gradient methods, along with a cool general trick called 5 | 6 | * Lecture on NPG and TRPO by J. Schulman - [video](https://www.youtube.com/watch?v=_t5fpZuuf-4) 7 | * Alternative lecture on TRPO and open problems by... J. schulman - [video](https://www.youtube.com/watch?v=gb5Q2XL5c8A) 8 | * Our [__slides__](https://yadi.sk/i/9j6S4WVp3HgEdn) on TRPO, video: [lecture](https://yadi.sk/i/1oyihBnm3HiKHm), [seminar](https://yadi.sk/i/b0ol2gUV3HiKKJ) (russian) 9 | * Original articles - [TRPO](https://arxiv.org/abs/1502.05477), [NPG](https://papers.nips.cc/paper/2073-a-natural-policy-gradient.pdf) 10 | 11 | 12 | * __Assignment:__ [seminar_TRPO.ipynb](https://github.com/yandexdataschool/Practical_RL/blob/master/yet_another_week/seminar_TRPO.ipynb) 13 | * TF version: [pending] 14 | 15 | ## Model-based RL: Planning 16 | * Planning by dynamic programming (D. 
Silver) - [video](https://www.youtube.com/watch?v=Nd1-UUMVfz4) 17 | * Planning via tree search [videos 2-6 from CS188](https://www.youtube.com/channel/UCHBzJsIcRIVuzzHVYabikTQ) 18 | * Our lecture: 19 | * Slides [part1](https://yadi.sk/i/3PM9zCP33J3ub3) (intro), [part2](https://yadi.sk/i/M03xvZ2y3JMQre) (pomdp) 20 | * [Lecture](https://yadi.sk/i/lOAUu7o13JBHFz) & [seminar](https://yadi.sk/i/bkmjEZrk3JBHGF) 21 | * Monte-carlo tree search 22 | * Udacity video on monte-carlo tree search (first part of a chain) - [video](https://www.youtube.com/watch?v=onBYsen2_eA) 23 | * Reminder: UCB-1 - [slides](https://www.cs.bham.ac.uk/internal/courses/robotics/lectures/ucb1.pdf) 24 | * Monte-carlo tree search step-by-step by J.Levine - [video](https://www.youtube.com/watch?v=UXW2yZndl7U) 25 | * Guide to MCTS (monte-carlo tree search) - [post](http://www.cameronius.com/research/mcts/about/index.html) 26 | * Another guide to MCTS - [url](https://jeffbradberry.com/posts/2015/09/intro-to-monte-carlo-tree-search/) 27 | * Integrating learning and planning (D. Silver) - [video](https://www.youtube.com/watch?v=ItMutbeOHtc&t=1241s) 28 | 29 | * __Assignment:__ [seminar_MCTS.ipynb](https://github.com/yandexdataschool/Practical_RL/blob/master/yet_another_week/seminar_MCTS.ipynb) 30 | 31 | * Approximating the MCTS optimal actions - 5vision solution for deephack.RL, code by Mikhail Pavlov - [repo](https://github.com/5vision/uct_atari) 32 | 33 | ## Reinforcement learning in large/continuous action spaces 34 | While you already know algorithms that will work with continuously many actions, it can't hurt to learn something more specialized. 35 | * Deterministic policy gradient - [article](https://arxiv.org/pdf/1512.07679.pdf), [post+code](https://yanpanlau.github.io/2016/10/11/Torcs-Keras.html) 36 | * Stochastic value gradient - [article](https://arxiv.org/abs/1510.09142) 37 | * Q-learning with normalized advantage functions - [article](https://arxiv.org/abs/1603.00748), [code1](https://github.com/carpedm20/NAF-tensorflow), [code2](http://bit.ly/2qx2087) 38 | * Embedding large discrete action spaces for RL - [article](https://arxiv.org/pdf/1512.07679.pdf) 39 | * Lecture by A. Seleznev, 5vision (russian) - [video](www.youtube.com/watch?v=j1L2FnanXPo&t=119m45s) 40 | 41 | ## Other 42 | * Learning by imitation - [video](https://www.youtube.com/watch?v=kl_G95uKTHw), [assignment](http://rll.berkeley.edu/deeprlcourse/docs/hw1.pdf)(berkeley cs294) 43 | * Knowledge transfer in RL - [video](https://www.youtube.com/watch?v=Hx4XpVdJOI0)(berkeley cs294) 44 | * Inverse reinforcement learning - [video](https://www.youtube.com/watch?v=J2blDuU3X1I) 45 | * Hierarchical reinforcemnt learning - [pending] 46 | * [Your contribution] 47 | 48 | ## A list of lists 49 | * [awesome_rl](https://github.com/aikorea/awesome-rl/) - a curated list of resources dedicated to reinforcement learning. 
50 | * [junhyukoh's list](https://github.com/junhyukoh/deep-reinforcement-learning-papers) 51 | * [muupan's list](https://github.com/muupan/deep-reinforcement-learning-papers) 52 | * Courses: 53 | * [CS294: deep reinforcement learning](http://rll.berkeley.edu/deeprlcourse/) 54 | * [Silver's RL course](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html) 55 | * [Sutton's book, 2nd edition](http://incompleteideas.net/sutton/book/the-book-2nd.html) 56 | * [Implementations of many basic RL algorithms (raw and/or tensorflow)](https://github.com/dennybritz/reinforcement-learning) 57 | * Reddit: [General ML](https://www.reddit.com/r/MachineLearning/), [RL](https://www.reddit.com/r/reinforcementlearning/), [CS294](https://www.reddit.com/r/berkeleydeeprlcourse/) 58 | * [This great link you could have contributed] 59 | 60 | -------------------------------------------------------------------------------- /week3/expected_value_sarsa.py: -------------------------------------------------------------------------------- 1 | """ 2 | Expected Value SARSA 3 | This file builds upon the same functions as Q-learning agent (qlearning.py). 4 | 5 | [assignment] 6 | The only thing you must implement is the getValue method. 7 | - Recall that V(s) in SARSA is not the maximal but the expected Q-value. 8 | - The expectation should be done under agent's policy (e-greedy). 9 | 10 | 11 | Here's usage example: 12 | >>>from expected_value_sarsa import EVSarsaAgent 13 | 14 | >>>agent = EVSarsaAgent(alpha=0.5,epsilon=0.25,discount=0.99, 15 | getLegalActions = lambda s: actions_from_that_state) 16 | >>>action = agent.getAction(state) 17 | >>>agent.update(state,action, next_state,reward) 18 | >>>agent.epsilon *= 0.99 19 | """ 20 | 21 | import random,math 22 | 23 | import numpy as np 24 | from collections import defaultdict 25 | 26 | class EVSarsaAgent(): 27 | """ 28 | Expected Value SARSA Agent. 29 | 30 | The two main methods are 31 | - self.getAction(state) - returns agent's action in that state 32 | - self.update(state,action,nextState,reward) - returns agent's next action 33 | 34 | Instance variables you have access to 35 | - self.epsilon (exploration prob) 36 | - self.alpha (learning rate) 37 | - self.discount (discount rate aka gamma) 38 | 39 | """ 40 | def __init__(self,alpha,epsilon,discount,getLegalActions): 41 | "We initialize agent and Q-values here." 42 | self.getLegalActions= getLegalActions 43 | self._qValues = defaultdict(lambda:defaultdict(lambda:0)) 44 | self.alpha = alpha 45 | self.epsilon = epsilon 46 | self.discount = discount 47 | 48 | def getQValue(self, state, action): 49 | """ 50 | Returns Q(state,action) 51 | """ 52 | return self._qValues[state][action] 53 | 54 | def setQValue(self,state,action,value): 55 | """ 56 | Sets the Qvalue for [state,action] to the given value 57 | """ 58 | self._qValues[state][action] = value 59 | 60 | #---------------------#start of your code#---------------------# 61 | 62 | def getValue(self, state): 63 | """ 64 | Returns V(s) according to expected value SARSA algorithm 65 | This should be equal to expected action q-value over action probabilities defined 66 | by epsilon-greedy policy with current epsilon. 
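        In other words: V(s) = sum over actions a of pi(a|s) * Q(s,a),
        where pi is the epsilon-greedy policy with the current epsilon.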
67 | """ 68 | 69 | possibleActions = self.getLegalActions(state) 70 | #If there are no legal actions, return 0.0 71 | if len(possibleActions) == 0: 72 | return 0.0 73 | 74 | #You'll need this to estimate action probabilities 75 | epsilon = self.epsilon 76 | 77 | value = 78 | return value 79 | 80 | def getPolicy(self, state): 81 | """ 82 | Compute the best action to take in a state. 83 | 84 | """ 85 | possibleActions = self.getLegalActions(state) 86 | 87 | #If there are no legal actions, return None 88 | if len(possibleActions) == 0: 89 | return None 90 | 91 | best_action = None 92 | 93 | best_action = possibleActions[np.argmax([self.getQValue(state, a) for a in possibleActions])] 94 | return best_action 95 | 96 | def getAction(self, state): 97 | """ 98 | Compute the action to take in the current state, including exploration. 99 | 100 | With probability self.epsilon, we should take a random action. 101 | otherwise - the best policy action (self.getPolicy). 102 | 103 | HINT: You might want to use util.flipCoin(prob) 104 | HINT: To pick randomly from a list, use random.choice(list) 105 | 106 | """ 107 | 108 | # Pick Action 109 | possibleActions = self.getLegalActions(state) 110 | action = None 111 | 112 | #If there are no legal actions, return None 113 | if len(possibleActions) == 0: 114 | return None 115 | 116 | #agent parameters: 117 | epsilon = self.epsilon 118 | 119 | if np.random.random()<=epsilon: 120 | return random.choice(possibleActions) 121 | else: 122 | action = self.getPolicy(state) 123 | return action 124 | 125 | def update(self, state, action, nextState, reward): 126 | """ 127 | You should do your Q-Value update here 128 | 129 | NOTE: You should never call this function, 130 | it will be called on your behalf 131 | 132 | 133 | """ 134 | #agent parameters 135 | gamma = self.discount 136 | learning_rate = self.alpha 137 | 138 | reference_qvalue = reward + gamma * self.getValue(nextState) 139 | updated_qvalue = (1-learning_rate) * self.getQValue(state,action) + learning_rate * reference_qvalue 140 | self.setQValue(state,action,updated_qvalue) 141 | 142 | 143 | #---------------------#end of your code#---------------------# 144 | 145 | 146 | -------------------------------------------------------------------------------- /week2/assignment/qlearningAgents.py: -------------------------------------------------------------------------------- 1 | # qlearningAgents.py 2 | # ------------------ 3 | ## based on http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 4 | 5 | from game import * 6 | from learningAgents import ReinforcementAgent 7 | from featureExtractors import * 8 | 9 | import random,util,math 10 | from collections import defaultdict 11 | 12 | class QLearningAgent(ReinforcementAgent): 13 | """ 14 | Q-Learning Agent 15 | 16 | Instance variables you have access to 17 | - self.epsilon (exploration prob) 18 | - self.alpha (learning rate) 19 | - self.discount (discount rate aka gamma) 20 | 21 | Functions you should use 22 | - self.getLegalActions(state) 23 | which returns legal actions for a state 24 | - self.getQValue(state,action) 25 | which returns Q(state,action) 26 | - self.setQValue(state,action,value) 27 | which sets Q(state,action) := value 28 | 29 | !!!Important!!! 30 | NOTE: please avoid using self._qValues directly to make code cleaner 31 | """ 32 | def __init__(self, **args): 33 | "We initialize agent and Q-values here." 
34 | ReinforcementAgent.__init__(self, **args) 35 | self._qValues = defaultdict(lambda:defaultdict(lambda:0)) 36 | 37 | 38 | def getQValue(self, state, action): 39 | """ 40 | Returns Q(state,action) 41 | """ 42 | return self._qValues[state][action] 43 | 44 | def setQValue(self,state,action,value): 45 | """ 46 | Sets the Qvalue for [state,action] to the given value 47 | """ 48 | self._qValues[state][action] = value 49 | 50 | #---------------------#start of your code#---------------------# 51 | 52 | def getValue(self, state): 53 | """ 54 | Returns max_action Q(state,action) 55 | where the max is over legal actions. 56 | """ 57 | 58 | possibleActions = self.getLegalActions(state) 59 | #If there are no legal actions, return 0.0 60 | if len(possibleActions) == 0: 61 | return 0.0 62 | 63 | "*** YOUR CODE HERE ***" 64 | raise NotImplementedError 65 | 66 | return 0. 67 | 68 | def getPolicy(self, state): 69 | """ 70 | Compute the best action to take in a state. 71 | 72 | """ 73 | possibleActions = self.getLegalActions(state) 74 | 75 | #If there are no legal actions, return None 76 | if len(possibleActions) == 0: 77 | return None 78 | 79 | best_action = None 80 | 81 | "*** YOUR CODE HERE ***" 82 | raise NotImplementedError 83 | 84 | return best_action 85 | 86 | def getAction(self, state): 87 | """ 88 | Compute the action to take in the current state, including exploration. 89 | 90 | With probability self.epsilon, we should take a random action. 91 | otherwise - the best policy action (self.getPolicy). 92 | 93 | HINT: You might want to use util.flipCoin(prob) 94 | HINT: To pick randomly from a list, use random.choice(list) 95 | 96 | """ 97 | 98 | # Pick Action 99 | possibleActions = self.getLegalActions(state) 100 | action = None 101 | 102 | #If there are no legal actions, return None 103 | if len(possibleActions) == 0: 104 | return None 105 | 106 | #agent parameters: 107 | epsilon = self.epsilon 108 | 109 | "*** YOUR CODE HERE ***" 110 | raise NotImplementedError 111 | 112 | return action 113 | 114 | def update(self, state, action, nextState, reward): 115 | """ 116 | You should do your Q-Value update here 117 | 118 | NOTE: You should never call this function, 119 | it will be called on your behalf 120 | 121 | 122 | """ 123 | #agent parameters 124 | gamma = self.discount 125 | learning_rate = self.alpha 126 | 127 | "*** YOUR CODE HERE ***" 128 | raise NotImplementedError 129 | 130 | reference_qvalue = PleaseImplementMe 131 | updated_qvalue = PleaseImplementMe 132 | 133 | self.setQValue(PleaseImplementMe,PleaseImplementMe,updated_qvalue) 134 | 135 | 136 | #---------------------#end of your code#---------------------# 137 | 138 | 139 | 140 | class PacmanQAgent(QLearningAgent): 141 | "Exactly the same as QLearningAgent, but with different default parameters" 142 | 143 | def __init__(self, epsilon=0.05,gamma=0.8,alpha=0.2, numTraining=0, **args): 144 | """ 145 | These default parameters can be changed from the pacman.py command line. 146 | For example, to change the exploration rate, try: 147 | python pacman.py -p PacmanQLearningAgent -a epsilon=0.1 148 | 149 | alpha - learning rate 150 | epsilon - exploration rate 151 | gamma - discount factor 152 | numTraining - number of training episodes, i.e. 
no learning after these many episodes 153 | """ 154 | args['epsilon'] = epsilon 155 | args['gamma'] = gamma 156 | args['alpha'] = alpha 157 | args['numTraining'] = numTraining 158 | self.index = 0 # This is always Pacman 159 | QLearningAgent.__init__(self, **args) 160 | 161 | def getAction(self, state): 162 | """ 163 | Simply calls the getAction method of QLearningAgent and then 164 | informs parent of action for Pacman. Do not change or remove this 165 | method. 166 | """ 167 | action = QLearningAgent.getAction(self,state) 168 | self.doAction(state,action) 169 | return action 170 | 171 | 172 | 173 | class ApproximateQAgent(PacmanQAgent): 174 | pass 175 | -------------------------------------------------------------------------------- /week2/assignment/layout.py: -------------------------------------------------------------------------------- 1 | # layout.py 2 | # --------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | from util import manhattanDistance 10 | from game import Grid 11 | import os 12 | import random 13 | 14 | VISIBILITY_MATRIX_CACHE = {} 15 | 16 | class Layout: 17 | """ 18 | A Layout manages the static information about the game board. 19 | """ 20 | 21 | def __init__(self, layoutText): 22 | self.width = len(layoutText[0]) 23 | self.height= len(layoutText) 24 | self.walls = Grid(self.width, self.height, False) 25 | self.food = Grid(self.width, self.height, False) 26 | self.capsules = [] 27 | self.agentPositions = [] 28 | self.numGhosts = 0 29 | self.processLayoutText(layoutText) 30 | self.layoutText = layoutText 31 | # self.initializeVisibilityMatrix() 32 | 33 | def getNumGhosts(self): 34 | return self.numGhosts 35 | 36 | def initializeVisibilityMatrix(self): 37 | global VISIBILITY_MATRIX_CACHE 38 | if reduce(str.__add__, self.layoutText) not in VISIBILITY_MATRIX_CACHE: 39 | from game import Directions 40 | vecs = [(-0.5,0), (0.5,0),(0,-0.5),(0,0.5)] 41 | dirs = [Directions.NORTH, Directions.SOUTH, Directions.WEST, Directions.EAST] 42 | vis = Grid(self.width, self.height, {Directions.NORTH:set(), Directions.SOUTH:set(), Directions.EAST:set(), Directions.WEST:set(), Directions.STOP:set()}) 43 | for x in range(self.width): 44 | for y in range(self.height): 45 | if self.walls[x][y] == False: 46 | for vec, direction in zip(vecs, dirs): 47 | dx, dy = vec 48 | nextx, nexty = x + dx, y + dy 49 | while (nextx + nexty) != int(nextx) + int(nexty) or not self.walls[int(nextx)][int(nexty)] : 50 | vis[x][y][direction].add((nextx, nexty)) 51 | nextx, nexty = x + dx, y + dy 52 | self.visibility = vis 53 | VISIBILITY_MATRIX_CACHE[reduce(str.__add__, self.layoutText)] = vis 54 | else: 55 | self.visibility = VISIBILITY_MATRIX_CACHE[reduce(str.__add__, self.layoutText)] 56 | 57 | def isWall(self, pos): 58 | x, col = pos 59 | return self.walls[x][col] 60 | 61 | def getRandomLegalPosition(self): 62 | x = random.choice(range(self.width)) 63 | y = random.choice(range(self.height)) 64 | while self.isWall( (x, y) ): 65 | x = random.choice(range(self.width)) 66 | y = random.choice(range(self.height)) 67 | return (x,y) 68 | 69 | def getRandomCorner(self): 70 | poses = [(1,1), (1, self.height - 2), (self.width - 2, 1), (self.width - 2, 
self.height - 2)] 71 | return random.choice(poses) 72 | 73 | def getFurthestCorner(self, pacPos): 74 | poses = [(1,1), (1, self.height - 2), (self.width - 2, 1), (self.width - 2, self.height - 2)] 75 | dist, pos = max([(manhattanDistance(p, pacPos), p) for p in poses]) 76 | return pos 77 | 78 | def isVisibleFrom(self, ghostPos, pacPos, pacDirection): 79 | row, col = [int(x) for x in pacPos] 80 | return ghostPos in self.visibility[row][col][pacDirection] 81 | 82 | def __str__(self): 83 | return "\n".join(self.layoutText) 84 | 85 | def deepCopy(self): 86 | return Layout(self.layoutText[:]) 87 | 88 | def processLayoutText(self, layoutText): 89 | """ 90 | Coordinates are flipped from the input format to the (x,y) convention here 91 | 92 | The shape of the maze. Each character 93 | represents a different type of object. 94 | % - Wall 95 | . - Food 96 | o - Capsule 97 | G - Ghost 98 | P - Pacman 99 | Other characters are ignored. 100 | """ 101 | maxY = self.height - 1 102 | for y in range(self.height): 103 | for x in range(self.width): 104 | layoutChar = layoutText[maxY - y][x] 105 | self.processLayoutChar(x, y, layoutChar) 106 | self.agentPositions.sort() 107 | self.agentPositions = [ ( i == 0, pos) for i, pos in self.agentPositions] 108 | 109 | def processLayoutChar(self, x, y, layoutChar): 110 | if layoutChar == '%': 111 | self.walls[x][y] = True 112 | elif layoutChar == '.': 113 | self.food[x][y] = True 114 | elif layoutChar == 'o': 115 | self.capsules.append((x, y)) 116 | elif layoutChar == 'P': 117 | self.agentPositions.append( (0, (x, y) ) ) 118 | elif layoutChar in ['G']: 119 | self.agentPositions.append( (1, (x, y) ) ) 120 | self.numGhosts += 1 121 | elif layoutChar in ['1', '2', '3', '4']: 122 | self.agentPositions.append( (int(layoutChar), (x,y))) 123 | self.numGhosts += 1 124 | def getLayout(name, back = 2): 125 | if name.endswith('.lay'): 126 | layout = tryToLoad('layouts/' + name) 127 | if layout == None: layout = tryToLoad(name) 128 | else: 129 | layout = tryToLoad('layouts/' + name + '.lay') 130 | if layout == None: layout = tryToLoad(name + '.lay') 131 | if layout == None and back >= 0: 132 | curdir = os.path.abspath('.') 133 | os.chdir('..') 134 | layout = getLayout(name, back -1) 135 | os.chdir(curdir) 136 | return layout 137 | 138 | def tryToLoad(fullname): 139 | if(not os.path.exists(fullname)): return None 140 | f = open(fullname) 141 | try: return Layout([line.strip() for line in f]) 142 | finally: f.close() -------------------------------------------------------------------------------- /week9/bayes.py: -------------------------------------------------------------------------------- 1 | """ 2 | A single-file module that makes your lasagne network into a bayesian neural net. 3 | Originally created by github.com/ferrine , rewritten by github.com/justheuristic for simplicity 4 | 5 | See example in the notebook 6 | """ 7 | 8 | import numpy as np 9 | 10 | from theano import tensor as T 11 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 12 | 13 | import lasagne 14 | from lasagne import init 15 | from lasagne.random import get_rng 16 | 17 | from functools import wraps 18 | 19 | __all__ = ['NormalApproximation','get_var_cost','bbpwrap'] 20 | 21 | 22 | 23 | class NormalApproximation(object): 24 | def __init__(self, mu=0, std=np.exp(-3),seed=None): 25 | """ 26 | Approximation that samples network weights from factorized normal distribution. 
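        In short (a restatement of what __call__ below does): each variational
        parameter tensor W is sampled with the reparameterization trick,
            W = mu + log1p(exp(rho)) * eps,    eps ~ N(0, 1),
        so the posterior std log1p(exp(rho)) stays positive, and the resulting
        KL(q||p) term is accumulated into layer._bbwrap_var_cost.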
27 | 28 | :param mu: prior mean for gaussian weights 29 | :param std: prior std for gaussian weights 30 | :param seed: random seed 31 | """ 32 | self.prior_mu = mu 33 | self.prior_std = std 34 | self.srng = RandomStreams(seed or get_rng().randint(1, 2147462579)) 35 | 36 | def log_normal(self,x, mean, std, eps=0.0): 37 | """computes log-proba of normal distribution""" 38 | std += eps 39 | return - 0.5 * np.log(2 * np.pi) - T.log(T.abs_(std)) - (x - mean) ** 2 / (2 * std ** 2) 40 | 41 | def log_prior(self, weights): 42 | """ 43 | Logarithm of prior probabilities for weights: 44 | log P(weights) aka log P(theta) 45 | """ 46 | return self.log_normal(weights, self.prior_mu, self.prior_std) 47 | 48 | def log_posterior_approx(self,weights, mean, rho): 49 | """ 50 | Logarithm of ELBO on posterior probabilities: 51 | log q(weights|learned mu and rho) aka log q(theta|x) 52 | """ 53 | std = T.log1p(T.exp(rho)) #rho to std 54 | return self.log_normal(weights, mean, std) 55 | 56 | def __call__(self, layer, spec, shape, name=None, **tags): 57 | # case when user uses default init specs 58 | assert tags.get('variational',False) == True, "Please declare param as variational to avoid confusion" 59 | 60 | if not isinstance(spec, dict): 61 | initial_rho = np.log(np.expm1(self.prior_std)) #std to rho 62 | assert np.isfinite(initial_rho),"too small std to initialize correctly. Please pass explicit"\ 63 | " initializer (dict with {'mu':mu_init, 'rho':rho_init})." 64 | spec = {'mu': spec,'rho':init.Constant(initial_rho)} 65 | 66 | 67 | mu_spec,rho_spec = spec['mu'],spec['rho'] 68 | 69 | rho = layer.add_param(rho_spec, shape,name=(name or 'unk')+'.rho', **tags) 70 | mean = layer.add_param(mu_spec, shape,name=(name or 'unk')+'.mu', **tags) 71 | 72 | #Reparameterization trick 73 | e = self.srng.normal(shape, std=1) 74 | W = mean + T.log1p(T.exp(rho)) * e 75 | 76 | #KL divergence KL(q,p) = E_(w~q(w|x)) [log q(w|x) - log P(w)] aka variational cost 77 | q_p = T.sum(self.log_posterior_approx(W, mean, rho) - self.log_prior(W)) 78 | 79 | #accumulate variational cost 80 | layer._bbwrap_var_cost += q_p 81 | return W 82 | 83 | 84 | 85 | def get_var_cost(layer_or_layers,treat_as_input=None): 86 | """ 87 | Returns total variational cost aka KL(q(theta|x)||p(theta)) for all layers in the network 88 | 89 | :param layer_or_layers: top layer(s) of your network, just like with lasagne.layers.get_output 90 | :param treat_as_input: don't accumulate over layers below these layers. 
See same param for lasagne.layers.get_all_layers 91 | 92 | Alternatively, one can manually get weights for one layer via layer.get_var_cost() 93 | """ 94 | cost = 0 95 | for layer in lasagne.layers.get_all_layers(layer_or_layers,treat_as_input): 96 | if hasattr(layer, 'get_var_cost'): #if layer is bayesian or pretends so 97 | cost += layer.get_var_cost() 98 | return cost 99 | 100 | def bbpwrap(approximation=NormalApproximation()): 101 | """ 102 | A decorator that makes arbitrary lasagne layer into a bayesian network layer: 103 | BayesDenseLayer = bbwrap()(DenseLayer) 104 | or more verbosely, 105 | @bbpwrap(NormalApproximation(pstd=0.01)) 106 | BayesDenseLayer(DenseLayer): 107 | pass 108 | 109 | """ 110 | 111 | def decorator(cls): 112 | def add_param_wrap(add_param): 113 | @wraps(add_param) 114 | def wrapped(self, spec, shape, name=None, **tags): 115 | # we should take care about some user specification 116 | # to avoid bbp hook just set tags['variational'] = True 117 | if not tags.get('trainable', True) or tags.get('variational', False): 118 | return add_param(self, spec, shape, name, **tags) 119 | else: 120 | # we declare that params we add next 121 | # are the ones we need to fit the distribution 122 | # they don't need to be regularized, strictly 123 | tags['variational'] = True 124 | tags['regularizable'] = False 125 | param = self.approximation(self, spec, shape, name, **tags) 126 | return param 127 | return wrapped 128 | 129 | def get_var_cost(self): 130 | """ 131 | Returns total variational cost aka KL(q(theta|x)||p(theta)) for this layer. 132 | Alternatively, use function get_var_cost(layer) to get total cost for all layers below this one. 133 | """ 134 | return self._bbwrap_var_cost 135 | 136 | 137 | cls.approximation = approximation 138 | cls._bbwrap_var_cost=0 139 | cls.add_param = add_param_wrap(cls.add_param) 140 | cls.get_var_cost = get_var_cost 141 | return cls 142 | 143 | 144 | return decorator 145 | -------------------------------------------------------------------------------- /week2/assignment/learningAgents.py: -------------------------------------------------------------------------------- 1 | # learningAgents.py 2 | # ----------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | from game import Directions, Agent, Actions 10 | 11 | import random,util,time 12 | 13 | class ValueEstimationAgent(Agent): 14 | """ 15 | Abstract agent which assigns values to (state,action) 16 | Q-Values for an environment. As well as a value to a 17 | state and a policy given respectively by, 18 | 19 | V(s) = max_{a in actions} Q(s,a) 20 | policy(s) = arg_max_{a in actions} Q(s,a) 21 | 22 | Both ValueIterationAgent and QLearningAgent inherit 23 | from this agent. While a ValueIterationAgent has 24 | a model of the environment via a MarkovDecisionProcess 25 | (see mdp.py) that is used to estimate Q-Values before 26 | ever actually acting, the QLearningAgent estimates 27 | Q-Values while acting in the environment. 28 | """ 29 | 30 | def __init__(self, alpha=1.0, epsilon=0.05, gamma=0.8, numTraining = 10): 31 | """ 32 | Sets options, which can be passed in via the Pacman command line using -a alpha=0.5,... 
33 | alpha - learning rate 34 | epsilon - exploration rate 35 | gamma - discount factor 36 | numTraining - number of training episodes, i.e. no learning after these many episodes 37 | """ 38 | self.alpha = float(alpha) 39 | self.epsilon = float(epsilon) 40 | self.discount = float(gamma) 41 | self.numTraining = int(numTraining) 42 | 43 | #################################### 44 | # Override These Functions # 45 | #################################### 46 | def getQValue(self, state, action): 47 | """ 48 | Should return Q(state,action) 49 | """ 50 | util.raiseNotDefined() 51 | 52 | def getValue(self, state): 53 | """ 54 | What is the value of this state under the best action? 55 | Concretely, this is given by 56 | 57 | V(s) = max_{a in actions} Q(s,a) 58 | """ 59 | util.raiseNotDefined() 60 | 61 | def getPolicy(self, state): 62 | """ 63 | What is the best action to take in the state. Note that because 64 | we might want to explore, this might not coincide with getAction 65 | Concretely, this is given by 66 | 67 | policy(s) = arg_max_{a in actions} Q(s,a) 68 | 69 | If many actions achieve the maximal Q-value, 70 | it doesn't matter which is selected. 71 | """ 72 | util.raiseNotDefined() 73 | 74 | def getAction(self, state): 75 | """ 76 | state: can call state.getLegalActions() 77 | Choose an action and return it. 78 | """ 79 | util.raiseNotDefined() 80 | 81 | class ReinforcementAgent(ValueEstimationAgent): 82 | """ 83 | Abstract Reinforcemnt Agent: A ValueEstimationAgent 84 | which estimates Q-Values (as well as policies) from experience 85 | rather than a model 86 | 87 | What you need to know: 88 | - The environment will call 89 | observeTransition(state,action,nextState,deltaReward), 90 | which will call update(state, action, nextState, deltaReward) 91 | which you should override. 92 | - Use self.getLegalActions(state) to know which actions 93 | are available in a state 94 | """ 95 | #################################### 96 | # Override These Functions # 97 | #################################### 98 | 99 | def update(self, state, action, nextState, reward): 100 | """ 101 | This class will call this function, which you write, after 102 | observing a transition and reward 103 | """ 104 | util.raiseNotDefined() 105 | 106 | #################################### 107 | # Read These Functions # 108 | #################################### 109 | 110 | def getLegalActions(self,state): 111 | """ 112 | Get the actions available for a given 113 | state. This is what you should use to 114 | obtain legal actions for a state 115 | """ 116 | return self.actionFn(state) 117 | 118 | def observeTransition(self, state,action,nextState,deltaReward): 119 | """ 120 | Called by environment to inform agent that a transition has 121 | been observed. 
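        (For the tabular agent in qlearningAgents.py, update() is expected to
        apply the standard Q-learning rule -- shown here only as a sketch:
        Q(s,a) <- (1 - alpha) * Q(s,a) + alpha * (deltaReward + gamma * max_a' Q(nextState, a')).)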
This will result in a call to self.update 122 | on the same arguments 123 | 124 | NOTE: Do *not* override or call this function 125 | """ 126 | self.episodeRewards += deltaReward 127 | self.update(state,action,nextState,deltaReward) 128 | 129 | def startEpisode(self): 130 | """ 131 | Called by environment when new episode is starting 132 | """ 133 | self.lastState = None 134 | self.lastAction = None 135 | self.episodeRewards = 0.0 136 | 137 | def stopEpisode(self): 138 | """ 139 | Called by environment when episode is done 140 | """ 141 | if self.episodesSoFar < self.numTraining: 142 | self.accumTrainRewards += self.episodeRewards 143 | else: 144 | self.accumTestRewards += self.episodeRewards 145 | self.episodesSoFar += 1 146 | if self.episodesSoFar >= self.numTraining: 147 | # Take off the training wheels 148 | self.epsilon = 0.0 # no exploration 149 | self.alpha = 0.0 # no learning 150 | 151 | def isInTraining(self): 152 | return self.episodesSoFar < self.numTraining 153 | 154 | def isInTesting(self): 155 | return not self.isInTraining() 156 | 157 | def __init__(self, actionFn = None, numTraining=100, epsilon=0.5, alpha=0.5, gamma=1): 158 | """ 159 | actionFn: Function which takes a state and returns the list of legal actions 160 | 161 | alpha - learning rate 162 | epsilon - exploration rate 163 | gamma - discount factor 164 | numTraining - number of training episodes, i.e. no learning after these many episodes 165 | """ 166 | if actionFn == None: 167 | actionFn = lambda state: state.getLegalActions() 168 | self.actionFn = actionFn 169 | self.episodesSoFar = 0 170 | self.accumTrainRewards = 0.0 171 | self.accumTestRewards = 0.0 172 | self.numTraining = int(numTraining) 173 | self.epsilon = float(epsilon) 174 | self.alpha = float(alpha) 175 | self.discount = float(gamma) 176 | 177 | ################################ 178 | # Controls needed for Crawler # 179 | ################################ 180 | def setEpsilon(self, epsilon): 181 | self.epsilon = epsilon 182 | 183 | def setLearningRate(self, alpha): 184 | self.alpha = alpha 185 | 186 | def setDiscount(self, discount): 187 | self.discount = discount 188 | 189 | def doAction(self,state,action): 190 | """ 191 | Called by inherited class when 192 | an action is taken in a state 193 | """ 194 | self.lastState = state 195 | self.lastAction = action 196 | 197 | ################### 198 | # Pacman Specific # 199 | ################### 200 | def observationFunction(self, state): 201 | """ 202 | This is where we ended up after our last action. 
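        The per-step reward passed on to observeTransition is the change in
        game score, reward = state.getScore() - self.lastState.getScore().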
203 | The simulation should somehow ensure this is called 204 | """ 205 | if not self.lastState is None: 206 | reward = state.getScore() - self.lastState.getScore() 207 | self.observeTransition(self.lastState, self.lastAction, state, reward) 208 | return state 209 | 210 | def registerInitialState(self, state): 211 | self.startEpisode() 212 | if self.episodesSoFar == 0: 213 | print 'Beginning %d episodes of Training' % (self.numTraining) 214 | 215 | def final(self, state): 216 | """ 217 | Called by Pacman game at the terminal state 218 | """ 219 | deltaReward = state.getScore() - self.lastState.getScore() 220 | self.observeTransition(self.lastState, self.lastAction, state, deltaReward) 221 | self.stopEpisode() 222 | 223 | # Make sure we have this var 224 | if not 'episodeStartTime' in self.__dict__: 225 | self.episodeStartTime = time.time() 226 | if not 'lastWindowAccumRewards' in self.__dict__: 227 | self.lastWindowAccumRewards = 0.0 228 | self.lastWindowAccumRewards += state.getScore() 229 | 230 | NUM_EPS_UPDATE = 100 231 | if self.episodesSoFar % NUM_EPS_UPDATE == 0: 232 | print 'Reinforcement Learning Status:' 233 | windowAvg = self.lastWindowAccumRewards / float(NUM_EPS_UPDATE) 234 | if self.episodesSoFar <= self.numTraining: 235 | trainAvg = self.accumTrainRewards / float(self.episodesSoFar) 236 | print '\tCompleted %d out of %d training episodes' % ( 237 | self.episodesSoFar,self.numTraining) 238 | print '\tAverage Rewards over all training: %.2f' % ( 239 | trainAvg) 240 | else: 241 | testAvg = float(self.accumTestRewards) / (self.episodesSoFar - self.numTraining) 242 | print '\tCompleted %d test episodes' % (self.episodesSoFar - self.numTraining) 243 | print '\tAverage Rewards over testing: %.2f' % testAvg 244 | print '\tAverage Rewards for last %d episodes: %.2f' % ( 245 | NUM_EPS_UPDATE,windowAvg) 246 | print '\tEpisode took %.2f seconds' % (time.time() - self.episodeStartTime) 247 | self.lastWindowAccumRewards = 0.0 248 | self.episodeStartTime = time.time() 249 | 250 | if self.episodesSoFar == self.numTraining: 251 | msg = 'Training Done (turning off epsilon and alpha)' 252 | print '%s\n%s' % (msg,'-' * len(msg)) 253 | -------------------------------------------------------------------------------- /week8/8.2_bonus.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Week8 bonus descriptions\n", 8 | "\n", 9 | "Here are some cool mini-projects you can try to dive deeper into the topic.\n", 10 | "\n", 11 | "## More metrics: BLEU (5+ pts)\n", 12 | "\n", 13 | "Pick BLEU or any other relevant metric, e.g. BLEU (e.g. from `nltk.bleu_score`).\n", 14 | "* Train model to maximize BLEU directly\n", 15 | "* How does levenshtein behave when maximizing BLEU and vice versa?\n", 16 | "* Compare this with how they behave when optimizing likelihood. \n", 17 | "\n", 18 | "(use default parameters for bleu: 4-gram, uniform weights)\n", 19 | "\n", 20 | "## Actor-critic (5+++ pts)\n", 21 | "\n", 22 | "While self-critical training provides a large reduction of gradient variance, it has a few drawbacks:\n", 23 | "- It requires a lot of additional computation during training\n", 24 | "- It doesn't adjust V(s) between decoder steps. 
(one value per sequence)\n", 25 | "\n", 26 | "There's a more general way of doing the same thing: learned baselines, also known as __advantage actor-critic__.\n", 27 | "\n", 28 | "There are two main ways to apply that:\n", 29 | "- __naive way__: compute V(s) once per training example.\n", 30 | " - This only requires additional 1-unit linear dense layer that grows out of encoder, estimating V(s)\n", 31 | " - (implement this to get main points)\n", 32 | "- __every step__: compute V(s) on each decoder step\n", 33 | " - Again it's just an 1-unit dense layer (no nonlinearity), but this time it's inside decoder recurrence.\n", 34 | " - (+3 pts additional for this guy)\n", 35 | "\n", 36 | "In both cases, you should train V(s) to minimize squared error $(V(s) - R(s,a))^2$ with R being actual levenshtein.\n", 37 | "You can then use $ A(s,a) = (R(s,a) - const(V(s))) $ for policy gradient.\n", 38 | "\n", 39 | "There's also one particularly interesting approach (+5 additional pts):\n", 40 | "- __combining SCST and actor-critic__:\n", 41 | " - compute baseline $V(s)$ via self-critical sequence training (just like in main assignment)\n", 42 | " - learn correction $ C(s,a_{:t}) = R(s,a) - V(s) $ by minimizing $(R(s,a) - V(s) - C(s,a_{:t}))^2 $\n", 43 | " - use $ A(s,a_{:t}) = R(s,a) - V(s) - const(C(s,a_{:t})) $\n", 44 | "\n", 45 | "\n", 46 | "\n", 47 | "## Implement attention (5+++ pts)\n", 48 | "\n", 49 | "Some seq2seq tasks can benefit from the attention mechanism. In addition to taking the _last_ time-step of encoder hidden state, we can allow decoder to peek on any time-step of his choice.\n", 50 | "\n", 51 | "![img](https://s30.postimg.org/f8um3kt5d/google_seq2seq_attention.gif)\n", 52 | "\n", 53 | "\n", 54 | "#### Recommended steps:\n", 55 | "__1)__ Modify encoder-decoder\n", 56 | "\n", 57 | "Learn to feed the entire encoder into the decoder. You can do so by sending encoder rnn layer directly into decoder (make sure there's no `only_return_final=True` for encoder rnn layer).\n", 58 | "\n", 59 | "```\n", 60 | "class decoder:\n", 61 | " ...\n", 62 | " encoder_rnn_input = InputLayer(encoder.rnn.output_shape, name='encoder rnn input for decoder')\n", 63 | " ...\n", 64 | " \n", 65 | "#decoder Recurrence\n", 66 | "rec = Recurrence(...,\n", 67 | " input_nonsequences = {decoder.encoder_rnn_input: encoder.rnn},\n", 68 | " )\n", 69 | "\n", 70 | "```\n", 71 | "\n", 72 | "For starters, you can take it's last tick (via SliceLayer) inside the decoder step and feed it as input to make sure it works.\n", 73 | "\n", 74 | "__2)__ Implement attention mechanism\n", 75 | "\n", 76 | "Next thing we'll need is to implement the math of attention.\n", 77 | "\n", 78 | "The simplest way to do so is to write a special layer. We gave you a prototype and some tests below.\n", 79 | "\n", 80 | "__3)__ Use attention inside decoder\n", 81 | "\n", 82 | "That's almost it! 
Now use `AttentionLayer` inside the decoder and feed it to back to lstm/gru/rnn (see code demo below).\n", 83 | "\n", 84 | "Train the full network just like you did before attention.\n", 85 | "\n", 86 | "__More points__ will be awwarded for comparing learning results of attention Vs no attention.\n", 87 | "\n", 88 | "__Bonus bonus:__ visualize attention vectors (>= +3 points)\n", 89 | "\n", 90 | "The best way to make sure your attention actually works is to visualize it.\n", 91 | "\n", 92 | "A simple way to do so is to obtain attention vectors from each tick (values __right after softmax__, not the layer outputs) and drawing those as images.\n", 93 | "\n", 94 | "#### step-by-step guide:\n", 95 | "- split AttentionLayer into two layers: _\"from start to softmax\"_ and _\"from softmax to output\"_\n", 96 | "- add outputs of the first layer to recurrence's `tracked_outputs`\n", 97 | "- compile a function that computes them\n", 98 | "- plt.imshow(them)\n", 99 | "\n", 100 | "\n" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "collapsed": true 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "import numpy as np\n", 112 | "import theano,lasagne\n", 113 | "import theano.tensor as T\n", 114 | "from lasagne import init\n", 115 | "from lasagne.layers import *" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "class AttentionLayer(MergeLayer):\n", 125 | " def __init__(self,decoder_h,encoder_rnn):\n", 126 | " #sanity checks\n", 127 | " assert len(decoder_h.output_shape)==2,\"please feed decoder 1 step activation as first param \"\n", 128 | " assert len(encoder_rnn.output_shape)==3, \"please feed full encoder rnn sequence as second param\"\n", 129 | " \n", 130 | " self.decoder_num_units = decoder_h.output_shape[-1]\n", 131 | " self.encoder_num_units = encoder.output_shape[-1]\n", 132 | "\n", 133 | " #Here you should initialize all trainable parameters.\n", 134 | " #\n", 135 | " \n", 136 | " #use this syntax:\n", 137 | " self.add_param(spec=init.Normal(std=0.01), #or other initializer\n", 138 | " shape=,\n", 139 | " name='')\n", 140 | " \n", 141 | " \n", 142 | " MergeLayer.__init__(self,[decoder_h,encoder_rnn],name=\"attention\")\n", 143 | " \n", 144 | " \n", 145 | " def get_output_shape_for(self,input_shapes,**kwargs):\n", 146 | " \"\"\"return matrix of shape [batch_size, encoder num units]\"\"\"\n", 147 | " return (None,self.encoder_num_units)\n", 148 | " \n", 149 | " def get_output_for(self,inputs,**kwargs):\n", 150 | " \"\"\"\n", 151 | " takes (decoder_h, encoder_seq)\n", 152 | " decoder_h has shape [batch_size, decoder num_units]\n", 153 | " encoder_seq has shape [batch_size, sequence_length, encoder num_units]\n", 154 | " \n", 155 | " returns attention output: matrix of shape [batch_size, encoder num units]\n", 156 | " \n", 157 | " please read comments carefully before you start implementing\n", 158 | " \"\"\"\n", 159 | " decoder_h,encoder_seq = inputs\n", 160 | " \n", 161 | " #get symbolic batch-size / seq length. Also don't forget self.decoder_num_units above\n", 162 | " batch_size,seq_length,_ = tuple(encoder_seq.shape)\n", 163 | " \n", 164 | " #here's a recommended step-by-step guide for attention mechanism. 
\n", 165 | " #You are free to ignore it alltogether if you so wish\n", 166 | " \n", 167 | " #we repeat decoder activations to allign with encoder\n", 168 | " decoder_h_repeated = \n", 170 | " \n", 171 | " # ^--shape=[batch,seq_length,decoder_n_units]\n", 172 | " \n", 173 | " encoder_and_decoder_together = \n", 174 | " # ^--shape=[batch,seq_length,enc_n_units+dec_n_units]\n", 175 | " \n", 176 | " #here we flatten the tensor to simplify\n", 177 | " encoder_and_decoder_flat = T.reshape(encoder_and_decoder_together,(-1,encoder_and_decoder_together.shape[-1]))\n", 178 | " # ^--shape=[batch*seq_length,enc_n_units+dec_n_units]\n", 179 | " \n", 180 | " #here you use encoder_and_decoder_flat and some learned weights to predict attention logits\n", 181 | " #don't use softmax yet\n", 182 | " \n", 183 | " attention_logits_flat = \n", 184 | " # ^--shape=[batch*seq_length,1]\n", 185 | " \n", 186 | " \n", 187 | " #here we reshape flat logits back into correct form\n", 188 | " assert attention_logits_flat.ndim==2\n", 189 | " attention_logits = attention_logits_flat.reshape((batch_size,seq_length))\n", 190 | " # ^--shape=[batch,seq_length]\n", 191 | " \n", 192 | " #here we apply softmax :)\n", 193 | " attention = T.nnet.softmax(attention_logits)\n", 194 | " # ^--shape=[batch,seq_length]\n", 195 | " \n", 196 | " #here we compute output\n", 197 | " output = (attention[:,:,None]*encoder_seq).sum(axis=1) #sum over seq_length\n", 198 | " # ^--shape=[batch,enc_n_units]\n", 199 | " \n", 200 | " return output\n" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "#demo code\n", 210 | "\n", 211 | "from numpy.random import randn\n", 212 | "\n", 213 | "dec_h_prev = InputLayer((None,50),T.constant(randn(5,50)),name='decoder h mock')\n", 214 | "\n", 215 | "enc = InputLayer((None,None,32),T.constant(randn(5,20,32)),name='encoder sequence mock')\n", 216 | "\n", 217 | "attention = AttentionLayer(dec_h_prev,enc)\n", 218 | "\n", 219 | "#now you can use attention as additonal input to your decoder\n", 220 | "#LSTMCell(prev_cell,prev_out,input_or_inputs=(usual_input,attention))\n", 221 | "\n", 222 | "\n", 223 | "#sanity check\n", 224 | "demo_output = get_output(attention).eval()\n", 225 | "print 'actual shape:',demo_output.shape\n", 226 | "assert demo_output.shape == (5,32)\n", 227 | "assert np.isfinite(demo_output)\n", 228 | "\n" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": { 235 | "collapsed": true 236 | }, 237 | "outputs": [], 238 | "source": [] 239 | } 240 | ], 241 | "metadata": { 242 | "kernelspec": { 243 | "display_name": "Python 2", 244 | "language": "python", 245 | "name": "python2" 246 | }, 247 | "language_info": { 248 | "codemirror_mode": { 249 | "name": "ipython", 250 | "version": 2 251 | }, 252 | "file_extension": ".py", 253 | "mimetype": "text/x-python", 254 | "name": "python", 255 | "nbconvert_exporter": "python", 256 | "pygments_lexer": "ipython2", 257 | "version": "2.7.13" 258 | } 259 | }, 260 | "nbformat": 4, 261 | "nbformat_minor": 2 262 | } 263 | -------------------------------------------------------------------------------- /week4/Seminar4.0_recap_approx_qlearning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Approximate q-learning\n", 8 | "\n", 9 | "In this notebook you will teach a lasagne neural network to do 
Q-learning." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "__Frameworks__ - we'll accept this homework in any deep learning framework. For example, it translates to TensorFlow almost line-to-line. However, we recommend you to stick to theano/lasagne unless you're certain about your skills in the framework of your choice." 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": { 23 | "collapsed": false 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "%env THEANO_FLAGS='floatX=float32'\n", 28 | "import os\n", 29 | "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\"))==0:\n", 30 | " !bash ../xvfb start\n", 31 | " %env DISPLAY=:1" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "import gym\n", 43 | "import numpy as np, pandas as pd\n", 44 | "import matplotlib.pyplot as plt\n", 45 | "%matplotlib inline" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": { 52 | "collapsed": false, 53 | "scrolled": false 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "env = gym.make(\"CartPole-v0\")\n", 58 | "env.reset()\n", 59 | "n_actions = env.action_space.n\n", 60 | "state_dim = env.observation_space.shape\n", 61 | "\n", 62 | "plt.imshow(env.render(\"rgb_array\"))" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "# Approximate (deep) Q-learning: building the network\n", 70 | "\n", 71 | "In this section we will build and train naive Q-learning with theano/lasagne" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "First step is initializing input variables" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": { 85 | "collapsed": false 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "import theano\n", 90 | "import theano.tensor as T\n", 91 | "\n", 92 | "#create input variables. 
We'll support multiple states at once\n", 93 | "\n", 94 | "\n", 95 | "current_states = T.matrix(\"states[batch,units]\")\n", 96 | "actions = T.ivector(\"action_ids[batch]\")\n", 97 | "rewards = T.vector(\"rewards[batch]\")\n", 98 | "next_states = T.matrix(\"next states[batch,units]\")\n", 99 | "is_end = T.ivector(\"vector[batch] where 1 means that session just ended\")" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": { 106 | "collapsed": false 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "import lasagne\n", 111 | "from lasagne.layers import *\n", 112 | "\n", 113 | "#input layer\n", 114 | "l_states = InputLayer((None,)+state_dim)\n", 115 | "\n", 116 | "\n", 117 | "\n", 118 | "\n", 119 | "\n", 120 | "#output layer\n", 121 | "l_qvalues = DenseLayer(,num_units=n_actions,nonlinearity=None)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "#### Predicting Q-values for `current_states`" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "collapsed": false 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "#get q-values for ALL actions in current_states\n", 140 | "predicted_qvalues = get_output(l_qvalues,{l_states:current_states})" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": { 147 | "collapsed": true 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "#compiling agent's \"GetQValues\" function\n", 152 | "get_qvalues = " 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": { 159 | "collapsed": true 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "#select q-values for chosen actions\n", 164 | "predicted_qvalues_for_actions = predicted_qvalues[T.arange(actions.shape[0]),actions]" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "#### Loss function and `update`\n", 172 | "Here we write a function similar to `agent.update`." 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": { 179 | "collapsed": true 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "#predict q-values for next states\n", 184 | "predicted_next_qvalues = get_output(l_qvalues,{l_states:})\n", 185 | "\n", 186 | "\n", 187 | "#Computing target q-values under \n", 188 | "gamma = 0.99\n", 189 | "target_qvalues_for_actions = \n", 190 | "\n", 191 | "#zero-out q-values at the end\n", 192 | "target_qvalues_for_actions = (1-is_end)*target_qvalues_for_actions\n", 193 | "\n", 194 | "#don't compute gradient over target q-values (consider constant)\n", 195 | "target_qvalues_for_actions = theano.gradient.disconnected_grad(target_qvalues_for_actions)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": { 202 | "collapsed": false 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "\n", 207 | "#mean squared error loss function\n", 208 | "loss = \n" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": { 215 | "collapsed": true 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "#all network weights\n", 220 | "all_weights = get_all_params(l_qvalues,trainable=True)\n", 221 | "\n", 222 | "#network updates. 
Note the small learning rate (for stability)\n", 223 | "updates = lasagne.updates.sgd(loss,all_weights,learning_rate=1e-4)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": { 230 | "collapsed": false 231 | }, 232 | "outputs": [], 233 | "source": [ 234 | "#Training function that resembles agent.update(state,action,reward,next_state) \n", 235 | "#with 1 more argument meaning is_end\n", 236 | "train_step = theano.function([current_states,actions,rewards,next_states,is_end],\n", 237 | " updates=updates)" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "### Playing the game" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": { 251 | "collapsed": false 252 | }, 253 | "outputs": [], 254 | "source": [ 255 | "epsilon = 0.25 #initial epsilon\n", 256 | "\n", 257 | "def generate_session(t_max=1000):\n", 258 | " \"\"\"play env with approximate q-learning agent and train it at the same time\"\"\"\n", 259 | " \n", 260 | " total_reward = 0\n", 261 | " s = env.reset()\n", 262 | " \n", 263 | " for t in range(t_max):\n", 264 | " \n", 265 | " #get action q-values from the network\n", 266 | " q_values = get_qvalues([s])[0] \n", 267 | " \n", 268 | " a = \n", 269 | " \n", 270 | " new_s,r,done,info = env.step(a)\n", 271 | " \n", 272 | " #train agent one step. Note that we use one-element arrays instead of scalars \n", 273 | " #because that's what function accepts.\n", 274 | " train_step([s],[a],[r],[new_s],[done])\n", 275 | " \n", 276 | " total_reward+=r\n", 277 | " \n", 278 | " s = new_s\n", 279 | " if done: break\n", 280 | " \n", 281 | " return total_reward\n", 282 | " " 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": { 289 | "collapsed": false 290 | }, 291 | "outputs": [], 292 | "source": [ 293 | "for i in range(100):\n", 294 | " \n", 295 | " rewards = [generate_session() for _ in range(100)] #generate new sessions\n", 296 | " \n", 297 | " epsilon*=0.95\n", 298 | " \n", 299 | " print (\"mean reward:%.3f\\tepsilon:%.5f\"%(np.mean(rewards),epsilon))\n", 300 | "\n", 301 | " if np.mean(rewards) > 300:\n", 302 | " print (\"You Win!\")\n", 303 | " break\n", 304 | " \n", 305 | " assert epsilon!=0, \"Please explore environment\"" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "### Video" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": { 319 | "collapsed": true 320 | }, 321 | "outputs": [], 322 | "source": [ 323 | "epsilon=0 #Don't forget to reset epsilon back to initial value if you want to go on training" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": { 330 | "collapsed": false 331 | }, 332 | "outputs": [], 333 | "source": [ 334 | "#record sessions\n", 335 | "import gym.wrappers\n", 336 | "env = gym.wrappers.Monitor(env,directory=\"videos\",force=True)\n", 337 | "sessions = [generate_session() for _ in range(100)]\n", 338 | "env.close()\n", 339 | "#unwrap \n", 340 | "env = env.env.env\n", 341 | "#upload to gym\n", 342 | "#gym.upload(\"./videos/\",api_key=\"\") #you'll need me later\n", 343 | "\n", 344 | "#Warning! 
If you keep seeing error that reads something like\"DoubleWrapError\",\n", 345 | "#run env=gym.make(\"CartPole-v0\");env.reset();" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": { 352 | "collapsed": false 353 | }, 354 | "outputs": [], 355 | "source": [ 356 | "#show video\n", 357 | "from IPython.display import HTML\n", 358 | "import os\n", 359 | "\n", 360 | "video_names = list(filter(lambda s:s.endswith(\".mp4\"),os.listdir(\"./videos/\")))\n", 361 | "\n", 362 | "HTML(\"\"\"\n", 363 | "\n", 366 | "\"\"\".format(\"./videos/\"+video_names[-1])) #this may or may not be _last_ video. Try other indices" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": { 373 | "collapsed": true 374 | }, 375 | "outputs": [], 376 | "source": [] 377 | } 378 | ], 379 | "metadata": { 380 | "kernelspec": { 381 | "display_name": "Python 3", 382 | "language": "python", 383 | "name": "python3" 384 | }, 385 | "language_info": { 386 | "codemirror_mode": { 387 | "name": "ipython", 388 | "version": 3 389 | }, 390 | "file_extension": ".py", 391 | "mimetype": "text/x-python", 392 | "name": "python", 393 | "nbconvert_exporter": "python", 394 | "pygments_lexer": "ipython3", 395 | "version": "3.6.0" 396 | } 397 | }, 398 | "nbformat": 4, 399 | "nbformat_minor": 0 400 | } 401 | -------------------------------------------------------------------------------- /week2/assignment/graphicsCrawlerDisplay.py: -------------------------------------------------------------------------------- 1 | # graphicsCrawlerDisplay.py 2 | # ------------------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 
7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | import Tkinter 10 | import qlearningAgents 11 | import time 12 | import threading 13 | import sys 14 | import crawler 15 | #import pendulum 16 | import math 17 | from math import pi as PI 18 | 19 | robotType = 'crawler' 20 | 21 | class Application: 22 | 23 | def sigmoid(self, x): 24 | return 1.0 / (1.0 + 2.0 ** (-x)) 25 | 26 | def incrementSpeed(self, inc): 27 | self.tickTime *= inc 28 | # self.epsilon = min(1.0, self.epsilon) 29 | # self.epsilon = max(0.0,self.epsilon) 30 | # self.learner.setSpeed(self.epsilon) 31 | self.speed_label['text'] = 'Step Delay: %.5f' % (self.tickTime) 32 | 33 | def incrementEpsilon(self, inc): 34 | self.ep += inc 35 | self.epsilon = self.sigmoid(self.ep) 36 | self.learner.setEpsilon(self.epsilon) 37 | self.epsilon_label['text'] = 'Epsilon: %.3f' % (self.epsilon) 38 | 39 | def incrementGamma(self, inc): 40 | self.ga += inc 41 | self.gamma = self.sigmoid(self.ga) 42 | self.learner.setDiscount(self.gamma) 43 | self.gamma_label['text'] = 'Discount: %.3f' % (self.gamma) 44 | 45 | def incrementAlpha(self, inc): 46 | self.al += inc 47 | self.alpha = self.sigmoid(self.al) 48 | self.learner.setLearningRate(self.alpha) 49 | self.alpha_label['text'] = 'Learning Rate: %.3f' % (self.alpha) 50 | 51 | def __initGUI(self, win): 52 | ## Window ## 53 | self.win = win 54 | 55 | ## Initialize Frame ## 56 | win.grid() 57 | self.dec = -.5 58 | self.inc = .5 59 | self.tickTime = 0.1 60 | 61 | ## Epsilon Button + Label ## 62 | self.setupSpeedButtonAndLabel(win) 63 | 64 | self.setupEpsilonButtonAndLabel(win) 65 | 66 | ## Gamma Button + Label ## 67 | self.setUpGammaButtonAndLabel(win) 68 | 69 | ## Alpha Button + Label ## 70 | self.setupAlphaButtonAndLabel(win) 71 | 72 | ## Exit Button ## 73 | #self.exit_button = Tkinter.Button(win,text='Quit', command=self.exit) 74 | #self.exit_button.grid(row=0, column=9) 75 | 76 | ## Simulation Buttons ## 77 | # self.setupSimulationButtons(win) 78 | 79 | ## Canvas ## 80 | self.canvas = Tkinter.Canvas(root, height=200, width=1000) 81 | self.canvas.grid(row=2,columnspan=10) 82 | 83 | def setupAlphaButtonAndLabel(self, win): 84 | self.alpha_minus = Tkinter.Button(win, 85 | text="-",command=(lambda: self.incrementAlpha(self.dec))) 86 | self.alpha_minus.grid(row=1, column=3, padx=10) 87 | 88 | self.alpha = self.sigmoid(self.al) 89 | self.alpha_label = Tkinter.Label(win, text='Learning Rate: %.3f' % (self.alpha)) 90 | self.alpha_label.grid(row=1, column=4) 91 | 92 | self.alpha_plus = Tkinter.Button(win, 93 | text="+",command=(lambda: self.incrementAlpha(self.inc))) 94 | self.alpha_plus.grid(row=1, column=5, padx=10) 95 | 96 | def setUpGammaButtonAndLabel(self, win): 97 | self.gamma_minus = Tkinter.Button(win, 98 | text="-",command=(lambda: self.incrementGamma(self.dec))) 99 | self.gamma_minus.grid(row=1, column=0, padx=10) 100 | 101 | self.gamma = self.sigmoid(self.ga) 102 | self.gamma_label = Tkinter.Label(win, text='Discount: %.3f' % (self.gamma)) 103 | self.gamma_label.grid(row=1, column=1) 104 | 105 | self.gamma_plus = Tkinter.Button(win, 106 | text="+",command=(lambda: self.incrementGamma(self.inc))) 107 | self.gamma_plus.grid(row=1, column=2, padx=10) 108 | 109 | def setupEpsilonButtonAndLabel(self, win): 110 | self.epsilon_minus = Tkinter.Button(win, 111 | text="-",command=(lambda: self.incrementEpsilon(self.dec))) 112 | self.epsilon_minus.grid(row=0, column=3) 113 | 114 | self.epsilon = self.sigmoid(self.ep) 115 | self.epsilon_label = Tkinter.Label(win, 
text='Epsilon: %.3f' % (self.epsilon)) 116 | self.epsilon_label.grid(row=0, column=4) 117 | 118 | self.epsilon_plus = Tkinter.Button(win, 119 | text="+",command=(lambda: self.incrementEpsilon(self.inc))) 120 | self.epsilon_plus.grid(row=0, column=5) 121 | 122 | def setupSpeedButtonAndLabel(self, win): 123 | self.speed_minus = Tkinter.Button(win, 124 | text="-",command=(lambda: self.incrementSpeed(.5))) 125 | self.speed_minus.grid(row=0, column=0) 126 | 127 | self.speed_label = Tkinter.Label(win, text='Step Delay: %.5f' % (self.tickTime)) 128 | self.speed_label.grid(row=0, column=1) 129 | 130 | self.speed_plus = Tkinter.Button(win, 131 | text="+",command=(lambda: self.incrementSpeed(2))) 132 | self.speed_plus.grid(row=0, column=2) 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | def skip5kSteps(self): 141 | self.stepsToSkip = 5000 142 | 143 | def __init__(self, win): 144 | 145 | self.ep = 0 146 | self.ga = 2 147 | self.al = 2 148 | self.stepCount = 0 149 | ## Init Gui 150 | 151 | self.__initGUI(win) 152 | 153 | # Init environment 154 | if robotType == 'crawler': 155 | self.robot = crawler.CrawlingRobot(self.canvas) 156 | self.robotEnvironment = crawler.CrawlingRobotEnvironment(self.robot) 157 | elif robotType == 'pendulum': 158 | self.robot = pendulum.PendulumRobot(self.canvas) 159 | self.robotEnvironment = \ 160 | pendulum.PendulumRobotEnvironment(self.robot) 161 | else: 162 | raise "Unknown RobotType" 163 | 164 | # Init Agent 165 | simulationFn = lambda agent: \ 166 | simulation.SimulationEnvironment(self.robotEnvironment,agent) 167 | actionFn = lambda state: \ 168 | self.robotEnvironment.getPossibleActions(state) 169 | self.learner = qlearningAgents.QLearningAgent(actionFn=actionFn) 170 | 171 | self.learner.setEpsilon(self.epsilon) 172 | self.learner.setLearningRate(self.alpha) 173 | self.learner.setDiscount(self.gamma) 174 | 175 | # Start GUI 176 | self.running = True 177 | self.stopped = False 178 | self.stepsToSkip = 0 179 | self.thread = threading.Thread(target=self.run) 180 | self.thread.start() 181 | 182 | 183 | def exit(self): 184 | self.running = False 185 | for i in range(5): 186 | if not self.stopped: 187 | # print "Waiting for thread to die..." 188 | time.sleep(0.1) 189 | self.win.destroy() 190 | sys.exit(0) 191 | 192 | def step(self): 193 | 194 | self.stepCount += 1 195 | 196 | state = self.robotEnvironment.getCurrentState() 197 | actions = self.robotEnvironment.getPossibleActions(state) 198 | if len(actions) == 0.0: 199 | self.robotEnvironment.reset() 200 | state = self.robotEnvironment.getCurrentState() 201 | actions = self.robotEnvironment.getPossibleActions(state) 202 | print 'Reset!' 
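        # getAction below is expected to be epsilon-greedy (see
        # qlearningAgents.QLearningAgent.getAction): with probability
        # self.epsilon a random legal action is taken, otherwise the
        # greedy action argmax_a Q(state, a).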
203 | action = self.learner.getAction(state) 204 | if action == None: 205 | raise 'None action returned: Code Not Complete' 206 | nextState, reward = self.robotEnvironment.doAction(action) 207 | self.learner.observeTransition(state, action, nextState, reward) 208 | 209 | def animatePolicy(self): 210 | if robotType != 'pendulum': 211 | raise 'Only pendulum can animatePolicy' 212 | 213 | 214 | totWidth = self.canvas.winfo_reqwidth() 215 | totHeight = self.canvas.winfo_reqheight() 216 | 217 | length = 0.48 * min(totWidth, totHeight) 218 | x,y = totWidth-length-30, length+10 219 | 220 | 221 | 222 | angleMin, angleMax = self.robot.getMinAndMaxAngle() 223 | velMin, velMax = self.robot.getMinAndMaxAngleVelocity() 224 | 225 | if not 'animatePolicyBox' in dir(self): 226 | self.canvas.create_line(x,y,x+length,y) 227 | self.canvas.create_line(x+length,y,x+length,y-length) 228 | self.canvas.create_line(x+length,y-length,x,y-length) 229 | self.canvas.create_line(x,y-length,x,y) 230 | self.animatePolicyBox = 1 231 | self.canvas.create_text(x+length/2,y+10,text='angle') 232 | self.canvas.create_text(x-30,y-length/2,text='velocity') 233 | self.canvas.create_text(x-60,y-length/4,text='Blue = kickLeft') 234 | self.canvas.create_text(x-60,y-length/4+20,text='Red = kickRight') 235 | self.canvas.create_text(x-60,y-length/4+40,text='White = doNothing') 236 | 237 | 238 | 239 | angleDelta = (angleMax-angleMin) / 100 240 | velDelta = (velMax-velMin) / 100 241 | for i in range(100): 242 | angle = angleMin + i * angleDelta 243 | 244 | for j in range(100): 245 | vel = velMin + j * velDelta 246 | state = self.robotEnvironment.getState(angle,vel) 247 | max, argMax = None, None 248 | if not self.learner.seenState(state): 249 | argMax = 'unseen' 250 | else: 251 | for action in ('kickLeft','kickRight','doNothing'): 252 | qVal = self.learner.getQValue(state, action) 253 | if max == None or qVal > max: 254 | max, argMax = qVal, action 255 | if argMax != 'unseen': 256 | if argMax == 'kickLeft': 257 | color = 'blue' 258 | elif argMax == 'kickRight': 259 | color = 'red' 260 | elif argMax == 'doNothing': 261 | color = 'white' 262 | dx = length / 100.0 263 | dy = length / 100.0 264 | x0, y0 = x+i*dx, y-j*dy 265 | self.canvas.create_rectangle(x0,y0,x0+dx,y0+dy,fill=color) 266 | 267 | 268 | 269 | 270 | def run(self): 271 | self.stepCount = 0 272 | self.learner.startEpisode() 273 | while True: 274 | minSleep = .01 275 | tm = max(minSleep, self.tickTime) 276 | time.sleep(tm) 277 | self.stepsToSkip = int(tm / self.tickTime) - 1 278 | 279 | if not self.running: 280 | self.stopped = True 281 | return 282 | for i in range(self.stepsToSkip): 283 | self.step() 284 | self.stepsToSkip = 0 285 | self.step() 286 | # self.robot.draw() 287 | self.learner.stopEpisode() 288 | 289 | def start(self): 290 | self.win.mainloop() 291 | 292 | 293 | 294 | 295 | 296 | def run(): 297 | global root 298 | root = Tkinter.Tk() 299 | root.title( 'Crawler GUI' ) 300 | root.resizable( 0, 0 ) 301 | 302 | # root.mainloop() 303 | 304 | 305 | app = Application(root) 306 | def update_gui(): 307 | app.robot.draw(app.stepCount, app.tickTime) 308 | root.after(10, update_gui) 309 | update_gui() 310 | 311 | root.protocol( 'WM_DELETE_WINDOW', app.exit) 312 | app.start() 313 | 314 | -------------------------------------------------------------------------------- /week3.5/Seminar3.5-approx-qlearning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 
7 | "# Approximate q-learning\n", 8 | "\n", 9 | "In this notebook you will teach a lasagne neural network to do Q-learning." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "__Frameworks__ - we'll accept this homework in any deep learning framework. For example, it translates to TensorFlow almost line-to-line. However, we recommend you to stick to theano/lasagne unless you're certain about your skills in the framework of your choice." 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 1, 22 | "metadata": { 23 | "collapsed": false 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "#XVFB will be launched if you run on a server\n", 28 | "import os\n", 29 | "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\"))==0:\n", 30 | " !bash ../xvfb start\n", 31 | " %env DISPLAY=:1" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "import gym\n", 43 | "import numpy as np, pandas as pd\n", 44 | "import matplotlib.pyplot as plt\n", 45 | "%matplotlib inline" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": { 52 | "collapsed": false, 53 | "scrolled": false 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "env = gym.make(\"CartPole-v0\")\n", 58 | "env.reset()\n", 59 | "n_actions = env.action_space.n\n", 60 | "state_dim = env.observation_space.shape\n", 61 | "\n", 62 | "plt.imshow(env.render(\"rgb_array\"))" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "# Approximate (deep) Q-learning: building the network\n", 70 | "\n", 71 | "In this section we will build and train naive Q-learning with theano/lasagne" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "First step is initializing input variables" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": { 85 | "collapsed": false 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "import theano\n", 90 | "import theano.tensor as T\n", 91 | "\n", 92 | "#create input variables. 
We'll support multiple states at once\n", 93 | "\n", 94 | "\n", 95 | "current_states = T.matrix(\"states[batch,units]\")\n", 96 | "actions = T.ivector(\"action_ids[batch]\")\n", 97 | "rewards = T.vector(\"rewards[batch]\")\n", 98 | "next_states = T.matrix(\"next states[batch,units]\")\n", 99 | "is_end = T.ivector(\"vector[batch] where 1 means that session just ended\")" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": { 106 | "collapsed": false 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "import lasagne\n", 111 | "from lasagne.layers import *\n", 112 | "\n", 113 | "#input layer\n", 114 | "l_states = InputLayer((None,)+state_dim)\n", 115 | "\n", 116 | "\n", 117 | "\n", 118 | "\n", 119 | "\n", 120 | "#output layer\n", 121 | "l_qvalues = DenseLayer(,num_units=n_actions,nonlinearity=None)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "#### Predicting Q-values for `current_states`" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "collapsed": false 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "#get q-values for ALL actions in current_states\n", 140 | "predicted_qvalues = get_output(l_qvalues,{l_states:current_states})" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": { 147 | "collapsed": true 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "#compiling agent's \"GetQValues\" function\n", 152 | "get_qvalues = " 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": { 159 | "collapsed": true 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "#select q-values for chosen actions\n", 164 | "predicted_qvalues_for_actions = predicted_qvalues[T.arange(actions.shape[0]),actions]" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "#### Loss function and `update`\n", 172 | "Here we write a function similar to `agent.update`." 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": { 179 | "collapsed": true 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "#predict q-values for next states\n", 184 | "predicted_next_qvalues = get_output(l_qvalues,{l_states:})\n", 185 | "\n", 186 | "\n", 187 | "#Computing target q-values under \n", 188 | "gamma = 0.99\n", 189 | "target_qvalues_for_actions = \n", 190 | "\n", 191 | "#zero-out q-values at the end\n", 192 | "target_qvalues_for_actions = (1-is_end)*target_qvalues_for_actions\n", 193 | "\n", 194 | "#don't compute gradient over target q-values (consider constant)\n", 195 | "target_qvalues_for_actions = theano.gradient.disconnected_grad(target_qvalues_for_actions)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": { 202 | "collapsed": false 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "\n", 207 | "#mean squared error loss function\n", 208 | "loss = \n" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": { 215 | "collapsed": true 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "#all network weights\n", 220 | "all_weights = get_all_params(l_qvalues,trainable=True)\n", 221 | "\n", 222 | "#network updates. 
Note the small learning rate (for stability)\n", 223 | "updates = lasagne.updates.sgd(loss,all_weights,learning_rate=1e-4)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": { 230 | "collapsed": false 231 | }, 232 | "outputs": [], 233 | "source": [ 234 | "#Training function that resembles agent.update(state,action,reward,next_state) \n", 235 | "#with 1 more argument meaning is_end\n", 236 | "train_step = theano.function([current_states,actions,rewards,next_states,is_end],\n", 237 | " updates=updates)" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "### Playing the game" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": { 251 | "collapsed": false 252 | }, 253 | "outputs": [], 254 | "source": [ 255 | "epsilon = 0.25 #initial epsilon\n", 256 | "\n", 257 | "def generate_session(t_max=1000):\n", 258 | " \"\"\"play env with approximate q-learning agent and train it at the same time\"\"\"\n", 259 | " \n", 260 | " total_reward = 0\n", 261 | " s = env.reset()\n", 262 | " \n", 263 | " for t in range(t_max):\n", 264 | " \n", 265 | " #get action q-values from the network\n", 266 | " q_values = get_qvalues([s])[0] \n", 267 | " \n", 268 | " a = \n", 269 | " \n", 270 | " new_s,r,done,info = env.step(a)\n", 271 | " \n", 272 | " #train agent one step. Note that we use one-element arrays instead of scalars \n", 273 | " #because that's what function accepts.\n", 274 | " train_step([s],[a],[r],[new_s],[done])\n", 275 | " \n", 276 | " total_reward+=r\n", 277 | " \n", 278 | " s = new_s\n", 279 | " if done: break\n", 280 | " \n", 281 | " return total_reward\n", 282 | " " 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": { 289 | "collapsed": false 290 | }, 291 | "outputs": [], 292 | "source": [ 293 | "for i in range(100):\n", 294 | " \n", 295 | " rewards = [generate_session() for _ in range(100)] #generate new sessions\n", 296 | " \n", 297 | " epsilon*=0.95\n", 298 | " \n", 299 | " print (\"mean reward:%.3f\\tepsilon:%.5f\"%(np.mean(rewards),epsilon))\n", 300 | "\n", 301 | " if np.mean(rewards) > 300:\n", 302 | " print (\"You Win!\")\n", 303 | " break\n", 304 | " \n", 305 | " assert epsilon!=0, \"Please explore environment\"" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "### Video" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": { 319 | "collapsed": true 320 | }, 321 | "outputs": [], 322 | "source": [ 323 | "epsilon=0 #Don't forget to reset epsilon back to initial value if you want to go on training" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": { 330 | "collapsed": false 331 | }, 332 | "outputs": [], 333 | "source": [ 334 | "#record sessions\n", 335 | "import gym.wrappers\n", 336 | "env = gym.wrappers.Monitor(env,directory=\"videos\",force=True)\n", 337 | "sessions = [generate_session() for _ in range(100)]\n", 338 | "env.close()\n", 339 | "#unwrap \n", 340 | "env = env.env.env\n", 341 | "#upload to gym\n", 342 | "#gym.upload(\"./videos/\",api_key=\"\") #you'll need me later\n", 343 | "\n", 344 | "#Warning! 
If you keep seeing error that reads something like\"DoubleWrapError\",\n", 345 | "#run env=gym.make(\"CartPole-v0\");env.reset();" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": { 352 | "collapsed": false 353 | }, 354 | "outputs": [], 355 | "source": [ 356 | "#show video\n", 357 | "from IPython.display import HTML\n", 358 | "import os\n", 359 | "\n", 360 | "video_names = list(filter(lambda s:s.endswith(\".mp4\"),os.listdir(\"./videos/\")))\n", 361 | "\n", 362 | "HTML(\"\"\"\n", 363 | "\n", 366 | "\"\"\".format(\"./videos/\"+video_names[-1])) #this may or may not be _last_ video. Try other indices" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": { 372 | "collapsed": true 373 | }, 374 | "source": [ 375 | "### Homework\n", 376 | "\n", 377 | "Two paths lie ahead of you, and which one to take is a rightfull choice of yours.\n", 378 | "\n", 379 | "* __[recommended]__ Go deeper. Return to seminar1 and get 99% accuracy on MNIST\n", 380 | "* __[alternative]__ Try approximate expected-value SARSA and other algorithms and compare it with q-learning \n", 381 | " * +3 points for EV-SARSA and comparison to Q-learning\n", 382 | " * +2 per additional algorithm\n", 383 | "* __[alternative hard]__ Pick `````` and solve it, using NN.\n", 384 | " * LunarLander, MountainCar or Breakout (from week1 bonus)\n", 385 | " * LunarLander should get at least +100\n", 386 | " * MountainCar should get at least -200\n", 387 | " * You will need to somehow stabilize learning\n", 388 | " \n" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "metadata": { 395 | "collapsed": true 396 | }, 397 | "outputs": [], 398 | "source": [] 399 | } 400 | ], 401 | "metadata": { 402 | "kernelspec": { 403 | "display_name": "Python [Root]", 404 | "language": "python", 405 | "name": "Python [Root]" 406 | }, 407 | "language_info": { 408 | "codemirror_mode": { 409 | "name": "ipython", 410 | "version": 2 411 | }, 412 | "file_extension": ".py", 413 | "mimetype": "text/x-python", 414 | "name": "python", 415 | "nbconvert_exporter": "python", 416 | "pygments_lexer": "ipython2", 417 | "version": "2.7.12" 418 | } 419 | }, 420 | "nbformat": 4, 421 | "nbformat_minor": 0 422 | } 423 | -------------------------------------------------------------------------------- /week2/assignment/graphicsUtils.py: -------------------------------------------------------------------------------- 1 | # graphicsUtils.py 2 | # ---------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 
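# For the "[alternative] approximate expected-value SARSA" homework option
# listed above, only the bootstrap term of the target changes: instead of
# max_a' Q(s',a'), take the expectation of Q(s',a') under the epsilon-greedy
# policy. A sketch reusing predicted_next_qvalues, rewards and gamma from the
# notebook; epsilon is baked into the graph as a constant here, so use a
# theano shared variable (or recompile) if you want to decay it during training.

import theano.tensor as T

epsilon = 0.25
expected_next_q = ((1.0 - epsilon) * T.max(predicted_next_qvalues, axis=1)
                   + epsilon * T.mean(predicted_next_qvalues, axis=1))
target_qvalues_for_actions = rewards + gamma * expected_next_q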
7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | import sys 10 | import math 11 | import random 12 | import string 13 | import time 14 | import types 15 | import Tkinter 16 | 17 | _Windows = sys.platform == 'win32' # True if on Win95/98/NT 18 | 19 | _root_window = None # The root window for graphics output 20 | _canvas = None # The canvas which holds graphics 21 | _canvas_xs = None # Size of canvas object 22 | _canvas_ys = None 23 | _canvas_x = None # Current position on canvas 24 | _canvas_y = None 25 | _canvas_col = None # Current colour (set to black below) 26 | _canvas_tsize = 12 27 | _canvas_tserifs = 0 28 | 29 | def formatColor(r, g, b): 30 | return '#%02x%02x%02x' % (int(r * 255), int(g * 255), int(b * 255)) 31 | 32 | def colorToVector(color): 33 | return map(lambda x: int(x, 16) / 256.0, [color[1:3], color[3:5], color[5:7]]) 34 | 35 | if _Windows: 36 | _canvas_tfonts = ['times new roman', 'lucida console'] 37 | else: 38 | _canvas_tfonts = ['times', 'lucidasans-24'] 39 | pass # XXX need defaults here 40 | 41 | def sleep(secs): 42 | global _root_window 43 | if _root_window == None: 44 | time.sleep(secs) 45 | else: 46 | _root_window.update_idletasks() 47 | _root_window.after(int(1000 * secs), _root_window.quit) 48 | _root_window.mainloop() 49 | 50 | def begin_graphics(width=640, height=480, color=formatColor(0, 0, 0), title=None): 51 | 52 | global _root_window, _canvas, _canvas_x, _canvas_y, _canvas_xs, _canvas_ys, _bg_color 53 | 54 | # Check for duplicate call 55 | if _root_window is not None: 56 | # Lose the window. 57 | _root_window.destroy() 58 | 59 | # Save the canvas size parameters 60 | _canvas_xs, _canvas_ys = width - 1, height - 1 61 | _canvas_x, _canvas_y = 0, _canvas_ys 62 | _bg_color = color 63 | 64 | # Create the root window 65 | _root_window = Tkinter.Tk() 66 | _root_window.protocol('WM_DELETE_WINDOW', _destroy_window) 67 | _root_window.title(title or 'Graphics Window') 68 | _root_window.resizable(0, 0) 69 | 70 | # Create the canvas object 71 | try: 72 | _canvas = Tkinter.Canvas(_root_window, width=width, height=height) 73 | _canvas.pack() 74 | draw_background() 75 | _canvas.update() 76 | except: 77 | _root_window = None 78 | raise 79 | 80 | # Bind to key-down and key-up events 81 | _root_window.bind( "", _keypress ) 82 | _root_window.bind( "", _keyrelease ) 83 | _root_window.bind( "", _clear_keys ) 84 | _root_window.bind( "", _clear_keys ) 85 | _root_window.bind( "", _leftclick ) 86 | _root_window.bind( "", _rightclick ) 87 | _root_window.bind( "", _rightclick ) 88 | _root_window.bind( "", _ctrl_leftclick) 89 | _clear_keys() 90 | 91 | _leftclick_loc = None 92 | _rightclick_loc = None 93 | _ctrl_leftclick_loc = None 94 | 95 | def _leftclick(event): 96 | global _leftclick_loc 97 | _leftclick_loc = (event.x, event.y) 98 | 99 | def _rightclick(event): 100 | global _rightclick_loc 101 | _rightclick_loc = (event.x, event.y) 102 | 103 | def _ctrl_leftclick(event): 104 | global _ctrl_leftclick_loc 105 | _ctrl_leftclick_loc = (event.x, event.y) 106 | 107 | def wait_for_click(): 108 | while True: 109 | global _leftclick_loc 110 | global _rightclick_loc 111 | global _ctrl_leftclick_loc 112 | if _leftclick_loc != None: 113 | val = _leftclick_loc 114 | _leftclick_loc = None 115 | return val, 'left' 116 | if _rightclick_loc != None: 117 | val = _rightclick_loc 118 | _rightclick_loc = None 119 | return val, 'right' 120 | if _ctrl_leftclick_loc != None: 121 | val = _ctrl_leftclick_loc 122 | _ctrl_leftclick_loc = None 123 | return val, 'ctrl_left' 
124 | sleep(0.05) 125 | 126 | def draw_background(): 127 | corners = [(0,0), (0, _canvas_ys), (_canvas_xs, _canvas_ys), (_canvas_xs, 0)] 128 | polygon(corners, _bg_color, fillColor=_bg_color, filled=True, smoothed=False) 129 | 130 | def _destroy_window(event=None): 131 | sys.exit(0) 132 | # global _root_window 133 | # _root_window.destroy() 134 | # _root_window = None 135 | #print "DESTROY" 136 | 137 | def end_graphics(): 138 | global _root_window, _canvas, _mouse_enabled 139 | try: 140 | try: 141 | sleep(1) 142 | if _root_window != None: 143 | _root_window.destroy() 144 | except SystemExit, e: 145 | print 'Ending graphics raised an exception:', e 146 | finally: 147 | _root_window = None 148 | _canvas = None 149 | _mouse_enabled = 0 150 | _clear_keys() 151 | 152 | def clear_screen(background=None): 153 | global _canvas_x, _canvas_y 154 | _canvas.delete('all') 155 | draw_background() 156 | _canvas_x, _canvas_y = 0, _canvas_ys 157 | 158 | def polygon(coords, outlineColor, fillColor=None, filled=1, smoothed=1, behind=0, width=1): 159 | c = [] 160 | for coord in coords: 161 | c.append(coord[0]) 162 | c.append(coord[1]) 163 | if fillColor == None: fillColor = outlineColor 164 | if filled == 0: fillColor = "" 165 | poly = _canvas.create_polygon(c, outline=outlineColor, fill=fillColor, smooth=smoothed, width=width) 166 | if behind > 0: 167 | _canvas.tag_lower(poly, behind) # Higher should be more visible 168 | return poly 169 | 170 | def square(pos, r, color, filled=1, behind=0): 171 | x, y = pos 172 | coords = [(x - r, y - r), (x + r, y - r), (x + r, y + r), (x - r, y + r)] 173 | return polygon(coords, color, color, filled, 0, behind=behind) 174 | 175 | def circle(pos, r, outlineColor, fillColor, endpoints=None, style='pieslice', width=2): 176 | x, y = pos 177 | x0, x1 = x - r - 1, x + r 178 | y0, y1 = y - r - 1, y + r 179 | if endpoints == None: 180 | e = [0, 359] 181 | else: 182 | e = list(endpoints) 183 | while e[0] > e[1]: e[1] = e[1] + 360 184 | 185 | return _canvas.create_arc(x0, y0, x1, y1, outline=outlineColor, fill=fillColor, 186 | extent=e[1] - e[0], start=e[0], style=style, width=width) 187 | 188 | def image(pos, file="../../blueghost.gif"): 189 | x, y = pos 190 | # img = PhotoImage(file=file) 191 | return _canvas.create_image(x, y, image = Tkinter.PhotoImage(file=file), anchor = Tkinter.NW) 192 | 193 | 194 | def refresh(): 195 | _canvas.update_idletasks() 196 | 197 | def moveCircle(id, pos, r, endpoints=None): 198 | global _canvas_x, _canvas_y 199 | 200 | x, y = pos 201 | # x0, x1 = x - r, x + r + 1 202 | # y0, y1 = y - r, y + r + 1 203 | x0, x1 = x - r - 1, x + r 204 | y0, y1 = y - r - 1, y + r 205 | if endpoints == None: 206 | e = [0, 359] 207 | else: 208 | e = list(endpoints) 209 | while e[0] > e[1]: e[1] = e[1] + 360 210 | 211 | edit(id, ('start', e[0]), ('extent', e[1] - e[0])) 212 | move_to(id, x0, y0) 213 | 214 | def edit(id, *args): 215 | _canvas.itemconfigure(id, **dict(args)) 216 | 217 | def text(pos, color, contents, font='Helvetica', size=12, style='normal', anchor="nw"): 218 | global _canvas_x, _canvas_y 219 | x, y = pos 220 | font = (font, str(size), style) 221 | return _canvas.create_text(x, y, fill=color, text=contents, font=font, anchor=anchor) 222 | 223 | def changeText(id, newText, font=None, size=12, style='normal'): 224 | _canvas.itemconfigure(id, text=newText) 225 | if font != None: 226 | _canvas.itemconfigure(id, font=(font, '-%d' % size, style)) 227 | 228 | def changeColor(id, newColor): 229 | _canvas.itemconfigure(id, fill=newColor) 230 | 231 | def line(here, 
there, color=formatColor(0, 0, 0), width=2): 232 | x0, y0 = here[0], here[1] 233 | x1, y1 = there[0], there[1] 234 | return _canvas.create_line(x0, y0, x1, y1, fill=color, width=width) 235 | 236 | ############################################################################## 237 | ### Keypress handling ######################################################## 238 | ############################################################################## 239 | 240 | # We bind to key-down and key-up events. 241 | 242 | _keysdown = {} 243 | _keyswaiting = {} 244 | # This holds an unprocessed key release. We delay key releases by up to 245 | # one call to keys_pressed() to get round a problem with auto repeat. 246 | _got_release = None 247 | 248 | def _keypress(event): 249 | global _got_release 250 | #remap_arrows(event) 251 | _keysdown[event.keysym] = 1 252 | _keyswaiting[event.keysym] = 1 253 | # print event.char, event.keycode 254 | _got_release = None 255 | 256 | def _keyrelease(event): 257 | global _got_release 258 | #remap_arrows(event) 259 | try: 260 | del _keysdown[event.keysym] 261 | except: 262 | pass 263 | _got_release = 1 264 | 265 | def remap_arrows(event): 266 | # TURN ARROW PRESSES INTO LETTERS (SHOULD BE IN KEYBOARD AGENT) 267 | if event.char in ['a', 's', 'd', 'w']: 268 | return 269 | if event.keycode in [37, 101]: # LEFT ARROW (win / x) 270 | event.char = 'a' 271 | if event.keycode in [38, 99]: # UP ARROW 272 | event.char = 'w' 273 | if event.keycode in [39, 102]: # RIGHT ARROW 274 | event.char = 'd' 275 | if event.keycode in [40, 104]: # DOWN ARROW 276 | event.char = 's' 277 | 278 | def _clear_keys(event=None): 279 | global _keysdown, _got_release, _keyswaiting 280 | _keysdown = {} 281 | _keyswaiting = {} 282 | _got_release = None 283 | 284 | def keys_pressed(d_o_e=Tkinter.tkinter.dooneevent, 285 | d_w=Tkinter.tkinter.DONT_WAIT): 286 | d_o_e(d_w) 287 | if _got_release: 288 | d_o_e(d_w) 289 | return _keysdown.keys() 290 | 291 | def keys_waiting(): 292 | global _keyswaiting 293 | keys = _keyswaiting.keys() 294 | _keyswaiting = {} 295 | return keys 296 | 297 | # Block for a list of keys... 
298 | 299 | def wait_for_keys(): 300 | keys = [] 301 | while keys == []: 302 | keys = keys_pressed() 303 | sleep(0.05) 304 | return keys 305 | 306 | def remove_from_screen(x, 307 | d_o_e=Tkinter.tkinter.dooneevent, 308 | d_w=Tkinter.tkinter.DONT_WAIT): 309 | _canvas.delete(x) 310 | d_o_e(d_w) 311 | 312 | def _adjust_coords(coord_list, x, y): 313 | for i in range(0, len(coord_list), 2): 314 | coord_list[i] = coord_list[i] + x 315 | coord_list[i + 1] = coord_list[i + 1] + y 316 | return coord_list 317 | 318 | def move_to(object, x, y=None, 319 | d_o_e=Tkinter.tkinter.dooneevent, 320 | d_w=Tkinter.tkinter.DONT_WAIT): 321 | if y is None: 322 | try: x, y = x 323 | except: raise 'incomprehensible coordinates' 324 | 325 | horiz = True 326 | newCoords = [] 327 | current_x, current_y = _canvas.coords(object)[0:2] # first point 328 | for coord in _canvas.coords(object): 329 | if horiz: 330 | inc = x - current_x 331 | else: 332 | inc = y - current_y 333 | horiz = not horiz 334 | 335 | newCoords.append(coord + inc) 336 | 337 | _canvas.coords(object, *newCoords) 338 | d_o_e(d_w) 339 | 340 | def move_by(object, x, y=None, 341 | d_o_e=Tkinter.tkinter.dooneevent, 342 | d_w=Tkinter.tkinter.DONT_WAIT): 343 | if y is None: 344 | try: x, y = x 345 | except: raise Exception, 'incomprehensible coordinates' 346 | 347 | horiz = True 348 | newCoords = [] 349 | for coord in _canvas.coords(object): 350 | if horiz: 351 | inc = x 352 | else: 353 | inc = y 354 | horiz = not horiz 355 | 356 | newCoords.append(coord + inc) 357 | 358 | _canvas.coords(object, *newCoords) 359 | d_o_e(d_w) 360 | 361 | def writePostscript(filename): 362 | "Writes the current canvas to a postscript file." 363 | psfile = file(filename, 'w') 364 | psfile.write(_canvas.postscript(pageanchor='sw', 365 | y='0.c', 366 | x='0.c')) 367 | psfile.close() 368 | 369 | ghost_shape = [ 370 | (0, - 0.5), 371 | (0.25, - 0.75), 372 | (0.5, - 0.5), 373 | (0.75, - 0.75), 374 | (0.75, 0.5), 375 | (0.5, 0.75), 376 | (- 0.5, 0.75), 377 | (- 0.75, 0.5), 378 | (- 0.75, - 0.75), 379 | (- 0.5, - 0.5), 380 | (- 0.25, - 0.75) 381 | ] 382 | 383 | if __name__ == '__main__': 384 | begin_graphics() 385 | clear_screen() 386 | ghost_shape = [(x * 10 + 20, y * 10 + 20) for x, y in ghost_shape] 387 | g = polygon(ghost_shape, formatColor(1, 1, 1)) 388 | move_to(g, (50, 50)) 389 | circle((150, 150), 20, formatColor(0.7, 0.3, 0.0), endpoints=[15, - 15]) 390 | sleep(2) -------------------------------------------------------------------------------- /week2/assignment/textGridworldDisplay.py: -------------------------------------------------------------------------------- 1 | # textGridworldDisplay.py 2 | # ----------------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 
7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | import util 10 | 11 | class TextGridworldDisplay: 12 | 13 | def __init__(self, gridworld): 14 | self.gridworld = gridworld 15 | 16 | def start(self): 17 | pass 18 | 19 | def pause(self): 20 | pass 21 | 22 | def displayValues(self, agent, currentState = None, message = None): 23 | if message != None: 24 | print message 25 | values = util.Counter() 26 | policy = {} 27 | states = self.gridworld.getStates() 28 | for state in states: 29 | values[state] = agent.getValue(state) 30 | policy[state] = agent.getPolicy(state) 31 | prettyPrintValues(self.gridworld, values, policy, currentState) 32 | 33 | def displayNullValues(self, agent, currentState = None, message = None): 34 | if message != None: print message 35 | prettyPrintNullValues(self.gridworld, currentState) 36 | 37 | def displayQValues(self, agent, currentState = None, message = None): 38 | if message != None: print message 39 | qValues = util.Counter() 40 | states = self.gridworld.getStates() 41 | for state in states: 42 | for action in self.gridworld.getPossibleActions(state): 43 | qValues[(state, action)] = agent.getQValue(state, action) 44 | prettyPrintQValues(self.gridworld, qValues, currentState) 45 | 46 | 47 | def prettyPrintValues(gridWorld, values, policy=None, currentState = None): 48 | grid = gridWorld.grid 49 | maxLen = 11 50 | newRows = [] 51 | for y in range(grid.height): 52 | newRow = [] 53 | for x in range(grid.width): 54 | state = (x, y) 55 | value = values[state] 56 | action = None 57 | if policy != None and state in policy: 58 | action = policy[state] 59 | actions = gridWorld.getPossibleActions(state) 60 | if action not in actions and 'exit' in actions: 61 | action = 'exit' 62 | valString = None 63 | if action == 'exit': 64 | valString = border('%.2f' % value) 65 | else: 66 | valString = '\n\n%.2f\n\n' % value 67 | valString += ' '*maxLen 68 | if grid[x][y] == 'S': 69 | valString = '\n\nS: %.2f\n\n' % value 70 | valString += ' '*maxLen 71 | if grid[x][y] == '#': 72 | valString = '\n#####\n#####\n#####\n' 73 | valString += ' '*maxLen 74 | pieces = [valString] 75 | text = ("\n".join(pieces)).split('\n') 76 | if currentState == state: 77 | l = len(text[1]) 78 | if l == 0: 79 | text[1] = '*' 80 | else: 81 | text[1] = "|" + ' ' * int((l-1)/2-1) + '*' + ' ' * int((l)/2-1) + "|" 82 | if action == 'east': 83 | text[2] = ' ' + text[2] + ' >' 84 | elif action == 'west': 85 | text[2] = '< ' + text[2] + ' ' 86 | elif action == 'north': 87 | text[0] = ' ' * int(maxLen/2) + '^' +' ' * int(maxLen/2) 88 | elif action == 'south': 89 | text[4] = ' ' * int(maxLen/2) + 'v' +' ' * int(maxLen/2) 90 | newCell = "\n".join(text) 91 | newRow.append(newCell) 92 | newRows.append(newRow) 93 | numCols = grid.width 94 | for rowNum, row in enumerate(newRows): 95 | row.insert(0,"\n\n"+str(rowNum)) 96 | newRows.reverse() 97 | colLabels = [str(colNum) for colNum in range(numCols)] 98 | colLabels.insert(0,' ') 99 | finalRows = [colLabels] + newRows 100 | print indent(finalRows,separateRows=True,delim='|', prefix='|',postfix='|', justify='center',hasHeader=True) 101 | 102 | 103 | def prettyPrintNullValues(gridWorld, currentState = None): 104 | grid = gridWorld.grid 105 | maxLen = 11 106 | newRows = [] 107 | for y in range(grid.height): 108 | newRow = [] 109 | for x in range(grid.width): 110 | state = (x, y) 111 | 112 | # value = values[state] 113 | 114 | action = None 115 | # if policy != None and state in policy: 116 | # action = policy[state] 117 | # 118 | actions = 
gridWorld.getPossibleActions(state) 119 | 120 | if action not in actions and 'exit' in actions: 121 | action = 'exit' 122 | 123 | valString = None 124 | # if action == 'exit': 125 | # valString = border('%.2f' % value) 126 | # else: 127 | # valString = '\n\n%.2f\n\n' % value 128 | # valString += ' '*maxLen 129 | 130 | if grid[x][y] == 'S': 131 | valString = '\n\nS\n\n' 132 | valString += ' '*maxLen 133 | elif grid[x][y] == '#': 134 | valString = '\n#####\n#####\n#####\n' 135 | valString += ' '*maxLen 136 | elif type(grid[x][y]) == float or type(grid[x][y]) == int: 137 | valString = border('%.2f' % float(grid[x][y])) 138 | else: valString = border(' ') 139 | pieces = [valString] 140 | 141 | text = ("\n".join(pieces)).split('\n') 142 | 143 | if currentState == state: 144 | l = len(text[1]) 145 | if l == 0: 146 | text[1] = '*' 147 | else: 148 | text[1] = "|" + ' ' * int((l-1)/2-1) + '*' + ' ' * int((l)/2-1) + "|" 149 | 150 | if action == 'east': 151 | text[2] = ' ' + text[2] + ' >' 152 | elif action == 'west': 153 | text[2] = '< ' + text[2] + ' ' 154 | elif action == 'north': 155 | text[0] = ' ' * int(maxLen/2) + '^' +' ' * int(maxLen/2) 156 | elif action == 'south': 157 | text[4] = ' ' * int(maxLen/2) + 'v' +' ' * int(maxLen/2) 158 | newCell = "\n".join(text) 159 | newRow.append(newCell) 160 | newRows.append(newRow) 161 | numCols = grid.width 162 | for rowNum, row in enumerate(newRows): 163 | row.insert(0,"\n\n"+str(rowNum)) 164 | newRows.reverse() 165 | colLabels = [str(colNum) for colNum in range(numCols)] 166 | colLabels.insert(0,' ') 167 | finalRows = [colLabels] + newRows 168 | print indent(finalRows,separateRows=True,delim='|', prefix='|',postfix='|', justify='center',hasHeader=True) 169 | 170 | def prettyPrintQValues(gridWorld, qValues, currentState=None): 171 | grid = gridWorld.grid 172 | maxLen = 11 173 | newRows = [] 174 | for y in range(grid.height): 175 | newRow = [] 176 | for x in range(grid.width): 177 | state = (x, y) 178 | actions = gridWorld.getPossibleActions(state) 179 | if actions == None or len(actions) == 0: 180 | actions = [None] 181 | bestQ = max([qValues[(state, action)] for action in actions]) 182 | bestActions = [action for action in actions if qValues[(state, action)] == bestQ] 183 | 184 | # display cell 185 | qStrings = dict([(action, "%.2f" % qValues[(state, action)]) for action in actions]) 186 | northString = ('north' in qStrings and qStrings['north']) or ' ' 187 | southString = ('south' in qStrings and qStrings['south']) or ' ' 188 | eastString = ('east' in qStrings and qStrings['east']) or ' ' 189 | westString = ('west' in qStrings and qStrings['west']) or ' ' 190 | exitString = ('exit' in qStrings and qStrings['exit']) or ' ' 191 | 192 | eastLen = len(eastString) 193 | westLen = len(westString) 194 | if eastLen < westLen: 195 | eastString = ' '*(westLen-eastLen)+eastString 196 | if westLen < eastLen: 197 | westString = westString+' '*(eastLen-westLen) 198 | 199 | if 'north' in bestActions: 200 | northString = '/'+northString+'\\' 201 | if 'south' in bestActions: 202 | southString = '\\'+southString+'/' 203 | if 'east' in bestActions: 204 | eastString = ''+eastString+'>' 205 | else: 206 | eastString = ''+eastString+' ' 207 | if 'west' in bestActions: 208 | westString = '<'+westString+'' 209 | else: 210 | westString = ' '+westString+'' 211 | if 'exit' in bestActions: 212 | exitString = '[ '+exitString+' ]' 213 | 214 | 215 | ewString = westString + " " + eastString 216 | if state == currentState: 217 | ewString = westString + " * " + eastString 218 | if state 
== gridWorld.getStartState(): 219 | ewString = westString + " S " + eastString 220 | if state == currentState and state == gridWorld.getStartState(): 221 | ewString = westString + " S:* " + eastString 222 | 223 | text = [northString, "\n"+exitString, ewString, ' '*maxLen+"\n", southString] 224 | 225 | if grid[x][y] == '#': 226 | text = ['', '\n#####\n#####\n#####', ''] 227 | 228 | newCell = "\n".join(text) 229 | newRow.append(newCell) 230 | newRows.append(newRow) 231 | numCols = grid.width 232 | for rowNum, row in enumerate(newRows): 233 | row.insert(0,"\n\n\n"+str(rowNum)) 234 | newRows.reverse() 235 | colLabels = [str(colNum) for colNum in range(numCols)] 236 | colLabels.insert(0,' ') 237 | finalRows = [colLabels] + newRows 238 | 239 | print indent(finalRows,separateRows=True,delim='|',prefix='|',postfix='|', justify='center',hasHeader=True) 240 | 241 | def border(text): 242 | length = len(text) 243 | pieces = ['-' * (length+2), '|'+' ' * (length+2)+'|', ' | '+text+' | ', '|'+' ' * (length+2)+'|','-' * (length+2)] 244 | return '\n'.join(pieces) 245 | 246 | # INDENTING CODE 247 | 248 | # Indenting code based on a post from George Sakkis 249 | # (http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/267662) 250 | 251 | import cStringIO,operator 252 | 253 | def indent(rows, hasHeader=False, headerChar='-', delim=' | ', justify='left', 254 | separateRows=False, prefix='', postfix='', wrapfunc=lambda x:x): 255 | """Indents a table by column. 256 | - rows: A sequence of sequences of items, one sequence per row. 257 | - hasHeader: True if the first row consists of the columns' names. 258 | - headerChar: Character to be used for the row separator line 259 | (if hasHeader==True or separateRows==True). 260 | - delim: The column delimiter. 261 | - justify: Determines how are data justified in their column. 262 | Valid values are 'left','right' and 'center'. 263 | - separateRows: True if rows are to be separated by a line 264 | of 'headerChar's. 265 | - prefix: A string prepended to each printed row. 266 | - postfix: A string appended to each printed row. 
267 | - wrapfunc: A function f(text) for wrapping text; each element in 268 | the table is first wrapped by this function.""" 269 | # closure for breaking logical rows to physical, using wrapfunc 270 | def rowWrapper(row): 271 | newRows = [wrapfunc(item).split('\n') for item in row] 272 | return [[substr or '' for substr in item] for item in map(None,*newRows)] 273 | # break each logical row into one or more physical ones 274 | logicalRows = [rowWrapper(row) for row in rows] 275 | # columns of physical rows 276 | columns = map(None,*reduce(operator.add,logicalRows)) 277 | # get the maximum of each column by the string length of its items 278 | maxWidths = [max([len(str(item)) for item in column]) for column in columns] 279 | rowSeparator = headerChar * (len(prefix) + len(postfix) + sum(maxWidths) + \ 280 | len(delim)*(len(maxWidths)-1)) 281 | # select the appropriate justify method 282 | justify = {'center':str.center, 'right':str.rjust, 'left':str.ljust}[justify.lower()] 283 | output=cStringIO.StringIO() 284 | if separateRows: print >> output, rowSeparator 285 | for physicalRows in logicalRows: 286 | for row in physicalRows: 287 | print >> output, \ 288 | prefix \ 289 | + delim.join([justify(str(item),width) for (item,width) in zip(row,maxWidths)]) \ 290 | + postfix 291 | if separateRows or hasHeader: print >> output, rowSeparator; hasHeader=False 292 | return output.getvalue() 293 | 294 | import math 295 | def wrap_always(text, width): 296 | """A simple word-wrap function that wraps text on exactly width characters. 297 | It doesn't split the text in words.""" 298 | return '\n'.join([ text[width*i:width*(i+1)] \ 299 | for i in xrange(int(math.ceil(1.*len(text)/width))) ]) 300 | 301 | 302 | # TEST OF DISPLAY CODE 303 | 304 | if __name__ == '__main__': 305 | import gridworld, util 306 | 307 | grid = gridworld.getCliffGrid3() 308 | print grid.getStates() 309 | 310 | policy = dict([(state,'east') for state in grid.getStates()]) 311 | values = util.Counter(dict([(state,1000.23) for state in grid.getStates()])) 312 | prettyPrintValues(grid, values, policy, currentState = (0,0)) 313 | 314 | stateCrossActions = [[(state, action) for action in grid.getPossibleActions(state)] for state in grid.getStates()] 315 | qStates = reduce(lambda x,y: x+y, stateCrossActions, []) 316 | qValues = util.Counter(dict([((state, action), 10.5) for state, action in qStates])) 317 | qValues = util.Counter(dict([((state, action), 10.5) for state, action in reduce(lambda x,y: x+y, stateCrossActions, [])])) 318 | prettyPrintQValues(grid, qValues, currentState = (0,0)) 319 | -------------------------------------------------------------------------------- /week2/assignment/graphicsGridworldDisplay.py: -------------------------------------------------------------------------------- 1 | # graphicsGridworldDisplay.py 2 | # --------------------------- 3 | # Licensing Information: Please do not distribute or publish solutions to this 4 | # project. You are free to use and extend these projects for educational 5 | # purposes. The Pacman AI projects were developed at UC Berkeley, primarily by 6 | # John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 
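# The indent() helper documented above (end of textGridworldDisplay.py) is a
# general ASCII table formatter used by the prettyPrint* functions. A minimal
# call with made-up rows, passing the same options those functions use; it
# assumes the assignment's util module is importable, since
# textGridworldDisplay imports it at module level.

from textGridworldDisplay import indent

rows = [[' ', '0', '1'],
        ['1', '0.72', '1.00'],
        ['0', '0.00', '0.43']]
print(indent(rows, hasHeader=True, separateRows=True,
             delim='|', prefix='|', postfix='|', justify='center'))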
7 | # For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 8 | 9 | import util 10 | from graphicsUtils import * 11 | 12 | class GraphicsGridworldDisplay: 13 | 14 | def __init__(self, gridworld, size=120, speed=1.0): 15 | self.gridworld = gridworld 16 | self.size = size 17 | self.speed = speed 18 | 19 | def start(self): 20 | setup(self.gridworld, size=self.size) 21 | 22 | def pause(self): 23 | wait_for_keys() 24 | 25 | def displayValues(self, agent, currentState = None, message = 'Agent Values'): 26 | values = util.Counter() 27 | policy = {} 28 | states = self.gridworld.getStates() 29 | for state in states: 30 | values[state] = agent.getValue(state) 31 | policy[state] = agent.getPolicy(state) 32 | drawValues(self.gridworld, values, policy, currentState, message) 33 | sleep(0.05 / self.speed) 34 | 35 | def displayNullValues(self, currentState = None, message = ''): 36 | values = util.Counter() 37 | #policy = {} 38 | states = self.gridworld.getStates() 39 | for state in states: 40 | values[state] = 0.0 41 | #policy[state] = agent.getPolicy(state) 42 | drawNullValues(self.gridworld, currentState,'') 43 | # drawValues(self.gridworld, values, policy, currentState, message) 44 | sleep(0.05 / self.speed) 45 | 46 | def displayQValues(self, agent, currentState = None, message = 'Agent Q-Values'): 47 | qValues = util.Counter() 48 | states = self.gridworld.getStates() 49 | for state in states: 50 | for action in self.gridworld.getPossibleActions(state): 51 | qValues[(state, action)] = agent.getQValue(state, action) 52 | drawQValues(self.gridworld, qValues, currentState, message) 53 | sleep(0.05 / self.speed) 54 | 55 | BACKGROUND_COLOR = formatColor(0,0,0) 56 | EDGE_COLOR = formatColor(1,1,1) 57 | OBSTACLE_COLOR = formatColor(0.5,0.5,0.5) 58 | TEXT_COLOR = formatColor(1,1,1) 59 | MUTED_TEXT_COLOR = formatColor(0.7,0.7,0.7) 60 | LOCATION_COLOR = formatColor(0,0,1) 61 | 62 | WINDOW_SIZE = -1 63 | GRID_SIZE = -1 64 | GRID_HEIGHT = -1 65 | MARGIN = -1 66 | 67 | def setup(gridworld, title = "Gridworld Display", size = 120): 68 | global GRID_SIZE, MARGIN, SCREEN_WIDTH, SCREEN_HEIGHT, GRID_HEIGHT 69 | grid = gridworld.grid 70 | WINDOW_SIZE = size 71 | GRID_SIZE = size 72 | GRID_HEIGHT = grid.height 73 | MARGIN = GRID_SIZE * 0.75 74 | screen_width = (grid.width - 1) * GRID_SIZE + MARGIN * 2 75 | screen_height = (grid.height - 0.5) * GRID_SIZE + MARGIN * 2 76 | 77 | begin_graphics(screen_width, 78 | screen_height, 79 | BACKGROUND_COLOR, title=title) 80 | 81 | def drawNullValues(gridworld, currentState = None, message = ''): 82 | grid = gridworld.grid 83 | blank() 84 | for x in range(grid.width): 85 | for y in range(grid.height): 86 | state = (x, y) 87 | gridType = grid[x][y] 88 | isExit = (str(gridType) != gridType) 89 | isCurrent = (currentState == state) 90 | if gridType == '#': 91 | drawSquare(x, y, 0, 0, 0, None, None, True, False, isCurrent) 92 | else: 93 | drawNullSquare(gridworld.grid, x, y, False, isExit, isCurrent) 94 | pos = to_screen(((grid.width - 1.0) / 2.0, - 0.8)) 95 | text( pos, TEXT_COLOR, message, "Courier", -32, "bold", "c") 96 | 97 | 98 | def drawValues(gridworld, values, policy, currentState = None, message = 'State Values'): 99 | grid = gridworld.grid 100 | blank() 101 | valueList = [values[state] for state in gridworld.getStates()] + [0.0] 102 | minValue = min(valueList) 103 | maxValue = max(valueList) 104 | for x in range(grid.width): 105 | for y in range(grid.height): 106 | state = (x, y) 107 | gridType = grid[x][y] 108 | isExit = (str(gridType) != gridType) 109 | 
isCurrent = (currentState == state) 110 | if gridType == '#': 111 | drawSquare(x, y, 0, 0, 0, None, None, True, False, isCurrent) 112 | else: 113 | value = values[state] 114 | action = None 115 | if policy != None and state in policy: 116 | action = policy[state] 117 | actions = gridworld.getPossibleActions(state) 118 | if action not in actions and 'exit' in actions: 119 | action = 'exit' 120 | valString = '%.2f' % value 121 | drawSquare(x, y, value, minValue, maxValue, valString, action, False, isExit, isCurrent) 122 | pos = to_screen(((grid.width - 1.0) / 2.0, - 0.8)) 123 | text( pos, TEXT_COLOR, message, "Courier", -32, "bold", "c") 124 | 125 | def drawQValues(gridworld, qValues, currentState = None, message = 'State-Action Q-Values'): 126 | grid = gridworld.grid 127 | blank() 128 | stateCrossActions = [[(state, action) for action in gridworld.getPossibleActions(state)] for state in gridworld.getStates()] 129 | qStates = reduce(lambda x,y: x+y, stateCrossActions, []) 130 | qValueList = [qValues[(state, action)] for state, action in qStates] + [0.0] 131 | minValue = min(qValueList) 132 | maxValue = max(qValueList) 133 | for x in range(grid.width): 134 | for y in range(grid.height): 135 | state = (x, y) 136 | gridType = grid[x][y] 137 | isExit = (str(gridType) != gridType) 138 | isCurrent = (currentState == state) 139 | actions = gridworld.getPossibleActions(state) 140 | if actions == None or len(actions) == 0: 141 | actions = [None] 142 | bestQ = max([qValues[(state, action)] for action in actions]) 143 | bestActions = [action for action in actions if qValues[(state, action)] == bestQ] 144 | 145 | q = util.Counter() 146 | valStrings = {} 147 | for action in actions: 148 | v = qValues[(state, action)] 149 | q[action] += v 150 | valStrings[action] = '%.2f' % v 151 | if gridType == '#': 152 | drawSquare(x, y, 0, 0, 0, None, None, True, False, isCurrent) 153 | elif isExit: 154 | action = 'exit' 155 | value = q[action] 156 | valString = '%.2f' % value 157 | drawSquare(x, y, value, minValue, maxValue, valString, action, False, isExit, isCurrent) 158 | else: 159 | drawSquareQ(x, y, q, minValue, maxValue, valStrings, actions, isCurrent) 160 | pos = to_screen(((grid.width - 1.0) / 2.0, - 0.8)) 161 | text( pos, TEXT_COLOR, message, "Courier", -32, "bold", "c") 162 | 163 | 164 | def blank(): 165 | clear_screen() 166 | 167 | def drawNullSquare(grid,x, y, isObstacle, isTerminal, isCurrent): 168 | 169 | square_color = getColor(0, -1, 1) 170 | 171 | if isObstacle: 172 | square_color = OBSTACLE_COLOR 173 | 174 | (screen_x, screen_y) = to_screen((x, y)) 175 | square( (screen_x, screen_y), 176 | 0.5* GRID_SIZE, 177 | color = square_color, 178 | filled = 1, 179 | width = 1) 180 | 181 | square( (screen_x, screen_y), 182 | 0.5* GRID_SIZE, 183 | color = EDGE_COLOR, 184 | filled = 0, 185 | width = 3) 186 | 187 | if isTerminal and not isObstacle: 188 | square( (screen_x, screen_y), 189 | 0.4* GRID_SIZE, 190 | color = EDGE_COLOR, 191 | filled = 0, 192 | width = 2) 193 | text( (screen_x, screen_y), 194 | TEXT_COLOR, 195 | str(grid[x][y]), 196 | "Courier", -24, "bold", "c") 197 | 198 | 199 | text_color = TEXT_COLOR 200 | 201 | if not isObstacle and isCurrent: 202 | circle( (screen_x, screen_y), 0.1*GRID_SIZE, LOCATION_COLOR, fillColor=LOCATION_COLOR ) 203 | 204 | # if not isObstacle: 205 | # text( (screen_x, screen_y), text_color, valStr, "Courier", 24, "bold", "c") 206 | 207 | def drawSquare(x, y, val, min, max, valStr, action, isObstacle, isTerminal, isCurrent): 208 | 209 | square_color = getColor(val, min, max) 
210 | 211 | if isObstacle: 212 | square_color = OBSTACLE_COLOR 213 | 214 | (screen_x, screen_y) = to_screen((x, y)) 215 | square( (screen_x, screen_y), 216 | 0.5* GRID_SIZE, 217 | color = square_color, 218 | filled = 1, 219 | width = 1) 220 | square( (screen_x, screen_y), 221 | 0.5* GRID_SIZE, 222 | color = EDGE_COLOR, 223 | filled = 0, 224 | width = 3) 225 | if isTerminal and not isObstacle: 226 | square( (screen_x, screen_y), 227 | 0.4* GRID_SIZE, 228 | color = EDGE_COLOR, 229 | filled = 0, 230 | width = 2) 231 | 232 | 233 | if action == 'north': 234 | polygon( [(screen_x, screen_y - 0.45*GRID_SIZE), (screen_x+0.05*GRID_SIZE, screen_y-0.40*GRID_SIZE), (screen_x-0.05*GRID_SIZE, screen_y-0.40*GRID_SIZE)], EDGE_COLOR, filled = 1, smoothed = False) 235 | if action == 'south': 236 | polygon( [(screen_x, screen_y + 0.45*GRID_SIZE), (screen_x+0.05*GRID_SIZE, screen_y+0.40*GRID_SIZE), (screen_x-0.05*GRID_SIZE, screen_y+0.40*GRID_SIZE)], EDGE_COLOR, filled = 1, smoothed = False) 237 | if action == 'west': 238 | polygon( [(screen_x-0.45*GRID_SIZE, screen_y), (screen_x-0.4*GRID_SIZE, screen_y+0.05*GRID_SIZE), (screen_x-0.4*GRID_SIZE, screen_y-0.05*GRID_SIZE)], EDGE_COLOR, filled = 1, smoothed = False) 239 | if action == 'east': 240 | polygon( [(screen_x+0.45*GRID_SIZE, screen_y), (screen_x+0.4*GRID_SIZE, screen_y+0.05*GRID_SIZE), (screen_x+0.4*GRID_SIZE, screen_y-0.05*GRID_SIZE)], EDGE_COLOR, filled = 1, smoothed = False) 241 | 242 | 243 | text_color = TEXT_COLOR 244 | 245 | if not isObstacle and isCurrent: 246 | circle( (screen_x, screen_y), 0.1*GRID_SIZE, outlineColor=LOCATION_COLOR, fillColor=LOCATION_COLOR ) 247 | 248 | if not isObstacle: 249 | text( (screen_x, screen_y), text_color, valStr, "Courier", -30, "bold", "c") 250 | 251 | 252 | def drawSquareQ(x, y, qVals, minVal, maxVal, valStrs, bestActions, isCurrent): 253 | 254 | (screen_x, screen_y) = to_screen((x, y)) 255 | 256 | center = (screen_x, screen_y) 257 | nw = (screen_x-0.5*GRID_SIZE, screen_y-0.5*GRID_SIZE) 258 | ne = (screen_x+0.5*GRID_SIZE, screen_y-0.5*GRID_SIZE) 259 | se = (screen_x+0.5*GRID_SIZE, screen_y+0.5*GRID_SIZE) 260 | sw = (screen_x-0.5*GRID_SIZE, screen_y+0.5*GRID_SIZE) 261 | n = (screen_x, screen_y-0.5*GRID_SIZE+5) 262 | s = (screen_x, screen_y+0.5*GRID_SIZE-5) 263 | w = (screen_x-0.5*GRID_SIZE+5, screen_y) 264 | e = (screen_x+0.5*GRID_SIZE-5, screen_y) 265 | 266 | actions = qVals.keys() 267 | for action in actions: 268 | 269 | wedge_color = getColor(qVals[action], minVal, maxVal) 270 | 271 | if action == 'north': 272 | polygon( (center, nw, ne), wedge_color, filled = 1, smoothed = False) 273 | #text(n, text_color, valStr, "Courier", 8, "bold", "n") 274 | if action == 'south': 275 | polygon( (center, sw, se), wedge_color, filled = 1, smoothed = False) 276 | #text(s, text_color, valStr, "Courier", 8, "bold", "s") 277 | if action == 'east': 278 | polygon( (center, ne, se), wedge_color, filled = 1, smoothed = False) 279 | #text(e, text_color, valStr, "Courier", 8, "bold", "e") 280 | if action == 'west': 281 | polygon( (center, nw, sw), wedge_color, filled = 1, smoothed = False) 282 | #text(w, text_color, valStr, "Courier", 8, "bold", "w") 283 | 284 | square( (screen_x, screen_y), 285 | 0.5* GRID_SIZE, 286 | color = EDGE_COLOR, 287 | filled = 0, 288 | width = 3) 289 | line(ne, sw, color = EDGE_COLOR) 290 | line(nw, se, color = EDGE_COLOR) 291 | 292 | if isCurrent: 293 | circle( (screen_x, screen_y), 0.1*GRID_SIZE, LOCATION_COLOR, fillColor=LOCATION_COLOR ) 294 | 295 | for action in actions: 296 | text_color = TEXT_COLOR 297 
| if qVals[action] < max(qVals.values()): text_color = MUTED_TEXT_COLOR 298 | valStr = "" 299 | if action in valStrs: 300 | valStr = valStrs[action] 301 | h = -20 302 | if action == 'north': 303 | #polygon( (center, nw, ne), wedge_color, filled = 1, smooth = 0) 304 | text(n, text_color, valStr, "Courier", h, "bold", "n") 305 | if action == 'south': 306 | #polygon( (center, sw, se), wedge_color, filled = 1, smooth = 0) 307 | text(s, text_color, valStr, "Courier", h, "bold", "s") 308 | if action == 'east': 309 | #polygon( (center, ne, se), wedge_color, filled = 1, smooth = 0) 310 | text(e, text_color, valStr, "Courier", h, "bold", "e") 311 | if action == 'west': 312 | #polygon( (center, nw, sw), wedge_color, filled = 1, smooth = 0) 313 | text(w, text_color, valStr, "Courier", h, "bold", "w") 314 | 315 | 316 | def getColor(val, minVal, max): 317 | r, g = 0.0, 0.0 318 | if val < 0 and minVal < 0: 319 | r = val * 0.65 / minVal 320 | if val > 0 and max > 0: 321 | g = val * 0.65 / max 322 | return formatColor(r,g,0.0) 323 | 324 | 325 | def square(pos, size, color, filled, width): 326 | x, y = pos 327 | dx, dy = size, size 328 | return polygon([(x - dx, y - dy), (x - dx, y + dy), (x + dx, y + dy), (x + dx, y - dy)], outlineColor=color, fillColor=color, filled=filled, width=width, smoothed=False) 329 | 330 | 331 | def to_screen(point): 332 | ( gamex, gamey ) = point 333 | x = gamex*GRID_SIZE + MARGIN 334 | y = (GRID_HEIGHT - gamey - 1)*GRID_SIZE + MARGIN 335 | return ( x, y ) 336 | 337 | def to_grid(point): 338 | (x, y) = point 339 | x = int ((y - MARGIN + GRID_SIZE * 0.5) / GRID_SIZE) 340 | y = int ((x - MARGIN + GRID_SIZE * 0.5) / GRID_SIZE) 341 | print point, "-->", (x, y) 342 | return (x, y) 343 | --------------------------------------------------------------------------------