├── __init__.py ├── lib ├── __init__.py ├── envs │ ├── __init__.py │ ├── discrete.py │ ├── windy_gridworld.py │ ├── cliff_walking.py │ ├── gridworld.py │ └── blackjack.py ├── atari │ ├── __init__.py │ ├── helpers.py │ └── state_processor.py └── plotting.py ├── DQN ├── .gitignore ├── README.md └── dqn.py ├── PolicyGradient ├── a3c │ ├── README.md │ ├── policy_monitor_test.py │ ├── worker_test.py │ ├── policy_monitor.py │ ├── estimator_test.py │ ├── train.py │ ├── estimators.py │ └── worker.py ├── README.md └── Continuous MountainCar Actor Critic Solution.ipynb ├── LICENSE ├── Introduction └── README.md ├── .gitignore ├── FA └── README.md ├── DP ├── README.md ├── Gamblers Problem.ipynb ├── Policy Evaluation Solution.ipynb ├── Value Iteration Solution.ipynb ├── Policy Evaluation.ipynb ├── Policy Iteration Solution.ipynb ├── Value Iteration.ipynb └── Policy Iteration.ipynb ├── TD ├── README.md ├── Cliff Environment Playground.ipynb └── Windy Gridworld Playground.ipynb ├── MDP └── README.md ├── MC ├── README.md ├── MC Prediction.ipynb ├── MC Control with Epsilon-Greedy Policies.ipynb ├── Off-Policy MC Control with Weighted Importance Sampling.ipynb └── Blackjack Playground.ipynb └── README.md /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/envs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /DQN/.gitignore: -------------------------------------------------------------------------------- 1 | experiments/ -------------------------------------------------------------------------------- /lib/atari/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /PolicyGradient/a3c/README.md: -------------------------------------------------------------------------------- 1 | ## Implementation of A3C (Asynchronous Advantage Actor-Critic) 2 | 3 | #### Running 4 | 5 | ``` 6 | ./train.py --model_dir /tmp/a3c --env Breakout-v0 --t_max 5 --eval_every 300 --parallelism 8 7 | ``` 8 | 9 | See `./train.py --help` for a full list of options. Then, monitor training progress in Tensorboard: 10 | 11 | ``` 12 | tensorboard --logdir=/tmp/a3c 13 | ``` 14 | 15 | #### Components 16 | 17 | - [`train.py`](train.py) contains the main method to start training. 18 | - [`estimators.py`](estimators.py) contains the Tensorflow graph definitions for the Policy and Value networks. 19 | - [`worker.py`](worker.py) contains code that runs in each worker threads. 20 | - [`policy_monitor.py`](policy_monitor.py) contains code that evaluates the policy network by running an episode and saving rewards to Tensorboard. 21 | -------------------------------------------------------------------------------- /lib/atari/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class AtariEnvWrapper(object): 4 | """ 5 | Wraps an Atari environment to end an episode when a life is lost. 
6 | """ 7 | def __init__(self, env): 8 | self.env = env 9 | 10 | def __getattr__(self, name): 11 | return getattr(self.env, name) 12 | 13 | def step(self, *args, **kwargs): 14 | lives_before = self.env.ale.lives() 15 | next_state, reward, done, info = self.env.step(*args, **kwargs) 16 | lives_after = self.env.ale.lives() 17 | 18 | # End the episode when a life is lost 19 | if lives_before > lives_after: 20 | done = True 21 | 22 | # Clip rewards to [-1,1] 23 | reward = max(min(reward, 1), -1) 24 | 25 | return next_state, reward, done, info 26 | 27 | def atari_make_initial_state(state): 28 | return np.stack([state] * 4, axis=2) 29 | 30 | def atari_make_next_state(state, next_state): 31 | return np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Denny Britz 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /lib/atari/state_processor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | class StateProcessor(): 5 | """ 6 | Processes a raw Atari iamges. Resizes it and converts it to grayscale. 7 | """ 8 | def __init__(self): 9 | # Build the Tensorflow graph 10 | with tf.variable_scope("state_processor"): 11 | self.input_state = tf.placeholder(shape=[210, 160, 3], dtype=tf.uint8) 12 | self.output = tf.image.rgb_to_grayscale(self.input_state) 13 | self.output = tf.image.crop_to_bounding_box(self.output, 34, 0, 160, 160) 14 | self.output = tf.image.resize_images( 15 | self.output, [84, 84], method=tf.image.ResizeMethod.NEAREST_NEIGHBOR) 16 | self.output = tf.squeeze(self.output) 17 | 18 | def process(self, state, sess=None): 19 | """ 20 | Args: 21 | sess: A Tensorflow session object 22 | state: A [210, 160, 3] Atari RGB State 23 | 24 | Returns: 25 | A processed [84, 84, 1] state representing grayscale values. 
26 | """ 27 | sess = sess or tf.get_default_session() 28 | return sess.run(self.output, { self.input_state: state }) -------------------------------------------------------------------------------- /Introduction/README.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | ### Learning Goals 4 | 5 | - Understand the Reinforcement Learning problem and how it differs from Supervised Learning 6 | 7 | 8 | ### Summary 9 | 10 | - Reinforcement Learning (RL) is concerned with goal-directed learning and decision-making. 11 | - In RL an agent learns from experiences it gains by interacting with the environment. In Supervised Learning we cannot affect the environment. 12 | - In RL rewards are often delayed in time and the agent tries to maximize a long-term goal. For example, one may need to make seemingly suboptimal moves to reach a winning position in a game. 13 | - An agent interacts with the environment via states, actions and rewards. 14 | 15 | 16 | ### Lectures & Readings 17 | 18 | **Required:** 19 | 20 | - [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 1: The Reinforcement Learning Problem 21 | - David Silver's RL Course Lecture 1 - Introduction to Reinforcement Learning ([video](https://www.youtube.com/watch?v=2pWv7GOvuf0), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/intro_RL.pdf)) 22 | - [OpenAI Gym Tutorial](https://gym.openai.com/docs) 23 | 24 | **Optional:** 25 | 26 | N/A 27 | 28 | 29 | ### Exercises 30 | 31 | - [Work through the OpenAI Gym Tutorial](https://gym.openai.com/docs) 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python ### 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | experiments/ 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # IPython Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # dotenv 80 | .env 81 | 82 | # virtualenv 83 | venv/ 84 | ENV/ 85 | 86 | # Spyder project settings 87 | .spyderproject 88 | 89 | # Rope project settings 90 | .ropeproject 91 | 92 | 93 | ### IPythonNotebook ### 94 | # Temporary data 95 | .ipynb_checkpoints/ -------------------------------------------------------------------------------- /lib/envs/discrete.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from gym import Env, spaces 4 | from gym.utils import seeding 5 | from gym.envs.toy_text.utils import categorical_sample 6 | 7 | class DiscreteEnv(Env): 8 | 9 | """ 10 | Has the following members 11 | - nS: number of states 12 | - nA: number of actions 13 | - P: transitions (*) 14 | - isd: initial state distribution (**) 15 | 16 | (*) dictionary of lists, where 17 | P[s][a] == [(probability, nextstate, reward, done), ...] 18 | (**) list or array of length nS 19 | 20 | 21 | """ 22 | 23 | def __init__(self, nS, nA, P, isd): 24 | self.P = P 25 | self.isd = isd 26 | self.lastaction = None # for rendering 27 | self.nS = nS 28 | self.nA = nA 29 | 30 | self.action_space = spaces.Discrete(self.nA) 31 | self.observation_space = spaces.Discrete(self.nS) 32 | 33 | self.seed() 34 | self.s = categorical_sample(self.isd, self.np_random) 35 | 36 | def seed(self, seed=None): 37 | self.np_random, seed = seeding.np_random(seed) 38 | return [seed] 39 | 40 | def reset(self): 41 | self.s = categorical_sample(self.isd, self.np_random) 42 | self.lastaction = None 43 | return int(self.s) 44 | 45 | def step(self, a): 46 | transitions = self.P[self.s][a] 47 | i = categorical_sample([t[0] for t in transitions], self.np_random) 48 | p, s, r, d = transitions[i] 49 | self.s = s 50 | self.lastaction = a 51 | return (int(s), r, d, {"prob": p}) 52 | -------------------------------------------------------------------------------- /PolicyGradient/a3c/policy_monitor_test.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import sys 3 | import os 4 | import itertools 5 | import collections 6 | import unittest 7 | import numpy as np 8 | import tensorflow as tf 9 | import tempfile 10 | 11 | from inspect import getsourcefile 12 | current_path = os.path.dirname(os.path.abspath(getsourcefile(lambda:0))) 13 | import_path = os.path.abspath(os.path.join(current_path, "../..")) 14 | 15 | if import_path not in sys.path: 16 | sys.path.append(import_path) 17 | 18 | # from lib import plotting 19 | from lib.atari.state_processor import StateProcessor 20 | from lib.atari import helpers as atari_helpers 21 | from policy_monitor import PolicyMonitor 22 | from estimators import ValueEstimator, PolicyEstimator 23 | 24 | def make_env(): 25 | return gym.envs.make("Breakout-v0") 26 | 27 | VALID_ACTIONS 
= [0, 1, 2, 3] 28 | 29 | class PolicyMonitorTest(tf.test.TestCase): 30 | def setUp(self): 31 | super(PolicyMonitorTest, self).setUp() 32 | 33 | self.env = make_env() 34 | self.global_step = tf.Variable(0, name="global_step", trainable=False) 35 | self.summary_writer = tf.train.SummaryWriter(tempfile.mkdtemp()) 36 | 37 | with tf.variable_scope("global") as vs: 38 | self.global_policy_net = PolicyEstimator(len(VALID_ACTIONS)) 39 | self.global_value_net = ValueEstimator(reuse=True) 40 | 41 | def testEvalOnce(self): 42 | pe = PolicyMonitor( 43 | env=self.env, 44 | policy_net=self.global_policy_net, 45 | summary_writer=self.summary_writer) 46 | 47 | with self.test_session() as sess: 48 | sess.run(tf.initialize_all_variables()) 49 | total_reward, episode_length = pe.eval_once(sess) 50 | self.assertTrue(episode_length > 0) 51 | 52 | 53 | if __name__ == '__main__': 54 | unittest.main() -------------------------------------------------------------------------------- /FA/README.md: -------------------------------------------------------------------------------- 1 | ## Function Approximation 2 | 3 | ### Learning Goals 4 | 5 | - Understand the motivation for Function Approximation over Table Lookup 6 | - Understand how to incorporate function approximation into existing algorithms 7 | - Understand convergence properties of function approximators and RL algorithms 8 | - Understand batching using experience replay 9 | 10 | 11 | ### Summary 12 | 13 | - Building a big table, one value for each state or state-action pair, is memory- and data-inefficient. Function Approximation can generalize to unseen states by using a featurized state representation. 14 | - Treat RL as supervised learning problem with the MC- or TD-target as the label and the current state/action as the input. Often the target also depends on the function estimator but we simply ignore its gradient. That's why these methods are called semi-gradient methods. 15 | - Challenge: We have non-stationary (policy changes, bootstrapping) and non-iid (correlated in time) data. 16 | - Many methods assume that our action space is discrete because they rely on calculating the argmax over all actions. Large and continuous action spaces are ongoing research. 17 | - For Control very few convergence guarantees exist. For non-linear approximators there are basically no guarantees at all. But they tend to work in practice. 18 | - Experience Replay: Store experience as dataset, randomize it, and repeatedly apply minibatch SGD. 19 | - Tricks to stabilize non-linear function approximators: Fixed Targets. The target is calculated based on frozen parameter values from a previous time step. 20 | - For the non-episodic (continuing) case function approximation is more complex and we need to give up discounting and use an "average reward" formulation. 
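The semi-gradient update described in the summary above can be made concrete in a few lines of NumPy. The sketch below is illustrative only (the feature representation, step size, and discount factor are made-up choices for this example, not the exercise solution), but it shows why these methods are called "semi-gradient": the TD target is treated as a constant when the gradient is taken.

```python
import numpy as np

# Minimal sketch of semi-gradient Q-learning with a linear function
# approximator: Q(s, a) = phi(s, a) . w. The feature vectors below are
# random placeholders; a real agent would compute them from the state.

def semi_gradient_q_update(w, phi_sa, reward, phi_next, done,
                           alpha=0.01, discount_factor=0.99):
    """One update of the weight vector w.

    phi_sa:   feature vector of the (state, action) pair taken, shape (d,)
    phi_next: feature matrix of the next state, one row per action, shape (nA, d)
    """
    q_sa = phi_sa @ w
    target = reward if done else reward + discount_factor * np.max(phi_next @ w)
    # The target also depends on w, but we ignore that dependency when
    # differentiating the squared TD error; hence "semi-gradient".
    w += alpha * (target - q_sa) * phi_sa
    return w

# Tiny usage example with made-up features (4 features, 2 actions)
rng = np.random.default_rng(0)
w = np.zeros(4)
phi_sa = rng.normal(size=4)
phi_next = rng.normal(size=(2, 4))
w = semi_gradient_q_update(w, phi_sa, reward=-1.0, phi_next=phi_next, done=False)
```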
21 | 22 | 23 | ### Lectures & Readings 24 | 25 | **Required:** 26 | 27 | - David Silver's RL Course Lecture 6 - Value Function Approximation ([video](https://www.youtube.com/watch?v=UoPei5o4fps), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/FA.pdf)) 28 | - [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 9: On-policy Prediction with Approximation 29 | - [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 10: On-policy Control with Approximation 30 | 31 | **Optional:** 32 | 33 | - [Tutorial: Introduction to Reinforcement Learning with Function Approximation](https://www.youtube.com/watch?v=ggqnxyjaKe4) 34 | 35 | 36 | ### Exercises 37 | 38 | - Get familiar with the [Mountain Car Playground](MountainCar%20Playground.ipynb) 39 | 40 | - Solve Mountain Car Problem using Q-Learning with Linear Function Approximation 41 | - [Exercise](Q-Learning%20with%20Value%20Function%20Approximation.ipynb) 42 | - [Solution](Q-Learning%20with%20Value%20Function%20Approximation%20Solution.ipynb) 43 | -------------------------------------------------------------------------------- /DP/README.md: -------------------------------------------------------------------------------- 1 | ## Model-Based RL: Policy and Value Iteration using Dynamic Programming 2 | 3 | ### Learning Goals 4 | 5 | - Understand the difference between Policy Evaluation and Policy Improvement and how these processes interact 6 | - Understand the Policy Iteration Algorithm 7 | - Understand the Value Iteration Algorithm 8 | - Understand the Limitations of Dynamic Programming Approaches 9 | 10 | 11 | ### Summary 12 | 13 | - Dynamic Programming (DP) methods assume that we have a perfect model of the environment's Markov Decision Process (MDP). That's usually not the case in practice, but it's important to study DP anyway. 14 | - Policy Evaluation: Calculates the state-value function `V(s)` for a given policy. In DP this is done using a "full backup". At each state, we look ahead one step at each possible action and next state. We can only do this because we have a perfect model of the environment. 15 | - Full backups are basically the Bellman equations turned into updates. 16 | - Policy Improvement: Given the correct state-value function for a policy we can act greedily with respect to it (i.e. pick the best action at each state). Then we are guaranteed to improve the policy or keep it fixed if it's already optimal. 17 | - Policy Iteration: Iteratively perform Policy Evaluation and Policy Improvement until we reach the optimal policy. 18 | - Value Iteration: Instead of doing multiple steps of Policy Evaluation to find the "correct" V(s) we only do a single step and improve the policy immediately. In practice, this converges faster. 19 | - Generalized Policy Iteration: The process of iteratively doing policy evaluation and improvement. We can pick different algorithms for each of these steps but the basic idea stays the same. 20 | - DP methods bootstrap: They update estimates based on other estimates (one step ahead). 
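As an illustration of the "full backup" described in the summary above, here is a minimal sketch of iterative Policy Evaluation. It is not the repo's solution notebook; the in-place update and the convergence threshold `theta` are choices made for this example. It expects transitions in the same `P[s][a] == [(prob, next_state, reward, done), ...]` format used by the environments in `lib/envs`.

```python
import numpy as np

def policy_eval(policy, P, nS, discount_factor=1.0, theta=1e-8):
    """Evaluate a policy given the full model of the environment.

    policy: array of shape (nS, nA) with action probabilities per state
    P:      transitions, P[s][a] == [(prob, next_state, reward, done), ...]
    """
    V = np.zeros(nS)
    while True:
        delta = 0.0
        for s in range(nS):
            v = 0.0
            # Full backup: look ahead one step over all actions and successors
            for a, action_prob in enumerate(policy[s]):
                for prob, next_state, reward, done in P[s][a]:
                    v += action_prob * prob * (reward + discount_factor * V[next_state])
            delta = max(delta, abs(v - V[s]))
            V[s] = v  # in-place update; a two-array version also works
        if delta < theta:
            break
    return V
```

With the `GridworldEnv` from `lib/envs/gridworld.py`, a uniform random policy would be evaluated as `policy_eval(np.ones([env.nS, env.nA]) / env.nA, env.P, env.nS)`.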
21 | 22 | 23 | ### Lectures & Readings 24 | 25 | **Required:** 26 | 27 | - David Silver's RL Course Lecture 3 - Planning by Dynamic Programming ([video](https://www.youtube.com/watch?v=Nd1-UUMVfz4), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/DP.pdf)) 28 | 29 | **Optional:** 30 | 31 | - [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 4: Dynamic Programming 32 | 33 | 34 | ### Exercises 35 | 36 | - Implement Policy Evaluation in Python (Gridworld) 37 | - [Exercise](Policy%20Evaluation.ipynb) 38 | - [Solution](Policy%20Evaluation%20Solution.ipynb) 39 | 40 | - Implement Policy Iteration in Python (Gridworld) 41 | - [Exercise](Policy%20Iteration.ipynb) 42 | - [Solution](Policy%20Iteration%20Solution.ipynb) 43 | 44 | - Implement Value Iteration in Python (Gridworld) 45 | - [Exercise](Value%20Iteration.ipynb) 46 | - [Solution](Value%20Iteration%20Solution.ipynb) 47 | 48 | - Implement Gambler's Problem 49 | - [Exercise](Gamblers%20Problem.ipynb) 50 | - [Solution](Gamblers%20Problem%20Solution.ipynb) -------------------------------------------------------------------------------- /TD/README.md: -------------------------------------------------------------------------------- 1 | ## Model-Free Prediction & Control with Temporal Difference (TD) and Q-Learning 2 | 3 | 4 | ### Learning Goals 5 | 6 | - Understand TD(0) for prediction 7 | - Understand SARSA for on-policy control 8 | - Understand Q-Learning for off-policy control 9 | - Understand the benefits of TD algorithms over MC and DP approaches 10 | - Understand how n-step methods unify MC and TD approaches 11 | - Understand the backward and forward view of TD-Lambda 12 | 13 | 14 | ### Summary 15 | 16 | - TD-Learning is a combination of Monte Carlo and Dynamic Programming ideas. Like Monte Carlo, TD works based on samples and doesn't require a model of the environment. Like Dynamic Programming, TD uses bootstrapping to make updates. 17 | - Whether MC or TD is better depends on the problem and there are no theoretical results that prove a clear winner. 18 | - General Update Rule: `Q[s,a] += learning_rate * (td_target - Q[s,a])`. `td_target - Q[s,a]` is also called the TD Error. 19 | - SARSA: On-Policy TD Control 20 | - TD Target for SARSA: `R[t+1] + discount_factor * Q[next_state][next_action]` 21 | - Q-Learning: Off-policy TD Control 22 | - TD Target for Q-Learning: `R[t+1] + discount_factor * max(Q[next_state])` 23 | - Q-Learning has a positive bias because it uses the maximum of estimated Q values to estimate the maximum action value, all from the same experience. Double Q-Learning gets around this by splitting the experience and using different Q functions for maximization and estimation. 24 | - N-Step methods unify MC and TD approaches. They making updates based on n-steps instead of a single step (TD-0) or a full episode (MC). 
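To make the general update rule and the two TD targets above concrete, here is a minimal tabular sketch. The step size, discount factor, and toy state indices are made-up values for illustration; this is not the SARSA or Q-Learning exercise solution.

```python
import numpy as np
from collections import defaultdict

def sarsa_update(Q, s, a, reward, s_next, a_next, alpha=0.5, discount_factor=1.0):
    # On-policy: the target uses the action that will actually be taken next
    td_target = reward + discount_factor * Q[s_next][a_next]
    Q[s][a] += alpha * (td_target - Q[s][a])

def q_learning_update(Q, s, a, reward, s_next, alpha=0.5, discount_factor=1.0):
    # Off-policy: the target uses the greedy action in the next state
    td_target = reward + discount_factor * np.max(Q[s_next])
    Q[s][a] += alpha * (td_target - Q[s][a])

# Toy usage: Q maps a state index to an array of action values
n_actions = 4
Q = defaultdict(lambda: np.zeros(n_actions))
q_learning_update(Q, s=36, a=0, reward=-1.0, s_next=24)
```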
25 | 26 | 27 | ### Lectures & Readings 28 | 29 | **Required:** 30 | 31 | - [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 6: Temporal-Difference Learning 32 | - David Silver's RL Course Lecture 4 - Model-Free Prediction ([video](https://www.youtube.com/watch?v=PnHCvfgC_ZA), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/MC-TD.pdf)) 33 | - David Silver's RL Course Lecture 5 - Model-Free Control ([video](https://www.youtube.com/watch?v=0g4j2k_Ggc4), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/control.pdf)) 34 | 35 | **Optional:** 36 | 37 | - [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 7: Multi-Step Bootstrapping 38 | - [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 12: Eligibility Traces 39 | 40 | 41 | ### Exercises 42 | 43 | - Get familiar with the [Windy Gridworld Playground](Windy%20Gridworld%20Playground.ipynb) 44 | - Implement SARSA 45 | - [Exercise](SARSA.ipynb) 46 | - [Solution](SARSA%20Solution.ipynb) 47 | - Get familiar with the [Cliff Environment Playground](Cliff%20Environment%20Playground.ipynb) 48 | - Implement Q-Learning in Python 49 | - [Exercise](Q-Learning.ipynb) 50 | - [Solution](Q-Learning%20Solution.ipynb) 51 | -------------------------------------------------------------------------------- /lib/envs/windy_gridworld.py: -------------------------------------------------------------------------------- 1 | import io 2 | import gym 3 | import numpy as np 4 | import sys 5 | 6 | from . import discrete 7 | 8 | UP = 0 9 | RIGHT = 1 10 | DOWN = 2 11 | LEFT = 3 12 | 13 | class WindyGridworldEnv(discrete.DiscreteEnv): 14 | 15 | metadata = {'render.modes': ['human', 'ansi']} 16 | 17 | def _limit_coordinates(self, coord): 18 | coord[0] = min(coord[0], self.shape[0] - 1) 19 | coord[0] = max(coord[0], 0) 20 | coord[1] = min(coord[1], self.shape[1] - 1) 21 | coord[1] = max(coord[1], 0) 22 | return coord 23 | 24 | def _calculate_transition_prob(self, current, delta, winds): 25 | new_position = np.array(current) + np.array(delta) + np.array([-1, 0]) * winds[tuple(current)] 26 | new_position = self._limit_coordinates(new_position).astype(int) 27 | new_state = np.ravel_multi_index(tuple(new_position), self.shape) 28 | is_done = tuple(new_position) == (3, 7) 29 | return [(1.0, new_state, -1.0, is_done)] 30 | 31 | def __init__(self): 32 | self.shape = (7, 10) 33 | 34 | nS = np.prod(self.shape) 35 | nA = 4 36 | 37 | # Wind strength 38 | winds = np.zeros(self.shape) 39 | winds[:,[3,4,5,8]] = 1 40 | winds[:,[6,7]] = 2 41 | 42 | # Calculate transition probabilities 43 | P = {} 44 | for s in range(nS): 45 | position = np.unravel_index(s, self.shape) 46 | P[s] = { a : [] for a in range(nA) } 47 | P[s][UP] = self._calculate_transition_prob(position, [-1, 0], winds) 48 | P[s][RIGHT] = self._calculate_transition_prob(position, [0, 1], winds) 49 | P[s][DOWN] = self._calculate_transition_prob(position, [1, 0], winds) 50 | P[s][LEFT] = self._calculate_transition_prob(position, [0, -1], winds) 51 | 52 | # We always start in state (3, 0) 53 | isd = np.zeros(nS) 54 | isd[np.ravel_multi_index((3,0), self.shape)] = 1.0 55 | 56 | super(WindyGridworldEnv, self).__init__(nS, nA, P, isd) 57 | 58 | def render(self, mode='human', close=False): 59 | self._render(mode, close) 60 | 61 | def _render(self, mode='human', close=False): 62 | if close: 63 | return 64 | 65 | outfile = io.StringIO() if mode == 'ansi' else 
sys.stdout 66 | 67 | for s in range(self.nS): 68 | position = np.unravel_index(s, self.shape) 69 | # print(self.s) 70 | if self.s == s: 71 | output = " x " 72 | elif position == (3,7): 73 | output = " T " 74 | else: 75 | output = " o " 76 | 77 | if position[1] == 0: 78 | output = output.lstrip() 79 | if position[1] == self.shape[1] - 1: 80 | output = output.rstrip() 81 | output += "\n" 82 | 83 | outfile.write(output) 84 | outfile.write("\n") 85 | -------------------------------------------------------------------------------- /lib/envs/cliff_walking.py: -------------------------------------------------------------------------------- 1 | import io 2 | import numpy as np 3 | import sys 4 | 5 | from . import discrete 6 | 7 | UP = 0 8 | RIGHT = 1 9 | DOWN = 2 10 | LEFT = 3 11 | 12 | class CliffWalkingEnv(discrete.DiscreteEnv): 13 | 14 | metadata = {'render.modes': ['human', 'ansi']} 15 | 16 | def _limit_coordinates(self, coord): 17 | coord[0] = min(coord[0], self.shape[0] - 1) 18 | coord[0] = max(coord[0], 0) 19 | coord[1] = min(coord[1], self.shape[1] - 1) 20 | coord[1] = max(coord[1], 0) 21 | return coord 22 | 23 | def _calculate_transition_prob(self, current, delta): 24 | new_position = np.array(current) + np.array(delta) 25 | new_position = self._limit_coordinates(new_position).astype(int) 26 | new_state = np.ravel_multi_index(tuple(new_position), self.shape) 27 | reward = -100.0 if self._cliff[tuple(new_position)] else -1.0 28 | is_done = self._cliff[tuple(new_position)] or (tuple(new_position) == (3,11)) 29 | return [(1.0, new_state, reward, is_done)] 30 | 31 | def __init__(self): 32 | self.shape = (4, 12) 33 | 34 | nS = np.prod(self.shape) 35 | nA = 4 36 | 37 | # Cliff Location 38 | self._cliff = np.zeros(self.shape, dtype=np.bool) 39 | self._cliff[3, 1:-1] = True 40 | 41 | # Calculate transition probabilities 42 | P = {} 43 | for s in range(nS): 44 | position = np.unravel_index(s, self.shape) 45 | P[s] = { a : [] for a in range(nA) } 46 | P[s][UP] = self._calculate_transition_prob(position, [-1, 0]) 47 | P[s][RIGHT] = self._calculate_transition_prob(position, [0, 1]) 48 | P[s][DOWN] = self._calculate_transition_prob(position, [1, 0]) 49 | P[s][LEFT] = self._calculate_transition_prob(position, [0, -1]) 50 | 51 | # We always start in state (3, 0) 52 | isd = np.zeros(nS) 53 | isd[np.ravel_multi_index((3,0), self.shape)] = 1.0 54 | 55 | super(CliffWalkingEnv, self).__init__(nS, nA, P, isd) 56 | 57 | def render(self, mode='human', close=False): 58 | self._render(mode, close) 59 | 60 | def _render(self, mode='human', close=False): 61 | if close: 62 | return 63 | 64 | outfile = io.StringIO() if mode == 'ansi' else sys.stdout 65 | 66 | for s in range(self.nS): 67 | position = np.unravel_index(s, self.shape) 68 | # print(self.s) 69 | if self.s == s: 70 | output = " x " 71 | elif position == (3,11): 72 | output = " T " 73 | elif self._cliff[position]: 74 | output = " C " 75 | else: 76 | output = " o " 77 | 78 | if position[1] == 0: 79 | output = output.lstrip() 80 | if position[1] == self.shape[1] - 1: 81 | output = output.rstrip() 82 | output += "\n" 83 | 84 | outfile.write(output) 85 | outfile.write("\n") 86 | -------------------------------------------------------------------------------- /DQN/README.md: -------------------------------------------------------------------------------- 1 | ## Deep Q-Learning 2 | 3 | ### Learning Goals 4 | 5 | - Understand the Deep Q-Learning (DQN) algorithm 6 | - Understand why Experience Replay and a Target Network are necessary to make Deep Q-Learning work in 
practice 7 | - (Optional) Understand Double Deep Q-Learning 8 | - (Optional) Understand Prioritized Experience Replay 9 | 10 | 11 | ### Summary 12 | 13 | - DQN: Q-Learning but with a Deep Neural Network as a function approximator. 14 | - Using a non-linear Deep Neural Network is powerful, but training is unstable if we apply it naively. 15 | - Trick 1 - Experience Replay: Store experience `(S, A, R, S_next)` in a replay buffer and sample minibatches from it to train the network. This decorrelates the data and leads to better data efficiency. In the beginning, the replay buffer is filled with random experience. 16 | - Trick 2 - Target Network: Use a separate network to estimate the TD target. This target network has the same architecture as the function approximator but with frozen parameters. Every T steps (a hyperparameter) the parameters from the Q network are copied to the target network. This leads to more stable training because it keeps the target function fixed (for a while). 17 | - By using a Convolutional Neural Network as the function approximator on raw pixels of Atari games where the score is the reward we can learn to play many of those games at human-like performance. 18 | - Double DQN: Just like regular Q-Learning, DQN tends to overestimate values due to its max operation applied to both selecting and estimating actions. We get around this by using the Q network for selection and the target network for estimation when making updates. 19 | 20 | 21 | ### Lectures & Readings 22 | 23 | **Required:** 24 | 25 | - [Human-Level Control through Deep Reinforcement Learning](http://www.readcube.com/articles/10.1038/nature14236) 26 | - [Demystifying Deep Reinforcement Learning](https://ai.intel.com/demystifying-deep-reinforcement-learning/) 27 | - David Silver's RL Course Lecture 6 - Value Function Approximation ([video](https://www.youtube.com/watch?v=UoPei5o4fps), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/FA.pdf)) 28 | 29 | **Optional:** 30 | 31 | - [Using Keras and Deep Q-Network to Play FlappyBird](https://yanpanlau.github.io/2016/07/10/FlappyBird-Keras.html) 32 | - [Deep Reinforcement Learning with Double Q-learning](http://arxiv.org/abs/1509.06461) 33 | - [Prioritized Experience Replay](http://arxiv.org/abs/1511.05952) 34 | 35 | **Deep Learning:** 36 | 37 | - [Tensorflow](http://www.tensorflow.org) 38 | - [Deep Learning Books](http://www.deeplearningbook.org/) 39 | 40 | ### Exercises 41 | 42 | - Get familiar with the [OpenAI Gym Atari Environment Playground](Breakout%20Playground.ipynb) 43 | - Deep-Q Learning for Atari Games 44 | - [Exercise](Deep%20Q%20Learning.ipynb) 45 | - [Solution](Deep%20Q%20Learning%20Solution.ipynb) 46 | - Double-Q Learning 47 | - This is a minimal change to Q-Learning so use the same exercise as above 48 | - [Solution](Double%20DQN%20Solution.ipynb) 49 | - Prioritized Experience Replay (WIP) 50 | -------------------------------------------------------------------------------- /MDP/README.md: -------------------------------------------------------------------------------- 1 | ## MDPs and Bellman Equations 2 | 3 | ### Learning Goals 4 | 5 | - Understand the Agent-Environment interface 6 | - Understand what MDPs (Markov Decision Processes) are and how to interpret transition diagrams 7 | - Understand Value Functions, Action-Value Functions, and Policy Functions 8 | - Understand the Bellman Equations and Bellman Optimality Equations for value functions and action-value functions 9 | 10 | 11 | ### Summary 12 | 13 | - Agent & 
Environment Interface: At each step `t` the agent receives a state `S_t`, performs an action `A_t` and receives a reward `R_{t+1}`. The action is chosen according to a policy function `pi`. 14 | - The total return `G_t` is the sum of all rewards starting from time `t`. A reward received `k` steps in the future is discounted by `gamma^k`. 15 | - Markov property: The environment's response at time `t+1` depends only on the state and action representations at time `t`. The future is independent of the past given the present. Even if an environment doesn't fully satisfy the Markov property we still treat it as if it were Markov and try to construct the state representation to be approximately Markov. 16 | - Markov Decision Process (MDP): Defined by a state set `S`, an action set `A` and one-step dynamics `p(s',r | s,a)`. If we have complete knowledge of the environment we know the transition dynamics. In practice, we often don't know the full MDP (but we know that it's some MDP). 17 | - The Value Function `v(s)` estimates how "good" it is for an agent to be in a particular state. More formally, it's the expected return `G_t` given that the agent is in state `s`. `v(s) = Ex[G_t | S_t = s]`. Note that the value function is specific to a given policy `pi`. 18 | - The Action-Value Function `q(s, a)` estimates how "good" it is for an agent to be in state `s` and take action `a`. Similar to the value function, but it also considers the action. 19 | - The Bellman equation expresses the relationship between the value of a state and the values of its successor states. It can be expressed using a "backup" diagram. Bellman equations exist for both the value function and the action-value function. 20 | - Value functions define an ordering over policies. A policy `p1` is better than `p2` if `v_p1(s) >= v_p2(s)` for all states `s`. For MDPs, there exist one or more optimal policies that are better than or equal to all other policies. 21 | - The optimal state-value function `v*(s)` is the value function for the optimal policy. The same holds for `q*(s, a)`. The Bellman Optimality Equation defines how the optimal value of a state is related to the optimal value of successor states. It has a "max" instead of an average. 22 | 23 | 24 | ### Lectures & Readings 25 | 26 | **Required:** 27 | 28 | - [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 3: Finite Markov Decision Processes 29 | - David Silver's RL Course Lecture 2 - Markov Decision Processes ([video](https://www.youtube.com/watch?v=lfHX2hHRMVQ), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/MDP.pdf)) 30 | 31 | 32 | ### Exercises 33 | 34 | This chapter is mostly theory so there are no exercises.
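Although there are no exercises, the Bellman expectation equation can be checked numerically in a few lines. The two-state MDP below (its transition probabilities and rewards) is entirely made up for illustration.

```python
import numpy as np

discount_factor = 0.9
P_pi = np.array([[0.8, 0.2],   # state-to-state transition probabilities under pi
                 [0.1, 0.9]])
r_pi = np.array([1.0, -1.0])   # expected immediate reward in each state under pi

# Solve the Bellman expectation equation v = r_pi + gamma * P_pi @ v directly
v = np.linalg.solve(np.eye(2) - discount_factor * P_pi, r_pi)

# A one-step Bellman backup leaves the true value function unchanged
assert np.allclose(v, r_pi + discount_factor * P_pi @ v)
```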
35 | -------------------------------------------------------------------------------- /TD/Cliff Environment Playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import gym\n", 10 | "import numpy as np\n", 11 | "import sys\n", 12 | "\n", 13 | "if \"../\" not in sys.path:\n", 14 | " sys.path.append(\"../\") \n", 15 | "\n", 16 | "from lib.envs.cliff_walking import CliffWalkingEnv" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "36\n", 29 | "o o o o o o o o o o o o\n", 30 | "o o o o o o o o o o o o\n", 31 | "o o o o o o o o o o o o\n", 32 | "x C C C C C C C C C C T\n", 33 | "\n", 34 | "(24, -1.0, False, {'prob': 1.0})\n", 35 | "o o o o o o o o o o o o\n", 36 | "o o o o o o o o o o o o\n", 37 | "x o o o o o o o o o o o\n", 38 | "o C C C C C C C C C C T\n", 39 | "\n", 40 | "(25, -1.0, False, {'prob': 1.0})\n", 41 | "o o o o o o o o o o o o\n", 42 | "o o o o o o o o o o o o\n", 43 | "o x o o o o o o o o o o\n", 44 | "o C C C C C C C C C C T\n", 45 | "\n", 46 | "(26, -1.0, False, {'prob': 1.0})\n", 47 | "o o o o o o o o o o o o\n", 48 | "o o o o o o o o o o o o\n", 49 | "o o x o o o o o o o o o\n", 50 | "o C C C C C C C C C C T\n", 51 | "\n", 52 | "(38, -100.0, True, {'prob': 1.0})\n", 53 | "o o o o o o o o o o o o\n", 54 | "o o o o o o o o o o o o\n", 55 | "o o o o o o o o o o o o\n", 56 | "o C x C C C C C C C C T\n", 57 | "\n" 58 | ] 59 | } 60 | ], 61 | "source": [ 62 | "env = CliffWalkingEnv()\n", 63 | "\n", 64 | "print(env.reset())\n", 65 | "env.render()\n", 66 | "\n", 67 | "print(env.step(0))\n", 68 | "env.render()\n", 69 | "\n", 70 | "print(env.step(1))\n", 71 | "env.render()\n", 72 | "\n", 73 | "print(env.step(1))\n", 74 | "env.render()\n", 75 | "\n", 76 | "print(env.step(2))\n", 77 | "env.render()" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [] 86 | } 87 | ], 88 | "metadata": { 89 | "kernelspec": { 90 | "display_name": "Python 3", 91 | "language": "python", 92 | "name": "python3" 93 | }, 94 | "language_info": { 95 | "codemirror_mode": { 96 | "name": "ipython", 97 | "version": 3 98 | }, 99 | "file_extension": ".py", 100 | "mimetype": "text/x-python", 101 | "name": "python", 102 | "nbconvert_exporter": "python", 103 | "pygments_lexer": "ipython3", 104 | "version": "3.6.4" 105 | } 106 | }, 107 | "nbformat": 4, 108 | "nbformat_minor": 1 109 | } 110 | -------------------------------------------------------------------------------- /MC/README.md: -------------------------------------------------------------------------------- 1 | ## Model-Free Prediction & Control with Monte Carlo (MC) 2 | 3 | 4 | ### Learning Goals 5 | 6 | - Understand the difference between Prediction and Control 7 | - Know how to use the MC method for predicting state values and state-action values 8 | - Understand the on-policy first-visit MC control algorithm 9 | - Understand off-policy MC control algorithms 10 | - Understand Weighted Importance Sampling 11 | - Understand the benefits of MC algorithms over the Dynamic Programming approach 12 | 13 | 14 | ### Summary 15 | 16 | - Dynamic Programming approaches assume complete knowledge of the environment (the MDP). 
In practice, we often don't have full knowledge of how the world works. 17 | - Monte Carlo (MC) methods can learn directly from experience collected by interacting with the environment. An episode of experience is a series of `(State, Action, Reward, Next State)` tuples. 18 | - MC methods work based on episodes. We sample episodes of experience and make updates to our estimates at the end of each episode. MC methods have high variance (due to lots of random decisions within an episode) but are unbiased. 19 | - MC Policy Evaluation: Given a policy, we want to estimate the state-value function V(s). Sample episodes of experience and estimate V(s) to be the reward received from that state onwards averaged across all of your experience. The same technique works for the action-value function Q(s, a). Given enough samples, this is proven to converge. 20 | - MC Control: Idea is the same as for Dynamic Programming. Use MC Policy Evaluation to evaluate the current policy then improve the policy greedily. The Problem: How do we ensure that we explore all states if we don't know the full environment? 21 | - Solution to exploration problem: Use epsilon-greedy policies instead of full greedy policies. When making a decision act randomly with probability epsilon. This will learn the optimal epsilon-greedy policy. 22 | - Off-Policy Learning: How can we learn about the actual optimal (greedy) policy while following an exploratory (epsilon-greedy) policy? We can use importance sampling, which weighs returns by their probability of occurring under the policy we want to learn about. 23 | 24 | 25 | ### Lectures & Readings 26 | 27 | **Required:** 28 | 29 | - [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 5: Monte Carlo Methods 30 | 31 | 32 | **Optional:** 33 | 34 | - David Silver's RL Course Lecture 4 - Model-Free Prediction ([video](https://www.youtube.com/watch?v=PnHCvfgC_ZA), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/MC-TD.pdf)) 35 | - David Silver's RL Course Lecture 5 - Model-Free Control ([video](https://www.youtube.com/watch?v=0g4j2k_Ggc4), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/control.pdf)) 36 | 37 | 38 | ### Exercises 39 | 40 | - Get familiar with the [Blackjack environment (Blackjack-v0)](Blackjack%20Playground.ipynb) 41 | - Implement the Monte Carlo Prediction to estimate state-action values 42 | - [Exercise](MC%20Prediction.ipynb) 43 | - [Solution](MC%20Prediction%20Solution.ipynb) 44 | - Implement the on-policy first-visit Monte Carlo Control algorithm 45 | - [Exercise](MC%20Control%20with%20Epsilon-Greedy%20Policies.ipynb) 46 | - [Solution](MC%20Control%20with%20Epsilon-Greedy%20Policies%20Solution.ipynb) 47 | - Implement the off-policy every-visit Monte Carlo Control using Weighted Important Sampling algorithm 48 | - [Exercise](Off-Policy%20MC%20Control%20with%20Weighted%20Importance%20Sampling.ipynb) 49 | - [Solution](Off-Policy%20MC%20Control%20with%20Weighted%20Importance%20Sampling%20Solution.ipynb) 50 | -------------------------------------------------------------------------------- /PolicyGradient/a3c/worker_test.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import sys 3 | import os 4 | import itertools 5 | import collections 6 | import unittest 7 | import numpy as np 8 | import tensorflow as tf 9 | 10 | from inspect import getsourcefile 11 | current_path = os.path.dirname(os.path.abspath(getsourcefile(lambda:0))) 12 | 
import_path = os.path.abspath(os.path.join(current_path, "../..")) 13 | 14 | if import_path not in sys.path: 15 | sys.path.append(import_path) 16 | 17 | # from lib import plotting 18 | from lib.atari.state_processor import StateProcessor 19 | from lib.atari import helpers as atari_helpers 20 | from worker import Worker 21 | from estimators import ValueEstimator, PolicyEstimator 22 | 23 | def make_env(): 24 | return gym.envs.make("Breakout-v0") 25 | 26 | VALID_ACTIONS = [0, 1, 2, 3] 27 | 28 | class WorkerTest(tf.test.TestCase): 29 | def setUp(self): 30 | super(WorkerTest, self).setUp() 31 | 32 | self.env = make_env() 33 | self.discount_factor = 0.99 34 | self.global_step = tf.Variable(0, name="global_step", trainable=False) 35 | self.global_counter = itertools.count() 36 | self.sp = StateProcessor() 37 | 38 | with tf.variable_scope("global") as vs: 39 | self.global_policy_net = PolicyEstimator(len(VALID_ACTIONS)) 40 | self.global_value_net = ValueEstimator(reuse=True) 41 | 42 | def testPolicyNetPredict(self): 43 | w = Worker( 44 | name="test", 45 | env=make_env(), 46 | policy_net=self.global_policy_net, 47 | value_net=self.global_value_net, 48 | global_counter=self.global_counter, 49 | discount_factor=self.discount_factor) 50 | 51 | with self.test_session() as sess: 52 | sess.run(tf.initialize_all_variables()) 53 | state = self.sp.process(self.env.reset()) 54 | processed_state = atari_helpers.atari_make_initial_state(state) 55 | action_values = w._policy_net_predict(processed_state, sess) 56 | self.assertEqual(action_values.shape, (4,)) 57 | 58 | 59 | def testValueNetPredict(self): 60 | w = Worker( 61 | name="test", 62 | env=make_env(), 63 | policy_net=self.global_policy_net, 64 | value_net=self.global_value_net, 65 | global_counter=self.global_counter, 66 | discount_factor=self.discount_factor) 67 | 68 | with self.test_session() as sess: 69 | sess.run(tf.initialize_all_variables()) 70 | state = self.sp.process(self.env.reset()) 71 | processed_state = atari_helpers.atari_make_initial_state(state) 72 | state_value = w._value_net_predict(processed_state, sess) 73 | self.assertEqual(state_value.shape, ()) 74 | 75 | def testRunNStepsAndUpdate(self): 76 | w = Worker( 77 | name="test", 78 | env=make_env(), 79 | policy_net=self.global_policy_net, 80 | value_net=self.global_value_net, 81 | global_counter=self.global_counter, 82 | discount_factor=self.discount_factor) 83 | 84 | with self.test_session() as sess: 85 | sess.run(tf.initialize_all_variables()) 86 | state = self.sp.process(self.env.reset()) 87 | processed_state = atari_helpers.atari_make_initial_state(state) 88 | w.state = processed_state 89 | transitions, local_t, global_t = w.run_n_steps(10, sess) 90 | policy_net_loss, value_net_loss, policy_net_summaries, value_net_summaries = w.update(transitions, sess) 91 | 92 | self.assertEqual(len(transitions), 10) 93 | self.assertIsNotNone(policy_net_loss) 94 | self.assertIsNotNone(value_net_loss) 95 | self.assertIsNotNone(policy_net_summaries) 96 | self.assertIsNotNone(value_net_summaries) 97 | 98 | 99 | if __name__ == '__main__': 100 | unittest.main() -------------------------------------------------------------------------------- /lib/plotting.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | import numpy as np 3 | import pandas as pd 4 | from collections import namedtuple 5 | from matplotlib import pyplot as plt 6 | from mpl_toolkits.mplot3d import Axes3D 7 | 8 | EpisodeStats = namedtuple("Stats",["episode_lengths", 
"episode_rewards"]) 9 | 10 | def plot_cost_to_go_mountain_car(env, estimator, num_tiles=20): 11 | x = np.linspace(env.observation_space.low[0], env.observation_space.high[0], num=num_tiles) 12 | y = np.linspace(env.observation_space.low[1], env.observation_space.high[1], num=num_tiles) 13 | X, Y = np.meshgrid(x, y) 14 | Z = np.apply_along_axis(lambda _: -np.max(estimator.predict(_)), 2, np.dstack([X, Y])) 15 | 16 | fig = plt.figure(figsize=(10, 5)) 17 | ax = fig.add_subplot(111, projection='3d') 18 | surf = ax.plot_surface(X, Y, Z, rstride=1, cstride=1, 19 | cmap=matplotlib.cm.coolwarm, vmin=-1.0, vmax=1.0) 20 | ax.set_xlabel('Position') 21 | ax.set_ylabel('Velocity') 22 | ax.set_zlabel('Value') 23 | ax.set_title("Mountain \"Cost To Go\" Function") 24 | fig.colorbar(surf) 25 | plt.show() 26 | 27 | 28 | def plot_value_function(V, title="Value Function"): 29 | """ 30 | Plots the value function as a surface plot. 31 | """ 32 | min_x = min(k[0] for k in V.keys()) 33 | max_x = max(k[0] for k in V.keys()) 34 | min_y = min(k[1] for k in V.keys()) 35 | max_y = max(k[1] for k in V.keys()) 36 | 37 | x_range = np.arange(min_x, max_x + 1) 38 | y_range = np.arange(min_y, max_y + 1) 39 | X, Y = np.meshgrid(x_range, y_range) 40 | 41 | # Find value for all (x, y) coordinates 42 | Z_noace = np.apply_along_axis(lambda _: V[(_[0], _[1], False)], 2, np.dstack([X, Y])) 43 | Z_ace = np.apply_along_axis(lambda _: V[(_[0], _[1], True)], 2, np.dstack([X, Y])) 44 | 45 | def plot_surface(X, Y, Z, title): 46 | fig = plt.figure(figsize=(20, 10)) 47 | ax = fig.add_subplot(111, projection='3d') 48 | surf = ax.plot_surface(X, Y, Z, rstride=1, cstride=1, 49 | cmap=matplotlib.cm.coolwarm, vmin=-1.0, vmax=1.0) 50 | ax.set_xlabel('Player Sum') 51 | ax.set_ylabel('Dealer Showing') 52 | ax.set_zlabel('Value') 53 | ax.set_title(title) 54 | ax.view_init(ax.elev, -120) 55 | fig.colorbar(surf) 56 | plt.show() 57 | 58 | plot_surface(X, Y, Z_noace, "{} (No Usable Ace)".format(title)) 59 | plot_surface(X, Y, Z_ace, "{} (Usable Ace)".format(title)) 60 | 61 | 62 | 63 | def plot_episode_stats(stats, smoothing_window=10, noshow=False): 64 | # Plot the episode length over time 65 | fig1 = plt.figure(figsize=(10,5)) 66 | plt.plot(stats.episode_lengths) 67 | plt.xlabel("Episode") 68 | plt.ylabel("Episode Length") 69 | plt.title("Episode Length over Time") 70 | if noshow: 71 | plt.close(fig1) 72 | else: 73 | plt.show(fig1) 74 | 75 | # Plot the episode reward over time 76 | fig2 = plt.figure(figsize=(10,5)) 77 | rewards_smoothed = pd.Series(stats.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean() 78 | plt.plot(rewards_smoothed) 79 | plt.xlabel("Episode") 80 | plt.ylabel("Episode Reward (Smoothed)") 81 | plt.title("Episode Reward over Time (Smoothed over window size {})".format(smoothing_window)) 82 | if noshow: 83 | plt.close(fig2) 84 | else: 85 | plt.show(fig2) 86 | 87 | # Plot time steps and episode number 88 | fig3 = plt.figure(figsize=(10,5)) 89 | plt.plot(np.cumsum(stats.episode_lengths), np.arange(len(stats.episode_lengths))) 90 | plt.xlabel("Time Steps") 91 | plt.ylabel("Episode") 92 | plt.title("Episode per time step") 93 | if noshow: 94 | plt.close(fig3) 95 | else: 96 | plt.show(fig3) 97 | 98 | return fig1, fig2, fig3 99 | -------------------------------------------------------------------------------- /MC/MC Prediction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 
| "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline\n", 12 | "\n", 13 | "import gym\n", 14 | "import matplotlib\n", 15 | "import numpy as np\n", 16 | "import sys\n", 17 | "\n", 18 | "from collections import defaultdict\n", 19 | "\n", 20 | "if \"../\" not in sys.path:\n", 21 | " sys.path.append(\"../\") \n", 22 | "from lib.envs.blackjack import BlackjackEnv\n", 23 | "from lib import plotting\n", 24 | "\n", 25 | "matplotlib.style.use('ggplot')" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "env = BlackjackEnv()" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": { 43 | "collapsed": true 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "def mc_prediction(policy, env, num_episodes, discount_factor=1.0):\n", 48 | " \"\"\"\n", 49 | " Monte Carlo prediction algorithm. Calculates the value function\n", 50 | " for a given policy using sampling.\n", 51 | " \n", 52 | " Args:\n", 53 | " policy: A function that maps an observation to action probabilities.\n", 54 | " env: OpenAI gym environment.\n", 55 | " num_episodes: Number of episodes to sample.\n", 56 | " discount_factor: Gamma discount factor.\n", 57 | " \n", 58 | " Returns:\n", 59 | " A dictionary that maps from state -> value.\n", 60 | " The state is a tuple and the value is a float.\n", 61 | " \"\"\"\n", 62 | "\n", 63 | " # Keeps track of sum and count of returns for each state\n", 64 | " # to calculate an average. We could use an array to save all\n", 65 | " # returns (like in the book) but that's memory inefficient.\n", 66 | " returns_sum = defaultdict(float)\n", 67 | " returns_count = defaultdict(float)\n", 68 | " \n", 69 | " # The final value function\n", 70 | " V = defaultdict(float)\n", 71 | " \n", 72 | " # Implement this!\n", 73 | "\n", 74 | " return V " 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": { 81 | "collapsed": true 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "def sample_policy(observation):\n", 86 | " \"\"\"\n", 87 | " A policy that sticks if the player score is > 20 and hits otherwise.\n", 88 | " \"\"\"\n", 89 | " score, dealer_score, usable_ace = observation\n", 90 | " return 0 if score >= 20 else 1" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": { 97 | "collapsed": true, 98 | "scrolled": false 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "V_10k = mc_prediction(sample_policy, env, num_episodes=10000)\n", 103 | "plotting.plot_value_function(V_10k, title=\"10,000 Steps\")\n", 104 | "\n", 105 | "V_500k = mc_prediction(sample_policy, env, num_episodes=500000)\n", 106 | "plotting.plot_value_function(V_500k, title=\"500,000 Steps\")" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": { 113 | "collapsed": true 114 | }, 115 | "outputs": [], 116 | "source": [] 117 | } 118 | ], 119 | "metadata": { 120 | "kernelspec": { 121 | "display_name": "Python 3", 122 | "language": "python", 123 | "name": "python3" 124 | }, 125 | "language_info": { 126 | "codemirror_mode": { 127 | "name": "ipython", 128 | "version": 3 129 | }, 130 | "file_extension": ".py", 131 | "mimetype": "text/x-python", 132 | "name": "python", 133 | "nbconvert_exporter": "python", 134 | "pygments_lexer": "ipython3", 135 | "version": "3.5.2" 136 | } 137 | }, 138 | "nbformat": 4, 139 | "nbformat_minor": 1 140 | } 141 | 
-------------------------------------------------------------------------------- /PolicyGradient/a3c/policy_monitor.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import itertools 4 | import collections 5 | import numpy as np 6 | import tensorflow as tf 7 | import time 8 | 9 | from inspect import getsourcefile 10 | current_path = os.path.dirname(os.path.abspath(getsourcefile(lambda:0))) 11 | import_path = os.path.abspath(os.path.join(current_path, "../..")) 12 | 13 | if import_path not in sys.path: 14 | sys.path.append(import_path) 15 | 16 | from gym.wrappers import Monitor 17 | import gym 18 | 19 | from lib.atari.state_processor import StateProcessor 20 | from lib.atari import helpers as atari_helpers 21 | from estimators import ValueEstimator, PolicyEstimator 22 | from worker import make_copy_params_op 23 | 24 | 25 | class PolicyMonitor(object): 26 | """ 27 | Helps evaluating a policy by running an episode in an environment, 28 | saving a video, and plotting summaries to Tensorboard. 29 | 30 | Args: 31 | env: environment to run in 32 | policy_net: A policy estimator 33 | summary_writer: a tf.train.SummaryWriter used to write Tensorboard summaries 34 | """ 35 | def __init__(self, env, policy_net, summary_writer, saver=None): 36 | 37 | self.video_dir = os.path.join(summary_writer.get_logdir(), "../videos") 38 | self.video_dir = os.path.abspath(self.video_dir) 39 | 40 | self.env = Monitor(env, directory=self.video_dir, video_callable=lambda x: True, resume=True) 41 | self.global_policy_net = policy_net 42 | self.summary_writer = summary_writer 43 | self.saver = saver 44 | self.sp = StateProcessor() 45 | 46 | self.checkpoint_path = os.path.abspath(os.path.join(summary_writer.get_logdir(), "../checkpoints/model")) 47 | 48 | try: 49 | os.makedirs(self.video_dir) 50 | except FileExistsError: 51 | pass 52 | 53 | # Local policy net 54 | with tf.variable_scope("policy_eval"): 55 | self.policy_net = PolicyEstimator(policy_net.num_outputs) 56 | 57 | # Op to copy params from global policy/value net parameters 58 | self.copy_params_op = make_copy_params_op( 59 | tf.contrib.slim.get_variables(scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES), 60 | tf.contrib.slim.get_variables(scope="policy_eval", collection=tf.GraphKeys.TRAINABLE_VARIABLES)) 61 | 62 | def _policy_net_predict(self, state, sess): 63 | feed_dict = { self.policy_net.states: [state] } 64 | preds = sess.run(self.policy_net.predictions, feed_dict) 65 | return preds["probs"][0] 66 | 67 | def eval_once(self, sess): 68 | with sess.as_default(), sess.graph.as_default(): 69 | # Copy params to local model 70 | global_step, _ = sess.run([tf.contrib.framework.get_global_step(), self.copy_params_op]) 71 | 72 | # Run an episode 73 | done = False 74 | state = atari_helpers.atari_make_initial_state(self.sp.process(self.env.reset())) 75 | total_reward = 0.0 76 | episode_length = 0 77 | while not done: 78 | action_probs = self._policy_net_predict(state, sess) 79 | action = np.random.choice(np.arange(len(action_probs)), p=action_probs) 80 | next_state, reward, done, _ = self.env.step(action) 81 | next_state = atari_helpers.atari_make_next_state(state, self.sp.process(next_state)) 82 | total_reward += reward 83 | episode_length += 1 84 | state = next_state 85 | 86 | # Add summaries 87 | episode_summary = tf.Summary() 88 | episode_summary.value.add(simple_value=total_reward, tag="eval/total_reward") 89 | episode_summary.value.add(simple_value=episode_length, 
tag="eval/episode_length") 90 | self.summary_writer.add_summary(episode_summary, global_step) 91 | self.summary_writer.flush() 92 | 93 | if self.saver is not None: 94 | self.saver.save(sess, self.checkpoint_path) 95 | 96 | tf.logging.info("Eval results at step {}: total_reward {}, episode_length {}".format(global_step, total_reward, episode_length)) 97 | 98 | return total_reward, episode_length 99 | 100 | def continuous_eval(self, eval_every, sess, coord): 101 | """ 102 | Continuously evaluates the policy every [eval_every] seconds. 103 | """ 104 | try: 105 | while not coord.should_stop(): 106 | self.eval_once(sess) 107 | # Sleep until next evaluation cycle 108 | time.sleep(eval_every) 109 | except tf.errors.CancelledError: 110 | return 111 | -------------------------------------------------------------------------------- /lib/envs/gridworld.py: -------------------------------------------------------------------------------- 1 | import io 2 | import numpy as np 3 | import sys 4 | 5 | from . import discrete 6 | 7 | UP = 0 8 | RIGHT = 1 9 | DOWN = 2 10 | LEFT = 3 11 | 12 | class GridworldEnv(discrete.DiscreteEnv): 13 | """ 14 | Grid World environment from Sutton's Reinforcement Learning book chapter 4. 15 | You are an agent on an MxN grid and your goal is to reach the terminal 16 | state at the top left or the bottom right corner. 17 | 18 | For example, a 4x4 grid looks as follows: 19 | 20 | T o o o 21 | o x o o 22 | o o o o 23 | o o o T 24 | 25 | x is your position and T are the two terminal states. 26 | 27 | You can take actions in each direction (UP=0, RIGHT=1, DOWN=2, LEFT=3). 28 | Actions going off the edge leave you in your current state. 29 | You receive a reward of -1 at each step until you reach a terminal state. 30 | """ 31 | 32 | metadata = {'render.modes': ['human', 'ansi']} 33 | 34 | def __init__(self, shape=[4,4]): 35 | if not isinstance(shape, (list, tuple)) or not len(shape) == 2: 36 | raise ValueError('shape argument must be a list/tuple of length 2') 37 | 38 | self.shape = shape 39 | 40 | nS = np.prod(shape) 41 | nA = 4 42 | 43 | MAX_Y = shape[0] 44 | MAX_X = shape[1] 45 | 46 | P = {} 47 | grid = np.arange(nS).reshape(shape) 48 | it = np.nditer(grid, flags=['multi_index']) 49 | 50 | while not it.finished: 51 | s = it.iterindex 52 | y, x = it.multi_index 53 | 54 | # P[s][a] = (prob, next_state, reward, is_done) 55 | P[s] = {a : [] for a in range(nA)} 56 | 57 | is_done = lambda s: s == 0 or s == (nS - 1) 58 | reward = 0.0 if is_done(s) else -1.0 59 | 60 | # We're stuck in a terminal state 61 | if is_done(s): 62 | P[s][UP] = [(1.0, s, reward, True)] 63 | P[s][RIGHT] = [(1.0, s, reward, True)] 64 | P[s][DOWN] = [(1.0, s, reward, True)] 65 | P[s][LEFT] = [(1.0, s, reward, True)] 66 | # Not a terminal state 67 | else: 68 | ns_up = s if y == 0 else s - MAX_X 69 | ns_right = s if x == (MAX_X - 1) else s + 1 70 | ns_down = s if y == (MAX_Y - 1) else s + MAX_X 71 | ns_left = s if x == 0 else s - 1 72 | P[s][UP] = [(1.0, ns_up, reward, is_done(ns_up))] 73 | P[s][RIGHT] = [(1.0, ns_right, reward, is_done(ns_right))] 74 | P[s][DOWN] = [(1.0, ns_down, reward, is_done(ns_down))] 75 | P[s][LEFT] = [(1.0, ns_left, reward, is_done(ns_left))] 76 | 77 | it.iternext() 78 | 79 | # Initial state distribution is uniform 80 | isd = np.ones(nS) / nS 81 | 82 | # We expose the model of the environment for educational purposes 83 | # This should not be used in any model-free learning algorithm 84 | self.P = P 85 | 86 | super(GridworldEnv, self).__init__(nS, nA, P, isd) 87 | 88 | def _render(self, 
mode='human', close=False): 89 | """ Renders the current gridworld layout 90 | 91 | For example, a 4x4 grid with the mode="human" looks like: 92 | T o o o 93 | o x o o 94 | o o o o 95 | o o o T 96 | where x is your position and T are the two terminal states. 97 | """ 98 | if close: 99 | return 100 | 101 | outfile = io.StringIO() if mode == 'ansi' else sys.stdout 102 | 103 | grid = np.arange(self.nS).reshape(self.shape) 104 | it = np.nditer(grid, flags=['multi_index']) 105 | while not it.finished: 106 | s = it.iterindex 107 | y, x = it.multi_index 108 | 109 | if self.s == s: 110 | output = " x " 111 | elif s == 0 or s == self.nS - 1: 112 | output = " T " 113 | else: 114 | output = " o " 115 | 116 | if x == 0: 117 | output = output.lstrip() 118 | if x == self.shape[1] - 1: 119 | output = output.rstrip() 120 | 121 | outfile.write(output) 122 | 123 | if x == self.shape[1] - 1: 124 | outfile.write("\n") 125 | 126 | it.iternext() 127 | -------------------------------------------------------------------------------- /PolicyGradient/a3c/estimator_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import gym 3 | import sys 4 | import os 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | from inspect import getsourcefile 9 | current_path = os.path.dirname(os.path.abspath(getsourcefile(lambda:0))) 10 | import_path = os.path.abspath(os.path.join(current_path, "../..")) 11 | 12 | if import_path not in sys.path: 13 | sys.path.append(import_path) 14 | 15 | # from lib import plotting 16 | from lib.atari.state_processor import StateProcessor 17 | from lib.atari import helpers as atari_helpers 18 | from estimators import ValueEstimator, PolicyEstimator 19 | 20 | 21 | def make_env(): 22 | return gym.envs.make("Breakout-v0") 23 | 24 | VALID_ACTIONS = [0, 1, 2, 3] 25 | 26 | class PolicyEstimatorTest(tf.test.TestCase): 27 | def testPredict(self): 28 | env = make_env() 29 | sp = StateProcessor() 30 | estimator = PolicyEstimator(len(VALID_ACTIONS)) 31 | 32 | with self.test_session() as sess: 33 | sess.run(tf.initialize_all_variables()) 34 | 35 | # Generate a state 36 | state = sp.process(env.reset()) 37 | processed_state = atari_helpers.atari_make_initial_state(state) 38 | processed_states = np.array([processed_state]) 39 | 40 | # Run feeds 41 | feed_dict = { 42 | estimator.states: processed_states, 43 | estimator.targets: [1.0], 44 | estimator.actions: [1] 45 | } 46 | loss = sess.run(estimator.loss, feed_dict) 47 | pred = sess.run(estimator.predictions, feed_dict) 48 | 49 | # Assertions 50 | self.assertTrue(loss != 0.0) 51 | self.assertEqual(pred["probs"].shape, (1, len(VALID_ACTIONS))) 52 | self.assertEqual(pred["logits"].shape, (1, len(VALID_ACTIONS))) 53 | 54 | def testGradient(self): 55 | env = make_env() 56 | sp = StateProcessor() 57 | estimator = PolicyEstimator(len(VALID_ACTIONS)) 58 | grads = [g for g, _ in estimator.grads_and_vars] 59 | 60 | with self.test_session() as sess: 61 | sess.run(tf.initialize_all_variables()) 62 | 63 | # Generate a state 64 | state = sp.process(env.reset()) 65 | processed_state = atari_helpers.atari_make_initial_state(state) 66 | processed_states = np.array([processed_state]) 67 | 68 | # Run feeds to get gradients 69 | feed_dict = { 70 | estimator.states: processed_states, 71 | estimator.targets: [1.0], 72 | estimator.actions: [1] 73 | } 74 | grads_ = sess.run(grads, feed_dict) 75 | 76 | # Apply calculated gradients 77 | grad_feed_dict = { k: v for k, v in zip(grads, grads_) } 78 | _ = 
sess.run(estimator.train_op, grad_feed_dict) 79 | 80 | 81 | class ValueEstimatorTest(tf.test.TestCase): 82 | def testPredict(self): 83 | env = make_env() 84 | sp = StateProcessor() 85 | estimator = ValueEstimator() 86 | 87 | with self.test_session() as sess: 88 | sess.run(tf.initialize_all_variables()) 89 | 90 | # Generate a state 91 | state = sp.process(env.reset()) 92 | processed_state = atari_helpers.atari_make_initial_state(state) 93 | processed_states = np.array([processed_state]) 94 | 95 | # Run feeds 96 | feed_dict = { 97 | estimator.states: processed_states, 98 | estimator.targets: [1.0], 99 | } 100 | loss = sess.run(estimator.loss, feed_dict) 101 | pred = sess.run(estimator.predictions, feed_dict) 102 | 103 | # Assertions 104 | self.assertTrue(loss != 0.0) 105 | self.assertEqual(pred["logits"].shape, (1,)) 106 | 107 | def testGradient(self): 108 | env = make_env() 109 | sp = StateProcessor() 110 | estimator = ValueEstimator() 111 | grads = [g for g, _ in estimator.grads_and_vars] 112 | 113 | with self.test_session() as sess: 114 | sess.run(tf.initialize_all_variables()) 115 | 116 | # Generate a state 117 | state = sp.process(env.reset()) 118 | processed_state = atari_helpers.atari_make_initial_state(state) 119 | processed_states = np.array([processed_state]) 120 | 121 | # Run feeds 122 | feed_dict = { 123 | estimator.states: processed_states, 124 | estimator.targets: [1.0], 125 | } 126 | grads_ = sess.run(grads, feed_dict) 127 | 128 | # Apply calculated gradients 129 | grad_feed_dict = { k: v for k, v in zip(grads, grads_) } 130 | _ = sess.run(estimator.train_op, grad_feed_dict) 131 | 132 | if __name__ == '__main__': 133 | unittest.main() -------------------------------------------------------------------------------- /TD/Windy Gridworld Playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import gym\n", 10 | "import numpy as np\n", 11 | "import sys\n", 12 | "\n", 13 | "if \"../\" not in sys.path:\n", 14 | " sys.path.append(\"../\") \n", 15 | "\n", 16 | "from lib.envs.windy_gridworld import WindyGridworldEnv" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "30\n", 29 | "o o o o o o o o o o\n", 30 | "o o o o o o o o o o\n", 31 | "o o o o o o o o o o\n", 32 | "x o o o o o o T o o\n", 33 | "o o o o o o o o o o\n", 34 | "o o o o o o o o o o\n", 35 | "o o o o o o o o o o\n", 36 | "\n", 37 | "(31, -1.0, False, {'prob': 1.0})\n", 38 | "o o o o o o o o o o\n", 39 | "o o o o o o o o o o\n", 40 | "o o o o o o o o o o\n", 41 | "o x o o o o o T o o\n", 42 | "o o o o o o o o o o\n", 43 | "o o o o o o o o o o\n", 44 | "o o o o o o o o o o\n", 45 | "\n", 46 | "(32, -1.0, False, {'prob': 1.0})\n", 47 | "o o o o o o o o o o\n", 48 | "o o o o o o o o o o\n", 49 | "o o o o o o o o o o\n", 50 | "o o x o o o o T o o\n", 51 | "o o o o o o o o o o\n", 52 | "o o o o o o o o o o\n", 53 | "o o o o o o o o o o\n", 54 | "\n", 55 | "(33, -1.0, False, {'prob': 1.0})\n", 56 | "o o o o o o o o o o\n", 57 | "o o o o o o o o o o\n", 58 | "o o o o o o o o o o\n", 59 | "o o o x o o o T o o\n", 60 | "o o o o o o o o o o\n", 61 | "o o o o o o o o o o\n", 62 | "o o o o o o o o o o\n", 63 | "\n", 64 | "(33, -1.0, False, {'prob': 1.0})\n", 65 | "o o o o o o o o o o\n", 66 | "o o o o o o o o o 
o\n", 67 | "o o o o o o o o o o\n", 68 | "o o o x o o o T o o\n", 69 | "o o o o o o o o o o\n", 70 | "o o o o o o o o o o\n", 71 | "o o o o o o o o o o\n", 72 | "\n", 73 | "(24, -1.0, False, {'prob': 1.0})\n", 74 | "o o o o o o o o o o\n", 75 | "o o o o o o o o o o\n", 76 | "o o o o x o o o o o\n", 77 | "o o o o o o o T o o\n", 78 | "o o o o o o o o o o\n", 79 | "o o o o o o o o o o\n", 80 | "o o o o o o o o o o\n", 81 | "\n", 82 | "(15, -1.0, False, {'prob': 1.0})\n", 83 | "o o o o o o o o o o\n", 84 | "o o o o o x o o o o\n", 85 | "o o o o o o o o o o\n", 86 | "o o o o o o o T o o\n", 87 | "o o o o o o o o o o\n", 88 | "o o o o o o o o o o\n", 89 | "o o o o o o o o o o\n", 90 | "\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "env = WindyGridworldEnv()\n", 96 | "\n", 97 | "print(env.reset())\n", 98 | "env.render()\n", 99 | "\n", 100 | "print(env.step(1))\n", 101 | "env.render()\n", 102 | "\n", 103 | "print(env.step(1))\n", 104 | "env.render()\n", 105 | "\n", 106 | "print(env.step(1))\n", 107 | "env.render()\n", 108 | "\n", 109 | "print(env.step(2))\n", 110 | "env.render()\n", 111 | "\n", 112 | "print(env.step(1))\n", 113 | "env.render()\n", 114 | "\n", 115 | "print(env.step(1))\n", 116 | "env.render()" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [] 125 | } 126 | ], 127 | "metadata": { 128 | "kernelspec": { 129 | "display_name": "Python 3", 130 | "language": "python", 131 | "name": "python3" 132 | }, 133 | "language_info": { 134 | "codemirror_mode": { 135 | "name": "ipython", 136 | "version": 3 137 | }, 138 | "file_extension": ".py", 139 | "mimetype": "text/x-python", 140 | "name": "python", 141 | "nbconvert_exporter": "python", 142 | "pygments_lexer": "ipython3", 143 | "version": "3.6.4" 144 | } 145 | }, 146 | "nbformat": 4, 147 | "nbformat_minor": 1 148 | } 149 | -------------------------------------------------------------------------------- /lib/envs/blackjack.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym import spaces 3 | from gym.utils import seeding 4 | 5 | def cmp(a, b): 6 | return int((a > b)) - int((a < b)) 7 | 8 | # 1 = Ace, 2-10 = Number cards, Jack/Queen/King = 10 9 | deck = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10] 10 | 11 | 12 | def draw_card(np_random): 13 | return np_random.choice(deck) 14 | 15 | 16 | def draw_hand(np_random): 17 | return [draw_card(np_random), draw_card(np_random)] 18 | 19 | 20 | def usable_ace(hand): # Does this hand have a usable ace? 21 | return 1 in hand and sum(hand) + 10 <= 21 22 | 23 | 24 | def sum_hand(hand): # Return current hand total 25 | if usable_ace(hand): 26 | return sum(hand) + 10 27 | return sum(hand) 28 | 29 | 30 | def is_bust(hand): # Is this hand a bust? 31 | return sum_hand(hand) > 21 32 | 33 | 34 | def score(hand): # What is the score of this hand (0 if bust) 35 | return 0 if is_bust(hand) else sum_hand(hand) 36 | 37 | 38 | def is_natural(hand): # Is this hand a natural blackjack? 39 | return sorted(hand) == [1, 10] 40 | 41 | 42 | class BlackjackEnv(gym.Env): 43 | """Simple blackjack environment 44 | Blackjack is a card game where the goal is to obtain cards that sum to as 45 | near as possible to 21 without going over. They're playing against a fixed 46 | dealer. 47 | Face cards (Jack, Queen, King) have point value 10. 48 | Aces can either count as 11 or 1, and it's called 'usable' at 11. 49 | This game is placed with an infinite deck (or with replacement). 
50 | The game starts with each (player and dealer) having one face up and one 51 | face down card. 52 | The player can request additional cards (hit=1) until they decide to stop 53 | (stick=0) or exceed 21 (bust). 54 | After the player sticks, the dealer reveals their facedown card, and draws 55 | until their sum is 17 or greater. If the dealer goes bust the player wins. 56 | If neither player nor dealer busts, the outcome (win, lose, draw) is 57 | decided by whose sum is closer to 21. The reward for winning is +1, 58 | drawing is 0, and losing is -1. 59 | The observation of a 3-tuple of: the players current sum, 60 | the dealer's one showing card (1-10 where 1 is ace), 61 | and whether or not the player holds a usable ace (0 or 1). 62 | This environment corresponds to the version of the blackjack problem 63 | described in Example 5.1 in Reinforcement Learning: An Introduction 64 | by Sutton and Barto (1998). 65 | https://webdocs.cs.ualberta.ca/~sutton/book/the-book.html 66 | """ 67 | def __init__(self, natural=False): 68 | self.action_space = spaces.Discrete(2) 69 | self.observation_space = spaces.Tuple(( 70 | spaces.Discrete(32), 71 | spaces.Discrete(11), 72 | spaces.Discrete(2))) 73 | self._seed() 74 | 75 | # Flag to payout 1.5 on a "natural" blackjack win, like casino rules 76 | # Ref: http://www.bicyclecards.com/how-to-play/blackjack/ 77 | self.natural = natural 78 | # Start the first game 79 | self._reset() # Number of 80 | self.nA = 2 81 | 82 | def reset(self): 83 | return self._reset() 84 | 85 | def step(self, action): 86 | return self._step(action) 87 | 88 | def _seed(self, seed=None): 89 | self.np_random, seed = seeding.np_random(seed) 90 | return [seed] 91 | 92 | def _step(self, action): 93 | assert self.action_space.contains(action) 94 | if action: # hit: add a card to players hand and return 95 | self.player.append(draw_card(self.np_random)) 96 | if is_bust(self.player): 97 | done = True 98 | reward = -1 99 | else: 100 | done = False 101 | reward = 0 102 | else: # stick: play out the dealers hand, and score 103 | done = True 104 | while sum_hand(self.dealer) < 17: 105 | self.dealer.append(draw_card(self.np_random)) 106 | reward = cmp(score(self.player), score(self.dealer)) 107 | if self.natural and is_natural(self.player) and reward == 1: 108 | reward = 1.5 109 | return self._get_obs(), reward, done, {} 110 | 111 | def _get_obs(self): 112 | return (sum_hand(self.player), self.dealer[0], usable_ace(self.player)) 113 | 114 | def _reset(self): 115 | self.dealer = draw_hand(self.np_random) 116 | self.player = draw_hand(self.np_random) 117 | 118 | # Auto-draw another card if the score is less than 12 119 | while sum_hand(self.player) < 12: 120 | self.player.append(draw_card(self.np_random)) 121 | 122 | return self._get_obs() 123 | -------------------------------------------------------------------------------- /PolicyGradient/README.md: -------------------------------------------------------------------------------- 1 | ## Policy Gradient Methods 2 | 3 | 4 | ### Learning Goals 5 | 6 | - Understand the difference between value-based and policy-based Reinforcement Learning 7 | - Understand the REINFORCE Algorithm (Monte Carlo Policy Gradient) 8 | - Understand Actor-Critic (AC) algorithms 9 | - Understand Advantage Functions 10 | - Understand Deterministic Policy Gradients (Optional) 11 | - Understand how to scale up Policy Gradient methods using asynchronous actor-critic and Neural Networks (Optional) 12 | 13 | 14 | ### Summary 15 | 16 | - Idea: Instead of parameterizing the value 
function and doing greedy policy improvement we parameterize the policy and do gradient descent into a direction that improves it. 17 | - Sometimes the policy is easier to approximate than the value function. Also, we need a parameterized policy to deal with continuous action spaces and environments where we need to act stochastically. 18 | - Policy Score Function `J(theta)`: Intuitively, it measures how good our policy is. For example, we can use the average value or average reward under a policy as our objective. 19 | - Common choices for the policy function: Softmax for discrete actions, Gaussian parameters for continuous actions. 20 | - Policy Gradient Theorem: `grad(J(theta)) = Ex[grad(log(pi(s, a))) * Q(s, a)]`. Basically, we move our policy into a direction of more reward. 21 | - REINFORCE (Monte Carlo Policy Gradient): We substitute a samples return `g_t` form an episode for Q(s, a) to make an update. Unbiased but high variance. 22 | - Baseline: Instead of measuring the absolute goodness of an action we want to know how much better than "average" it is to take an action given a state. E.g. some states are naturally bad and always give negative reward. This is called the advantage and is defined as `Q(s, a) - V(s)`. We use that for our policy update, e.g. `g_t - V(s)` for REINFORCE. 23 | - Actor-Critic: Instead of waiting until the end of an episode as in REINFORCE we use bootstrapping and make an update at each step. To do that we also train a Critic Q(theta) that approximates the value function. Now we have two function approximators: One of the policy, one for the critic. This is basically TD, but for Policy Gradients. 24 | - A good estimate of the advantage function in the Actor-Critic algorithm is the td error. Our update then becomes `grad(J(theta)) = Ex[grad(log(pi(s, a))) * td_error]`. 25 | - Can use policy gradients with td-lambda, eligibility traces, and so on. 26 | - Deterministic Policy Gradients: Useful for high-dimensional continuous action spaces where stochastic policy gradients are expensive to compute. The idea is to update the policy in the direction of the gradient of the action-value function. To ensure exploration we can use an off-policy actor-critic algorithm with added noise in action selection. 27 | - Deep Deterministic Policy Gradients: Apply tricks from DQN to Deterministic Policy Gradients ;) 28 | - Asynchronous Advantage Actor-Critic (A3C): Instead of using an experience replay buffer as in DQN use multiple agents on different threads to explore the state spaces and make decorrelated updates to the actor and the critic. 
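To make the REINFORCE-with-baseline update described above concrete, here is a minimal NumPy sketch of one episode's update for a tabular softmax policy with a learned state-value baseline. It is an illustrative sketch only: the tabular setting and the helper names (`softmax`, `reinforce_update`) are assumptions made here for clarity, not code from this repository, which implements these ideas with TensorFlow function approximators in the exercises below.

```python
import numpy as np

def softmax(preferences):
    z = preferences - np.max(preferences)
    e = np.exp(z)
    return e / e.sum()

def reinforce_update(theta, V, episode, alpha_theta=0.01, alpha_v=0.05, gamma=1.0):
    """One REINFORCE-with-baseline update from a single sampled episode.

    theta:   [nS, nA] softmax policy parameters
    V:       [nS] state-value baseline
    episode: list of (state, action, reward) tuples
    """
    G = 0.0
    # Iterate backwards so the return G_t can be accumulated incrementally.
    for state, action, reward in reversed(episode):
        G = reward + gamma * G
        advantage = G - V[state]           # baseline-corrected return
        V[state] += alpha_v * advantage    # move the baseline toward G_t
        probs = softmax(theta[state])
        # For a softmax policy, grad log pi(a|s) w.r.t. theta[s] is one_hot(a) - probs.
        grad_log_pi = -probs
        grad_log_pi[action] += 1.0
        theta[state] += alpha_theta * advantage * grad_log_pi
    return theta, V
```

The Actor-Critic variant in the notebooks replaces `G - V[state]` with the one-step TD error `r + gamma * V[s'] - V[s]` and applies the update at every step instead of waiting for the end of the episode.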
29 | 30 | 31 | ### Lectures & Readings 32 | 33 | **Required:** 34 | 35 | - David Silver's RL Course Lecture 7 - Policy Gradient Methods ([video](https://www.youtube.com/watch?v=KHZVXao4qXs), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/pg.pdf)) 36 | 37 | **Optional:** 38 | 39 | - [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 13: Policy Gradient Methods 40 | - [Deterministic Policy Gradient Algorithms](http://jmlr.org/proceedings/papers/v32/silver14.pdf) 41 | - [Deterministic Policy Gradient Algorithms (Talk)](http://techtalks.tv/talks/deterministic-policy-gradient-algorithms/61098/) 42 | - [Continuous control with deep reinforcement learning](https://arxiv.org/abs/1509.02971) 43 | - [Deep Deterministic Policy Gradients in TensorFlow](http://pemami4911.github.io/blog_posts/2016/08/21/ddpg-rl.html) 44 | - [Asynchronous Methods for Deep Reinforcement Learning](https://arxiv.org/abs/1602.01783) 45 | - [Deep Reinforcement Learning: A Tutorial (Policy Gradient Section)](http://web.archive.org/web/20161029135055/https://gym.openai.com/docs/rl#id16) 46 | 47 | 48 | 49 | ### Exercises 50 | 51 | - REINFORCE with Baseline 52 | - Exercise 53 | - [Solution](CliffWalk%20REINFORCE%20with%20Baseline%20Solution.ipynb) 54 | - Actor-Critic with Baseline 55 | - Exercise 56 | - [Solution](CliffWalk%20Actor%20Critic%20Solution.ipynb) 57 | - Actor-Critic with Baseline for Continuous Action Spaces 58 | - Exercise 59 | - [Solution](Continuous%20MountainCar%20Actor%20Critic%20Solution.ipynb) 60 | - Deterministic Policy Gradients for Continuous Action Spaces (WIP) 61 | - Deep Deterministic Policy Gradients (WIP) 62 | - Asynchronous Advantage Actor-Critic (A3C) 63 | - Exercise 64 | - [Solution](a3c/) 65 | -------------------------------------------------------------------------------- /PolicyGradient/a3c/train.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import unittest 4 | import gym 5 | import sys 6 | import os 7 | import numpy as np 8 | import tensorflow as tf 9 | import itertools 10 | import shutil 11 | import threading 12 | import multiprocessing 13 | 14 | from inspect import getsourcefile 15 | current_path = os.path.dirname(os.path.abspath(getsourcefile(lambda:0))) 16 | import_path = os.path.abspath(os.path.join(current_path, "../..")) 17 | 18 | if import_path not in sys.path: 19 | sys.path.append(import_path) 20 | 21 | from lib.atari import helpers as atari_helpers 22 | from estimators import ValueEstimator, PolicyEstimator 23 | from policy_monitor import PolicyMonitor 24 | from worker import Worker 25 | 26 | 27 | tf.flags.DEFINE_string("model_dir", "/tmp/a3c", "Directory to write Tensorboard summaries and videos to.") 28 | tf.flags.DEFINE_string("env", "Breakout-v0", "Name of gym Atari environment, e.g. Breakout-v0") 29 | tf.flags.DEFINE_integer("t_max", 5, "Number of steps before performing an update") 30 | tf.flags.DEFINE_integer("max_global_steps", None, "Stop training after this many steps in the environment. Defaults to running indefinitely.") 31 | tf.flags.DEFINE_integer("eval_every", 300, "Evaluate the policy every N seconds") 32 | tf.flags.DEFINE_boolean("reset", False, "If set, delete the existing model directory and start training from scratch.") 33 | tf.flags.DEFINE_integer("parallelism", None, "Number of threads to run. 
If not set we run [num_cpu_cores] threads.") 34 | 35 | FLAGS = tf.flags.FLAGS 36 | 37 | def make_env(wrap=True): 38 | env = gym.envs.make(FLAGS.env) 39 | # remove the timelimitwrapper 40 | env = env.env 41 | if wrap: 42 | env = atari_helpers.AtariEnvWrapper(env) 43 | return env 44 | 45 | # Depending on the game we may have a limited action space 46 | env_ = make_env() 47 | if FLAGS.env == "Pong-v0" or FLAGS.env == "Breakout-v0": 48 | VALID_ACTIONS = list(range(4)) 49 | else: 50 | VALID_ACTIONS = list(range(env_.action_space.n)) 51 | env_.close() 52 | 53 | 54 | # Set the number of workers 55 | NUM_WORKERS = multiprocessing.cpu_count() 56 | if FLAGS.parallelism: 57 | NUM_WORKERS = FLAGS.parallelism 58 | 59 | MODEL_DIR = FLAGS.model_dir 60 | CHECKPOINT_DIR = os.path.join(MODEL_DIR, "checkpoints") 61 | 62 | # Optionally empty model directory 63 | if FLAGS.reset: 64 | shutil.rmtree(MODEL_DIR, ignore_errors=True) 65 | 66 | if not os.path.exists(CHECKPOINT_DIR): 67 | os.makedirs(CHECKPOINT_DIR) 68 | 69 | summary_writer = tf.summary.FileWriter(os.path.join(MODEL_DIR, "train")) 70 | 71 | with tf.device("/cpu:0"): 72 | 73 | # Keeps track of the number of updates we've performed 74 | global_step = tf.Variable(0, name="global_step", trainable=False) 75 | 76 | # Global policy and value nets 77 | with tf.variable_scope("global") as vs: 78 | policy_net = PolicyEstimator(num_outputs=len(VALID_ACTIONS)) 79 | value_net = ValueEstimator(reuse=True) 80 | 81 | # Global step iterator 82 | global_counter = itertools.count() 83 | 84 | # Create worker graphs 85 | workers = [] 86 | for worker_id in range(NUM_WORKERS): 87 | # We only write summaries in one of the workers because they're 88 | # pretty much identical and writing them on all workers 89 | # would be a waste of space 90 | worker_summary_writer = None 91 | if worker_id == 0: 92 | worker_summary_writer = summary_writer 93 | 94 | worker = Worker( 95 | name="worker_{}".format(worker_id), 96 | env=make_env(), 97 | policy_net=policy_net, 98 | value_net=value_net, 99 | global_counter=global_counter, 100 | discount_factor = 0.99, 101 | summary_writer=worker_summary_writer, 102 | max_global_steps=FLAGS.max_global_steps) 103 | workers.append(worker) 104 | 105 | saver = tf.train.Saver(keep_checkpoint_every_n_hours=2.0, max_to_keep=10) 106 | 107 | # Used to occasionally save videos for our policy net 108 | # and write episode rewards to Tensorboard 109 | pe = PolicyMonitor( 110 | env=make_env(wrap=False), 111 | policy_net=policy_net, 112 | summary_writer=summary_writer, 113 | saver=saver) 114 | 115 | with tf.Session() as sess: 116 | sess.run(tf.global_variables_initializer()) 117 | coord = tf.train.Coordinator() 118 | 119 | # Load a previous checkpoint if it exists 120 | latest_checkpoint = tf.train.latest_checkpoint(CHECKPOINT_DIR) 121 | if latest_checkpoint: 122 | print("Loading model checkpoint: {}".format(latest_checkpoint)) 123 | saver.restore(sess, latest_checkpoint) 124 | 125 | # Start worker threads 126 | worker_threads = [] 127 | for worker in workers: 128 | worker_fn = lambda worker=worker: worker.run(sess, coord, FLAGS.t_max) 129 | t = threading.Thread(target=worker_fn) 130 | t.start() 131 | worker_threads.append(t) 132 | 133 | # Start a thread for policy eval task 134 | monitor_thread = threading.Thread(target=lambda: pe.continuous_eval(FLAGS.eval_every, sess, coord)) 135 | monitor_thread.start() 136 | 137 | # Wait for all workers to finish 138 | coord.join(worker_threads) 139 | 
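`train.py` above starts one `Worker` per thread plus a policy-evaluation thread and then joins them through a `tf.train.Coordinator`. Below is a minimal, framework-free sketch of that launch/join pattern using only the standard library; the `ToyWorker` class, the step counts, and the use of `threading.Event` in place of the coordinator are assumptions made for illustration, not code from this repository.

```python
import itertools
import threading
import time

class ToyWorker:
    """Stand-in for worker.Worker: runs until the shared stop signal is set."""
    def __init__(self, name, global_counter, max_global_steps):
        self.name = name
        self.global_counter = global_counter
        self.max_global_steps = max_global_steps

    def run(self, stop_event):
        while not stop_event.is_set():
            global_step = next(self.global_counter)    # counter shared by all workers
            if global_step >= self.max_global_steps:
                stop_event.set()                       # ask every other thread to stop
                break
            time.sleep(0.001)                          # placeholder for one env/update step

stop_event = threading.Event()       # plays the role of tf.train.Coordinator
global_counter = itertools.count()   # same idea as the global step iterator above

workers = [ToyWorker("worker_{}".format(i), global_counter, max_global_steps=1000)
           for i in range(4)]

threads = []
for worker in workers:
    t = threading.Thread(target=worker.run, args=(stop_event,))
    t.start()
    threads.append(t)

for t in threads:                    # equivalent to coord.join(worker_threads)
    t.join()
print("all workers stopped")
```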
-------------------------------------------------------------------------------- /DP/Gamblers Problem.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "### This is Example 4.3. Gambler’s Problem from Sutton's book.\n", 10 | "\n", 11 | "A gambler has the opportunity to make bets on the outcomes of a sequence of coin flips. \n", 12 | "If the coin comes up heads, he wins as many dollars as he has staked on that flip; \n", 13 | "if it is tails, he loses his stake. The game ends when the gambler wins by reaching his goal of $100, \n", 14 | "or loses by running out of money. \n", 15 | "\n", 16 | "On each flip, the gambler must decide what portion of his capital to stake, in integer numbers of dollars. \n", 17 | "This problem can be formulated as an undiscounted, episodic, finite MDP. \n", 18 | "\n", 19 | "The state is the gambler’s capital, s ∈ {1, 2, . . . , 99}.\n", 20 | "The actions are stakes, a ∈ {0, 1, . . . , min(s, 100 − s)}. \n", 21 | "The reward is zero on all transitions except those on which the gambler reaches his goal, when it is +1.\n", 22 | "\n", 23 | "The state-value function then gives the probability of winning from each state. A policy is a mapping from levels of capital to stakes. The optimal policy maximizes the probability of reaching the goal. Let p_h denote the probability of the coin coming up heads. If p_h is known, then the entire problem is known and it can be solved, for instance, by value iteration.\n" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 1, 29 | "metadata": { 30 | "collapsed": true 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "import numpy as np\n", 35 | "import sys\n", 36 | "import matplotlib.pyplot as plt\n", 37 | "if \"../\" not in sys.path:\n", 38 | " sys.path.append(\"../\") " 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": { 44 | "collapsed": true 45 | }, 46 | "source": [ 47 | "\n", 48 | "### Exercise 4.9 (programming)\n", 49 | "\n", 50 | "Implement value iteration for the gambler’s problem and solve it for p_h = 0.25 and p_h = 0.55.\n", 51 | "\n" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 1, 57 | "metadata": { 58 | "collapsed": true 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "def value_iteration_for_gamblers(p_h, theta=0.0001, discount_factor=1.0):\n", 63 | " \"\"\"\n", 64 | " Args:\n", 65 | " p_h: Probability of the coin coming up heads\n", 66 | " \"\"\"\n", 67 | " \n", 68 | " def one_step_lookahead(s, V, rewards):\n", 69 | " \"\"\"\n", 70 | " Helper function to calculate the value for all action in a given state.\n", 71 | " \n", 72 | " Args:\n", 73 | " s: The gambler’s capital. Integer.\n", 74 | " V: The vector that contains values at each state. \n", 75 | " rewards: The reward vector.\n", 76 | " \n", 77 | " Returns:\n", 78 | " A vector containing the expected value of each action. 
\n", 79 | " Its length equals to the number of actions.\n", 80 | " \"\"\"\n", 81 | " \n", 82 | " # Implement!\n", 83 | " \n", 84 | " return A\n", 85 | " \n", 86 | " # Implement!\n", 87 | " \n", 88 | " return policy, V" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": { 95 | "collapsed": true 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "policy, v = value_iteration_for_gamblers(0.25)\n", 100 | "\n", 101 | "print(\"Optimized Policy:\")\n", 102 | "print(policy)\n", 103 | "print(\"\")\n", 104 | "\n", 105 | "print(\"Optimized Value Function:\")\n", 106 | "print(v)\n", 107 | "print(\"\")" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 5, 113 | "metadata": { 114 | "collapsed": true 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "# Plotting Final Policy (action stake) vs State (Capital)\n", 119 | "\n", 120 | "# Implement!" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 6, 126 | "metadata": { 127 | "collapsed": true 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "# Plotting Capital vs Final Policy\n", 132 | "\n", 133 | "# Implement!\n" 134 | ] 135 | } 136 | ], 137 | "metadata": { 138 | "kernelspec": { 139 | "display_name": "Python 3", 140 | "language": "python", 141 | "name": "python3" 142 | }, 143 | "language_info": { 144 | "codemirror_mode": { 145 | "name": "ipython", 146 | "version": 3 147 | }, 148 | "file_extension": ".py", 149 | "mimetype": "text/x-python", 150 | "name": "python", 151 | "nbconvert_exporter": "python", 152 | "pygments_lexer": "ipython3", 153 | "version": "3.6.3" 154 | } 155 | }, 156 | "nbformat": 4, 157 | "nbformat_minor": 1 158 | } 159 | -------------------------------------------------------------------------------- /MC/MC Control with Epsilon-Greedy Policies.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline\n", 12 | "\n", 13 | "import gym\n", 14 | "import matplotlib\n", 15 | "import numpy as np\n", 16 | "import sys\n", 17 | "\n", 18 | "from collections import defaultdict\n", 19 | "if \"../\" not in sys.path:\n", 20 | " sys.path.append(\"../\") \n", 21 | "from lib.envs.blackjack import BlackjackEnv\n", 22 | "from lib import plotting\n", 23 | "\n", 24 | "matplotlib.style.use('ggplot')" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "env = BlackjackEnv()" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": { 42 | "collapsed": true 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "def make_epsilon_greedy_policy(Q, epsilon, nA):\n", 47 | " \"\"\"\n", 48 | " Creates an epsilon-greedy policy based on a given Q-function and epsilon.\n", 49 | " \n", 50 | " Args:\n", 51 | " Q: A dictionary that maps from state -> action-values.\n", 52 | " Each value is a numpy array of length nA (see below)\n", 53 | " epsilon: The probability to select a random action . 
float between 0 and 1.\n", 54 | " nA: Number of actions in the environment.\n", 55 | " \n", 56 | " Returns:\n", 57 | " A function that takes the observation as an argument and returns\n", 58 | " the probabilities for each action in the form of a numpy array of length nA.\n", 59 | " \n", 60 | " \"\"\"\n", 61 | " def policy_fn(observation):\n", 62 | " pass\n", 63 | " # Implement this!\n", 64 | " return policy_fn" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": { 71 | "collapsed": true 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "def mc_control_epsilon_greedy(env, num_episodes, discount_factor=1.0, epsilon=0.1):\n", 76 | " \"\"\"\n", 77 | " Monte Carlo Control using Epsilon-Greedy policies.\n", 78 | " Finds an optimal epsilon-greedy policy.\n", 79 | " \n", 80 | " Args:\n", 81 | " env: OpenAI gym environment.\n", 82 | " num_episodes: Number of episodes to sample.\n", 83 | " discount_factor: Gamma discount factor.\n", 84 | " epsilon: Chance the sample a random action. Float betwen 0 and 1.\n", 85 | " \n", 86 | " Returns:\n", 87 | " A tuple (Q, policy).\n", 88 | " Q is a dictionary mapping state -> action values.\n", 89 | " policy is a function that takes an observation as an argument and returns\n", 90 | " action probabilities\n", 91 | " \"\"\"\n", 92 | " \n", 93 | " # Keeps track of sum and count of returns for each state\n", 94 | " # to calculate an average. We could use an array to save all\n", 95 | " # returns (like in the book) but that's memory inefficient.\n", 96 | " returns_sum = defaultdict(float)\n", 97 | " returns_count = defaultdict(float)\n", 98 | " \n", 99 | " # The final action-value function.\n", 100 | " # A nested dictionary that maps state -> (action -> action-value).\n", 101 | " Q = defaultdict(lambda: np.zeros(env.action_space.n))\n", 102 | " \n", 103 | " # The policy we're following\n", 104 | " policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)\n", 105 | " \n", 106 | " # Implement this!\n", 107 | " \n", 108 | " return Q, policy" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": { 115 | "collapsed": true 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "Q, policy = mc_control_epsilon_greedy(env, num_episodes=500000, epsilon=0.1)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "collapsed": true 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "# For plotting: Create value function from action-value function\n", 131 | "# by picking the best action at each state\n", 132 | "V = defaultdict(float)\n", 133 | "for state, actions in Q.items():\n", 134 | " action_value = np.max(actions)\n", 135 | " V[state] = action_value\n", 136 | "plotting.plot_value_function(V, title=\"Optimal Value Function\")" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": { 143 | "collapsed": true 144 | }, 145 | "outputs": [], 146 | "source": [] 147 | } 148 | ], 149 | "metadata": { 150 | "kernelspec": { 151 | "display_name": "Python 3", 152 | "language": "python", 153 | "name": "python3" 154 | }, 155 | "language_info": { 156 | "codemirror_mode": { 157 | "name": "ipython", 158 | "version": 3 159 | }, 160 | "file_extension": ".py", 161 | "mimetype": "text/x-python", 162 | "name": "python", 163 | "nbconvert_exporter": "python", 164 | "pygments_lexer": "ipython3", 165 | "version": "3.5.2" 166 | } 167 | }, 168 | "nbformat": 4, 169 | "nbformat_minor": 1 170 | } 171 | 
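The notebook above leaves `policy_fn` and the control loop as exercises ("Implement this!"). For reference, one common way to write the epsilon-greedy helper is sketched below; this follows the standard construction and is not guaranteed to match the solution notebook in this repository line for line.

```python
import numpy as np

def make_epsilon_greedy_policy(Q, epsilon, nA):
    """Return a function mapping an observation to action probabilities of length nA."""
    def policy_fn(observation):
        # Spread the exploration probability epsilon evenly over all actions...
        A = np.ones(nA, dtype=float) * epsilon / nA
        # ...and put the remaining (1 - epsilon) mass on the current greedy action.
        best_action = np.argmax(Q[observation])
        A[best_action] += 1.0 - epsilon
        return A
    return policy_fn
```

Inside the control loop, an action is then sampled with `np.random.choice(np.arange(nA), p=policy_fn(state))`, and after each episode `Q` is updated with an incremental average of the observed returns, so the policy becomes greedier as the estimates improve.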
-------------------------------------------------------------------------------- /DP/Policy Evaluation Solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from IPython.core.debugger import set_trace\n", 10 | "import numpy as np\n", 11 | "import pprint\n", 12 | "import sys\n", 13 | "if \"../\" not in sys.path:\n", 14 | " sys.path.append(\"../\") \n", 15 | "from lib.envs.gridworld import GridworldEnv" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "pp = pprint.PrettyPrinter(indent=2)\n", 25 | "env = GridworldEnv()" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 3, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):\n", 35 | " \"\"\"\n", 36 | " Evaluate a policy given an environment and a full description of the environment's dynamics.\n", 37 | " \n", 38 | " Args:\n", 39 | " policy: [S, A] shaped matrix representing the policy.\n", 40 | " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", 41 | " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", 42 | " env.nS is a number of states in the environment. \n", 43 | " env.nA is a number of actions in the environment.\n", 44 | " theta: We stop evaluation once our value function change is less than theta for all states.\n", 45 | " discount_factor: Gamma discount factor.\n", 46 | " \n", 47 | " Returns:\n", 48 | " Vector of length env.nS representing the value function.\n", 49 | " \"\"\"\n", 50 | " # Start with a random (all 0) value function\n", 51 | " V = np.zeros(env.nS)\n", 52 | " while True:\n", 53 | " delta = 0\n", 54 | " # For each state, perform a \"full backup\"\n", 55 | " for s in range(env.nS):\n", 56 | " v = 0\n", 57 | " # Look at the possible next actions\n", 58 | " for a, action_prob in enumerate(policy[s]):\n", 59 | " # For each action, look at the possible next states...\n", 60 | " for prob, next_state, reward, done in env.P[s][a]:\n", 61 | " # Calculate the expected value. Ref: Sutton book eq. 4.6.\n", 62 | " v += action_prob * prob * (reward + discount_factor * V[next_state])\n", 63 | " # How much our value function changed (across any states)\n", 64 | " delta = max(delta, np.abs(v - V[s]))\n", 65 | " V[s] = v\n", 66 | " # Stop evaluating once our value function change is below a threshold\n", 67 | " if delta < theta:\n", 68 | " break\n", 69 | " return np.array(V)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 4, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "random_policy = np.ones([env.nS, env.nA]) / env.nA\n", 79 | "v = policy_eval(random_policy, env)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 5, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | "Value Function:\n", 92 | "[ 0. -13.99993529 -19.99990698 -21.99989761 -13.99993529\n", 93 | " -17.9999206 -19.99991379 -19.99991477 -19.99990698 -19.99991379\n", 94 | " -17.99992725 -13.99994569 -21.99989761 -19.99991477 -13.99994569\n", 95 | " 0. ]\n", 96 | "\n", 97 | "Reshaped Grid Value Function:\n", 98 | "[[ 0. 
-13.99993529 -19.99990698 -21.99989761]\n", 99 | " [-13.99993529 -17.9999206 -19.99991379 -19.99991477]\n", 100 | " [-19.99990698 -19.99991379 -17.99992725 -13.99994569]\n", 101 | " [-21.99989761 -19.99991477 -13.99994569 0. ]]\n", 102 | "\n" 103 | ] 104 | } 105 | ], 106 | "source": [ 107 | "print(\"Value Function:\")\n", 108 | "print(v)\n", 109 | "print(\"\")\n", 110 | "\n", 111 | "print(\"Reshaped Grid Value Function:\")\n", 112 | "print(v.reshape(env.shape))\n", 113 | "print(\"\")" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 6, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "# Test: Make sure the evaluated policy is what we expected\n", 123 | "expected_v = np.array([0, -14, -20, -22, -14, -18, -20, -20, -20, -20, -18, -14, -22, -20, -14, 0])\n", 124 | "np.testing.assert_array_almost_equal(v, expected_v, decimal=2)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [] 133 | } 134 | ], 135 | "metadata": { 136 | "kernelspec": { 137 | "display_name": "Python 3", 138 | "language": "python", 139 | "name": "python3" 140 | }, 141 | "language_info": { 142 | "codemirror_mode": { 143 | "name": "ipython", 144 | "version": 3 145 | }, 146 | "file_extension": ".py", 147 | "mimetype": "text/x-python", 148 | "name": "python", 149 | "nbconvert_exporter": "python", 150 | "pygments_lexer": "ipython3", 151 | "version": "3.6.4" 152 | } 153 | }, 154 | "nbformat": 4, 155 | "nbformat_minor": 1 156 | } 157 | -------------------------------------------------------------------------------- /MC/Off-Policy MC Control with Weighted Importance Sampling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline\n", 12 | "\n", 13 | "import gym\n", 14 | "import matplotlib\n", 15 | "import numpy as np\n", 16 | "import sys\n", 17 | "\n", 18 | "from collections import defaultdict\n", 19 | "if \"../\" not in sys.path:\n", 20 | " sys.path.append(\"../\") \n", 21 | "from lib.envs.blackjack import BlackjackEnv\n", 22 | "from lib import plotting\n", 23 | "\n", 24 | "matplotlib.style.use('ggplot')" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "env = BlackjackEnv()" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": { 42 | "collapsed": true 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "def create_random_policy(nA):\n", 47 | " \"\"\"\n", 48 | " Creates a random policy function.\n", 49 | " \n", 50 | " Args:\n", 51 | " nA: Number of actions in the environment.\n", 52 | " \n", 53 | " Returns:\n", 54 | " A function that takes an observation as input and returns a vector\n", 55 | " of action probabilities\n", 56 | " \"\"\"\n", 57 | " A = np.ones(nA, dtype=float) / nA\n", 58 | " def policy_fn(observation):\n", 59 | " return A\n", 60 | " return policy_fn" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": { 67 | "collapsed": true 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "def create_greedy_policy(Q):\n", 72 | " \"\"\"\n", 73 | " Creates a greedy policy based on Q values.\n", 74 | " \n", 75 | " Args:\n", 76 | " Q: A dictionary that maps from state -> action 
values\n", 77 | " \n", 78 | " Returns:\n", 79 | " A function that takes an observation as input and returns a vector\n", 80 | " of action probabilities.\n", 81 | " \"\"\"\n", 82 | " \n", 83 | " def policy_fn(observation):\n", 84 | " pass\n", 85 | " # Implement this!\n", 86 | " return policy_fn" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": { 93 | "collapsed": true 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "def mc_control_importance_sampling(env, num_episodes, behavior_policy, discount_factor=1.0):\n", 98 | " \"\"\"\n", 99 | " Monte Carlo Control Off-Policy Control using Weighted Importance Sampling.\n", 100 | " Finds an optimal greedy policy.\n", 101 | " \n", 102 | " Args:\n", 103 | " env: OpenAI gym environment.\n", 104 | " num_episodes: Number of episodes to sample.\n", 105 | " behavior_policy: The behavior to follow while generating episodes.\n", 106 | " A function that given an observation returns a vector of probabilities for each action.\n", 107 | " discount_factor: Gamma discount factor.\n", 108 | " \n", 109 | " Returns:\n", 110 | " A tuple (Q, policy).\n", 111 | " Q is a dictionary mapping state -> action values.\n", 112 | " policy is a function that takes an observation as an argument and returns\n", 113 | " action probabilities. This is the optimal greedy policy.\n", 114 | " \"\"\"\n", 115 | " \n", 116 | " # The final action-value function.\n", 117 | " # A dictionary that maps state -> action values\n", 118 | " Q = defaultdict(lambda: np.zeros(env.action_space.n))\n", 119 | " \n", 120 | " # Our greedily policy we want to learn\n", 121 | " target_policy = create_greedy_policy(Q)\n", 122 | " \n", 123 | " # Implement this!\n", 124 | " \n", 125 | " return Q, target_policy" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": { 132 | "collapsed": true 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "random_policy = create_random_policy(env.action_space.n)\n", 137 | "Q, policy = mc_control_importance_sampling(env, num_episodes=500000, behavior_policy=random_policy)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": { 144 | "collapsed": true 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | "# For plotting: Create value function from action-value function\n", 149 | "# by picking the best action at each state\n", 150 | "V = defaultdict(float)\n", 151 | "for state, action_values in Q.items():\n", 152 | " action_value = np.max(action_values)\n", 153 | " V[state] = action_value\n", 154 | "plotting.plot_value_function(V, title=\"Optimal Value Function\")" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": { 161 | "collapsed": true 162 | }, 163 | "outputs": [], 164 | "source": [] 165 | } 166 | ], 167 | "metadata": { 168 | "kernelspec": { 169 | "display_name": "Python 3", 170 | "language": "python", 171 | "name": "python3" 172 | }, 173 | "language_info": { 174 | "codemirror_mode": { 175 | "name": "ipython", 176 | "version": 3 177 | }, 178 | "file_extension": ".py", 179 | "mimetype": "text/x-python", 180 | "name": "python", 181 | "nbconvert_exporter": "python", 182 | "pygments_lexer": "ipython3", 183 | "version": "3.5.2" 184 | } 185 | }, 186 | "nbformat": 4, 187 | "nbformat_minor": 1 188 | } 189 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### 
Overview 2 | 3 | This repository provides code, exercises and solutions for popular Reinforcement Learning algorithms. These are meant to serve as a learning tool to complement the theoretical materials from 4 | 5 | - [Reinforcement Learning: An Introduction (2nd Edition)](http://incompleteideas.net/book/RLbook2018.pdf) 6 | - [David Silver's Reinforcement Learning Course](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html) 7 | 8 | Each folder in corresponds to one or more chapters of the above textbook and/or course. In addition to exercises and solution, each folder also contains a list of learning goals, a brief concept summary, and links to the relevant readings. 9 | 10 | All code is written in Python 3 and uses RL environments from [OpenAI Gym](https://gym.openai.com/). Advanced techniques use [Tensorflow](https://www.tensorflow.org/) for neural network implementations. 11 | 12 | 13 | ### Table of Contents 14 | 15 | - [Introduction to RL problems & OpenAI Gym](Introduction/) 16 | - [MDPs and Bellman Equations](MDP/) 17 | - [Dynamic Programming: Model-Based RL, Policy Iteration and Value Iteration](DP/) 18 | - [Monte Carlo Model-Free Prediction & Control](MC/) 19 | - [Temporal Difference Model-Free Prediction & Control](TD/) 20 | - [Function Approximation](FA/) 21 | - [Deep Q Learning](DQN/) (WIP) 22 | - [Policy Gradient Methods](PolicyGradient/) (WIP) 23 | - Learning and Planning (WIP) 24 | - Exploration and Exploitation (WIP) 25 | 26 | 27 | ### List of Implemented Algorithms 28 | 29 | - [Dynamic Programming Policy Evaluation](DP/Policy%20Evaluation%20Solution.ipynb) 30 | - [Dynamic Programming Policy Iteration](DP/Policy%20Iteration%20Solution.ipynb) 31 | - [Dynamic Programming Value Iteration](DP/Value%20Iteration%20Solution.ipynb) 32 | - [Monte Carlo Prediction](MC/MC%20Prediction%20Solution.ipynb) 33 | - [Monte Carlo Control with Epsilon-Greedy Policies](MC/MC%20Control%20with%20Epsilon-Greedy%20Policies%20Solution.ipynb) 34 | - [Monte Carlo Off-Policy Control with Importance Sampling](MC/Off-Policy%20MC%20Control%20with%20Weighted%20Importance%20Sampling%20Solution.ipynb) 35 | - [SARSA (On Policy TD Learning)](TD/SARSA%20Solution.ipynb) 36 | - [Q-Learning (Off Policy TD Learning)](TD/Q-Learning%20Solution.ipynb) 37 | - [Q-Learning with Linear Function Approximation](FA/Q-Learning%20with%20Value%20Function%20Approximation%20Solution.ipynb) 38 | - [Deep Q-Learning for Atari Games](DQN/Deep%20Q%20Learning%20Solution.ipynb) 39 | - [Double Deep-Q Learning for Atari Games](DQN/Double%20DQN%20Solution.ipynb) 40 | - Deep Q-Learning with Prioritized Experience Replay (WIP) 41 | - [Policy Gradient: REINFORCE with Baseline](PolicyGradient/CliffWalk%20REINFORCE%20with%20Baseline%20Solution.ipynb) 42 | - [Policy Gradient: Actor Critic with Baseline](PolicyGradient/CliffWalk%20Actor%20Critic%20Solution.ipynb) 43 | - [Policy Gradient: Actor Critic with Baseline for Continuous Action Spaces](PolicyGradient/Continuous%20MountainCar%20Actor%20Critic%20Solution.ipynb) 44 | - Deterministic Policy Gradients for Continuous Action Spaces (WIP) 45 | - Deep Deterministic Policy Gradients (DDPG) (WIP) 46 | - [Asynchronous Advantage Actor Critic (A3C)](PolicyGradient/a3c) 47 | 48 | 49 | ### Resources 50 | 51 | Textbooks: 52 | 53 | - [Reinforcement Learning: An Introduction (2nd Edition)](http://incompleteideas.net/book/RLbook2018.pdf) 54 | 55 | Classes: 56 | 57 | - [David Silver's Reinforcement Learning Course (UCL, 2015)](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html) 58 | - [CS294 - 
Deep Reinforcement Learning (Berkeley, Fall 2015)](http://rll.berkeley.edu/deeprlcourse/) 59 | - [CS 8803 - Reinforcement Learning (Georgia Tech)](https://www.udacity.com/course/reinforcement-learning--ud600) 60 | - [CS885 - Reinforcement Learning (UWaterloo), Spring 2018](https://cs.uwaterloo.ca/~ppoupart/teaching/cs885-spring18/) 61 | - [CS294-112 - Deep Reinforcement Learning (UC Berkeley)](http://rail.eecs.berkeley.edu/deeprlcourse/) 62 | 63 | Talks/Tutorials: 64 | 65 | - [Introduction to Reinforcement Learning (Joelle Pineau @ Deep Learning Summer School 2016)](http://videolectures.net/deeplearning2016_pineau_reinforcement_learning/) 66 | - [Deep Reinforcement Learning (Pieter Abbeel @ Deep Learning Summer School 2016)](http://videolectures.net/deeplearning2016_abbeel_deep_reinforcement/) 67 | - [Deep Reinforcement Learning ICML 2016 Tutorial (David Silver)](http://techtalks.tv/talks/deep-reinforcement-learning/62360/) 68 | - [Tutorial: Introduction to Reinforcement Learning with Function Approximation](https://www.youtube.com/watch?v=ggqnxyjaKe4) 69 | - [John Schulman - Deep Reinforcement Learning (4 Lectures)](https://www.youtube.com/playlist?list=PLjKEIQlKCTZYN3CYBlj8r58SbNorobqcp) 70 | - [Deep Reinforcement Learning Slides @ NIPS 2016](http://people.eecs.berkeley.edu/~pabbeel/nips-tutorial-policy-optimization-Schulman-Abbeel.pdf) 71 | - [OpenAI Spinning Up](https://spinningup.openai.com/en/latest/user/introduction.html) 72 | - [Advanced Deep Learning & Reinforcement Learning (UCL 2018, DeepMind)](https://www.youtube.com/playlist?list=PLqYmG7hTraZDNJre23vqCGIVpfZ_K2RZs) 73 | -[Deep RL Bootcamp](https://sites.google.com/view/deep-rl-bootcamp/lectures) 74 | 75 | Other Projects: 76 | 77 | - [carpedm20/deep-rl-tensorflow](https://github.com/carpedm20/deep-rl-tensorflow) 78 | - [matthiasplappert/keras-rl](https://github.com/matthiasplappert/keras-rl) 79 | 80 | Selected Papers: 81 | 82 | - [Human-Level Control through Deep Reinforcement Learning (2015-02)](http://www.readcube.com/articles/10.1038/nature14236) 83 | - [Deep Reinforcement Learning with Double Q-learning (2015-09)](http://arxiv.org/abs/1509.06461) 84 | - [Continuous control with deep reinforcement learning (2015-09)](https://arxiv.org/abs/1509.02971) 85 | - [Prioritized Experience Replay (2015-11)](http://arxiv.org/abs/1511.05952) 86 | - [Dueling Network Architectures for Deep Reinforcement Learning (2015-11)](http://arxiv.org/abs/1511.06581) 87 | - [Asynchronous Methods for Deep Reinforcement Learning (2016-02)](http://arxiv.org/abs/1602.01783) 88 | - [Deep Reinforcement Learning from Self-Play in Imperfect-Information Games (2016-03)](http://arxiv.org/abs/1603.01121) 89 | - [Mastering the game of Go with deep neural networks and tree search](https://gogameguru.com/i/2016/03/deepmind-mastering-go.pdf) 90 | -------------------------------------------------------------------------------- /DP/Value Iteration Solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pprint\n", 11 | "import sys\n", 12 | "if \"../\" not in sys.path:\n", 13 | " sys.path.append(\"../\") \n", 14 | "from lib.envs.gridworld import GridworldEnv" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "pp = pprint.PrettyPrinter(indent=2)\n", 24 | "env = 
GridworldEnv()" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "def value_iteration(env, theta=0.0001, discount_factor=1.0):\n", 34 | " \"\"\"\n", 35 | " Value Iteration Algorithm.\n", 36 | " \n", 37 | " Args:\n", 38 | " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", 39 | " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", 40 | " env.nS is a number of states in the environment. \n", 41 | " env.nA is a number of actions in the environment.\n", 42 | " theta: We stop evaluation once our value function change is less than theta for all states.\n", 43 | " discount_factor: Gamma discount factor.\n", 44 | " \n", 45 | " Returns:\n", 46 | " A tuple (policy, V) of the optimal policy and the optimal value function.\n", 47 | " \"\"\"\n", 48 | " \n", 49 | " def one_step_lookahead(state, V):\n", 50 | " \"\"\"\n", 51 | " Helper function to calculate the value for all action in a given state.\n", 52 | " \n", 53 | " Args:\n", 54 | " state: The state to consider (int)\n", 55 | " V: The value to use as an estimator, Vector of length env.nS\n", 56 | " \n", 57 | " Returns:\n", 58 | " A vector of length env.nA containing the expected value of each action.\n", 59 | " \"\"\"\n", 60 | " A = np.zeros(env.nA)\n", 61 | " for a in range(env.nA):\n", 62 | " for prob, next_state, reward, done in env.P[state][a]:\n", 63 | " A[a] += prob * (reward + discount_factor * V[next_state])\n", 64 | " return A\n", 65 | " \n", 66 | " V = np.zeros(env.nS)\n", 67 | " while True:\n", 68 | " # Stopping condition\n", 69 | " delta = 0\n", 70 | " # Update each state...\n", 71 | " for s in range(env.nS):\n", 72 | " # Do a one-step lookahead to find the best action\n", 73 | " A = one_step_lookahead(s, V)\n", 74 | " best_action_value = np.max(A)\n", 75 | " # Calculate delta across all states seen so far\n", 76 | " delta = max(delta, np.abs(best_action_value - V[s]))\n", 77 | " # Update the value function. Ref: Sutton book eq. 4.10. \n", 78 | " V[s] = best_action_value \n", 79 | " # Check if we can stop \n", 80 | " if delta < theta:\n", 81 | " break\n", 82 | " \n", 83 | " # Create a deterministic policy using the optimal value function\n", 84 | " policy = np.zeros([env.nS, env.nA])\n", 85 | " for s in range(env.nS):\n", 86 | " # One step lookahead to find the best action for this state\n", 87 | " A = one_step_lookahead(s, V)\n", 88 | " best_action = np.argmax(A)\n", 89 | " # Always take the best action\n", 90 | " policy[s, best_action] = 1.0\n", 91 | " \n", 92 | " return policy, V" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 4, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "Policy Probability Distribution:\n", 105 | "[[1. 0. 0. 0.]\n", 106 | " [0. 0. 0. 1.]\n", 107 | " [0. 0. 0. 1.]\n", 108 | " [0. 0. 1. 0.]\n", 109 | " [1. 0. 0. 0.]\n", 110 | " [1. 0. 0. 0.]\n", 111 | " [1. 0. 0. 0.]\n", 112 | " [0. 0. 1. 0.]\n", 113 | " [1. 0. 0. 0.]\n", 114 | " [1. 0. 0. 0.]\n", 115 | " [0. 1. 0. 0.]\n", 116 | " [0. 0. 1. 0.]\n", 117 | " [1. 0. 0. 0.]\n", 118 | " [0. 1. 0. 0.]\n", 119 | " [0. 1. 0. 0.]\n", 120 | " [1. 0. 0. 0.]]\n", 121 | "\n", 122 | "Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):\n", 123 | "[[0 3 3 2]\n", 124 | " [0 0 0 2]\n", 125 | " [0 0 1 2]\n", 126 | " [0 1 1 0]]\n", 127 | "\n", 128 | "Value Function:\n", 129 | "[ 0. -1. -2. -3. -1. -2. -3. -2. -2. -3. -2. -1. -3. 
-2. -1. 0.]\n", 130 | "\n", 131 | "Reshaped Grid Value Function:\n", 132 | "[[ 0. -1. -2. -3.]\n", 133 | " [-1. -2. -3. -2.]\n", 134 | " [-2. -3. -2. -1.]\n", 135 | " [-3. -2. -1. 0.]]\n", 136 | "\n" 137 | ] 138 | } 139 | ], 140 | "source": [ 141 | "policy, v = value_iteration(env)\n", 142 | "\n", 143 | "print(\"Policy Probability Distribution:\")\n", 144 | "print(policy)\n", 145 | "print(\"\")\n", 146 | "\n", 147 | "print(\"Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):\")\n", 148 | "print(np.reshape(np.argmax(policy, axis=1), env.shape))\n", 149 | "print(\"\")\n", 150 | "\n", 151 | "print(\"Value Function:\")\n", 152 | "print(v)\n", 153 | "print(\"\")\n", 154 | "\n", 155 | "print(\"Reshaped Grid Value Function:\")\n", 156 | "print(v.reshape(env.shape))\n", 157 | "print(\"\")" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 5, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "# Test the value function\n", 167 | "expected_v = np.array([ 0, -1, -2, -3, -1, -2, -3, -2, -2, -3, -2, -1, -3, -2, -1, 0])\n", 168 | "np.testing.assert_array_almost_equal(v, expected_v, decimal=2)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [] 177 | } 178 | ], 179 | "metadata": { 180 | "anaconda-cloud": {}, 181 | "kernelspec": { 182 | "display_name": "Python 3", 183 | "language": "python", 184 | "name": "python3" 185 | }, 186 | "language_info": { 187 | "codemirror_mode": { 188 | "name": "ipython", 189 | "version": 3 190 | }, 191 | "file_extension": ".py", 192 | "mimetype": "text/x-python", 193 | "name": "python", 194 | "nbconvert_exporter": "python", 195 | "pygments_lexer": "ipython3", 196 | "version": "3.6.4" 197 | } 198 | }, 199 | "nbformat": 4, 200 | "nbformat_minor": 1 201 | } 202 | -------------------------------------------------------------------------------- /PolicyGradient/a3c/estimators.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | def build_shared_network(X, add_summaries=False): 5 | """ 6 | Builds a 3-layer network conv -> conv -> fc as described 7 | in the A3C paper. This network is shared by both the policy and value net. 8 | 9 | Args: 10 | X: Inputs 11 | add_summaries: If true, add layer summaries to Tensorboard. 12 | 13 | Returns: 14 | Final layer activations. 15 | """ 16 | 17 | # Three convolutional layers 18 | conv1 = tf.contrib.layers.conv2d( 19 | X, 16, 8, 4, activation_fn=tf.nn.relu, scope="conv1") 20 | conv2 = tf.contrib.layers.conv2d( 21 | conv1, 32, 4, 2, activation_fn=tf.nn.relu, scope="conv2") 22 | 23 | # Fully connected layer 24 | fc1 = tf.contrib.layers.fully_connected( 25 | inputs=tf.contrib.layers.flatten(conv2), 26 | num_outputs=256, 27 | scope="fc1") 28 | 29 | if add_summaries: 30 | tf.contrib.layers.summarize_activation(conv1) 31 | tf.contrib.layers.summarize_activation(conv2) 32 | tf.contrib.layers.summarize_activation(fc1) 33 | 34 | return fc1 35 | 36 | class PolicyEstimator(): 37 | """ 38 | Policy Function approximator. Given a observation, returns probabilities 39 | over all possible actions. 40 | 41 | Args: 42 | num_outputs: Size of the action space. 43 | reuse: If true, an existing shared network will be re-used. 44 | trainable: If true we add train ops to the network. 45 | Actor threads that don't update their local models and don't need 46 | train ops would set this to false. 
47 | """ 48 | 49 | def __init__(self, num_outputs, reuse=False, trainable=True): 50 | self.num_outputs = num_outputs 51 | 52 | # Placeholders for our input 53 | # Our input are 4 RGB frames of shape 160, 160 each 54 | self.states = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name="X") 55 | # The TD target value 56 | self.targets = tf.placeholder(shape=[None], dtype=tf.float32, name="y") 57 | # Integer id of which action was selected 58 | self.actions = tf.placeholder(shape=[None], dtype=tf.int32, name="actions") 59 | 60 | # Normalize 61 | X = tf.to_float(self.states) / 255.0 62 | batch_size = tf.shape(self.states)[0] 63 | 64 | # Graph shared with Value Net 65 | with tf.variable_scope("shared", reuse=reuse): 66 | fc1 = build_shared_network(X, add_summaries=(not reuse)) 67 | 68 | 69 | with tf.variable_scope("policy_net"): 70 | self.logits = tf.contrib.layers.fully_connected(fc1, num_outputs, activation_fn=None) 71 | self.probs = tf.nn.softmax(self.logits) + 1e-8 72 | 73 | self.predictions = { 74 | "logits": self.logits, 75 | "probs": self.probs 76 | } 77 | 78 | # We add entropy to the loss to encourage exploration 79 | self.entropy = -tf.reduce_sum(self.probs * tf.log(self.probs), 1, name="entropy") 80 | self.entropy_mean = tf.reduce_mean(self.entropy, name="entropy_mean") 81 | 82 | # Get the predictions for the chosen actions only 83 | gather_indices = tf.range(batch_size) * tf.shape(self.probs)[1] + self.actions 84 | self.picked_action_probs = tf.gather(tf.reshape(self.probs, [-1]), gather_indices) 85 | 86 | self.losses = - (tf.log(self.picked_action_probs) * self.targets + 0.01 * self.entropy) 87 | self.loss = tf.reduce_sum(self.losses, name="loss") 88 | 89 | tf.summary.scalar(self.loss.op.name, self.loss) 90 | tf.summary.scalar(self.entropy_mean.op.name, self.entropy_mean) 91 | tf.summary.histogram(self.entropy.op.name, self.entropy) 92 | 93 | if trainable: 94 | # self.optimizer = tf.train.AdamOptimizer(1e-4) 95 | self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6) 96 | self.grads_and_vars = self.optimizer.compute_gradients(self.loss) 97 | self.grads_and_vars = [[grad, var] for grad, var in self.grads_and_vars if grad is not None] 98 | self.train_op = self.optimizer.apply_gradients(self.grads_and_vars, 99 | global_step=tf.contrib.framework.get_global_step()) 100 | 101 | # Merge summaries from this network and the shared network (but not the value net) 102 | var_scope_name = tf.get_variable_scope().name 103 | summary_ops = tf.get_collection(tf.GraphKeys.SUMMARIES) 104 | sumaries = [s for s in summary_ops if "policy_net" in s.name or "shared" in s.name] 105 | sumaries = [s for s in summary_ops if var_scope_name in s.name] 106 | self.summaries = tf.summary.merge(sumaries) 107 | 108 | 109 | class ValueEstimator(): 110 | """ 111 | Value Function approximator. Returns a value estimator for a batch of observations. 112 | 113 | Args: 114 | reuse: If true, an existing shared network will be re-used. 115 | trainable: If true we add train ops to the network. 116 | Actor threads that don't update their local models and don't need 117 | train ops would set this to false. 
118 | """ 119 | 120 | def __init__(self, reuse=False, trainable=True): 121 | # Placeholders for our input 122 | # Our input are 4 RGB frames of shape 160, 160 each 123 | self.states = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name="X") 124 | # The TD target value 125 | self.targets = tf.placeholder(shape=[None], dtype=tf.float32, name="y") 126 | 127 | X = tf.to_float(self.states) / 255.0 128 | 129 | # Graph shared with Value Net 130 | with tf.variable_scope("shared", reuse=reuse): 131 | fc1 = build_shared_network(X, add_summaries=(not reuse)) 132 | 133 | with tf.variable_scope("value_net"): 134 | self.logits = tf.contrib.layers.fully_connected( 135 | inputs=fc1, 136 | num_outputs=1, 137 | activation_fn=None) 138 | self.logits = tf.squeeze(self.logits, squeeze_dims=[1], name="logits") 139 | 140 | self.losses = tf.squared_difference(self.logits, self.targets) 141 | self.loss = tf.reduce_sum(self.losses, name="loss") 142 | 143 | self.predictions = { 144 | "logits": self.logits 145 | } 146 | 147 | # Summaries 148 | prefix = tf.get_variable_scope().name 149 | tf.summary.scalar(self.loss.name, self.loss) 150 | tf.summary.scalar("{}/max_value".format(prefix), tf.reduce_max(self.logits)) 151 | tf.summary.scalar("{}/min_value".format(prefix), tf.reduce_min(self.logits)) 152 | tf.summary.scalar("{}/mean_value".format(prefix), tf.reduce_mean(self.logits)) 153 | tf.summary.scalar("{}/reward_max".format(prefix), tf.reduce_max(self.targets)) 154 | tf.summary.scalar("{}/reward_min".format(prefix), tf.reduce_min(self.targets)) 155 | tf.summary.scalar("{}/reward_mean".format(prefix), tf.reduce_mean(self.targets)) 156 | tf.summary.histogram("{}/reward_targets".format(prefix), self.targets) 157 | tf.summary.histogram("{}/values".format(prefix), self.logits) 158 | 159 | if trainable: 160 | # self.optimizer = tf.train.AdamOptimizer(1e-4) 161 | self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6) 162 | self.grads_and_vars = self.optimizer.compute_gradients(self.loss) 163 | self.grads_and_vars = [[grad, var] for grad, var in self.grads_and_vars if grad is not None] 164 | self.train_op = self.optimizer.apply_gradients(self.grads_and_vars, 165 | global_step=tf.contrib.framework.get_global_step()) 166 | 167 | var_scope_name = tf.get_variable_scope().name 168 | summary_ops = tf.get_collection(tf.GraphKeys.SUMMARIES) 169 | sumaries = [s for s in summary_ops if "policy_net" in s.name or "shared" in s.name] 170 | sumaries = [s for s in summary_ops if var_scope_name in s.name] 171 | self.summaries = tf.summary.merge(sumaries) 172 | -------------------------------------------------------------------------------- /DP/Policy Evaluation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 23, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import sys\n", 13 | "if \"../\" not in sys.path:\n", 14 | " sys.path.append(\"../\") \n", 15 | "from lib.envs.gridworld import GridworldEnv" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 24, 21 | "metadata": { 22 | "collapsed": true 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "env = GridworldEnv()" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 25, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):\n", 38 | " \"\"\"\n", 
39 | " Evaluate a policy given an environment and a full description of the environment's dynamics.\n", 40 | " \n", 41 | " Args:\n", 42 | " policy: [S, A] shaped matrix representing the policy.\n", 43 | " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", 44 | " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", 45 | " env.nS is a number of states in the environment. \n", 46 | " env.nA is a number of actions in the environment.\n", 47 | " theta: We stop evaluation once our value function change is less than theta for all states.\n", 48 | " discount_factor: Gamma discount factor.\n", 49 | " \n", 50 | " Returns:\n", 51 | " Vector of length env.nS representing the value function.\n", 52 | " \"\"\"\n", 53 | " # Start with a random (all 0) value function\n", 54 | " V = np.zeros(env.nS)\n", 55 | " while True:\n", 56 | " # TODO: Implement!\n", 57 | " break\n", 58 | " return np.array(V)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 26, 64 | "metadata": { 65 | "collapsed": true 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "random_policy = np.ones([env.nS, env.nA]) / env.nA\n", 70 | "v = policy_eval(random_policy, env)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 22, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "ename": "AssertionError", 80 | "evalue": "\nArrays are not almost equal to 2 decimals\n\n(mismatch 87.5%)\n x: array([ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n 0., 0., 0.])\n y: array([ 0, -14, -20, -22, -14, -18, -20, -20, -20, -20, -18, -14, -22,\n -20, -14, 0])", 81 | "output_type": "error", 82 | "traceback": [ 83 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 84 | "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", 85 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Test: Make sure the evaluated policy is what we expected\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mexpected_v\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m14\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m20\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m22\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m14\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m18\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m20\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m20\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m20\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m20\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m18\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m14\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m22\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m20\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m14\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m 
\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtesting\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0massert_array_almost_equal\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexpected_v\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdecimal\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 86 | "\u001b[0;32m/Users/dennybritz/venvs/tf/lib/python3.5/site-packages/numpy/testing/utils.py\u001b[0m in \u001b[0;36massert_array_almost_equal\u001b[0;34m(x, y, decimal, err_msg, verbose)\u001b[0m\n\u001b[1;32m 914\u001b[0m assert_array_compare(compare, x, y, err_msg=err_msg, verbose=verbose,\n\u001b[1;32m 915\u001b[0m \u001b[0mheader\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Arrays are not almost equal to %d decimals'\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mdecimal\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 916\u001b[0;31m precision=decimal)\n\u001b[0m\u001b[1;32m 917\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 918\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 87 | "\u001b[0;32m/Users/dennybritz/venvs/tf/lib/python3.5/site-packages/numpy/testing/utils.py\u001b[0m in \u001b[0;36massert_array_compare\u001b[0;34m(comparison, x, y, err_msg, verbose, header, precision)\u001b[0m\n\u001b[1;32m 735\u001b[0m names=('x', 'y'), precision=precision)\n\u001b[1;32m 736\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mcond\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 737\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mAssertionError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 738\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 739\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtraceback\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 88 | "\u001b[0;31mAssertionError\u001b[0m: \nArrays are not almost equal to 2 decimals\n\n(mismatch 87.5%)\n x: array([ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n 0., 0., 0.])\n y: array([ 0, -14, -20, -22, -14, -18, -20, -20, -20, -20, -18, -14, -22,\n -20, -14, 0])" 89 | ] 90 | } 91 | ], 92 | "source": [ 93 | "# Test: Make sure the evaluated policy is what we expected\n", 94 | "expected_v = np.array([0, -14, -20, -22, -14, -18, -20, -20, -20, -20, -18, -14, -22, -20, -14, 0])\n", 95 | "np.testing.assert_array_almost_equal(v, expected_v, decimal=2)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": { 102 | "collapsed": true 103 | }, 104 | "outputs": [], 105 | "source": [] 106 | } 107 | ], 108 | "metadata": { 109 | "kernelspec": { 110 | "display_name": "Python 3", 111 | "language": "python", 112 | "name": "python3" 113 | }, 114 | "language_info": { 115 | "codemirror_mode": { 116 | "name": "ipython", 117 | "version": 3 118 | }, 119 | "file_extension": ".py", 120 | "mimetype": "text/x-python", 121 | "name": "python", 122 | "nbconvert_exporter": "python", 123 | "pygments_lexer": "ipython3", 124 | "version": "3.5.2" 125 | } 126 | }, 127 | "nbformat": 4, 128 | "nbformat_minor": 1 129 | } 130 | -------------------------------------------------------------------------------- /PolicyGradient/a3c/worker.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import sys 3 | import os 4 | import itertools 5 
| import collections 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | from inspect import getsourcefile 10 | current_path = os.path.dirname(os.path.abspath(getsourcefile(lambda:0))) 11 | import_path = os.path.abspath(os.path.join(current_path, "../..")) 12 | 13 | if import_path not in sys.path: 14 | sys.path.append(import_path) 15 | 16 | # from lib import plotting 17 | from lib.atari.state_processor import StateProcessor 18 | from lib.atari import helpers as atari_helpers 19 | from estimators import ValueEstimator, PolicyEstimator 20 | 21 | Transition = collections.namedtuple("Transition", ["state", "action", "reward", "next_state", "done"]) 22 | 23 | 24 | def make_copy_params_op(v1_list, v2_list): 25 | """ 26 | Creates an operation that copies parameters from variable in v1_list to variables in v2_list. 27 | The ordering of the variables in the lists must be identical. 28 | """ 29 | v1_list = list(sorted(v1_list, key=lambda v: v.name)) 30 | v2_list = list(sorted(v2_list, key=lambda v: v.name)) 31 | 32 | update_ops = [] 33 | for v1, v2 in zip(v1_list, v2_list): 34 | op = v2.assign(v1) 35 | update_ops.append(op) 36 | 37 | return update_ops 38 | 39 | def make_train_op(local_estimator, global_estimator): 40 | """ 41 | Creates an op that applies local estimator gradients 42 | to the global estimator. 43 | """ 44 | local_grads, _ = zip(*local_estimator.grads_and_vars) 45 | # Clip gradients 46 | local_grads, _ = tf.clip_by_global_norm(local_grads, 5.0) 47 | _, global_vars = zip(*global_estimator.grads_and_vars) 48 | local_global_grads_and_vars = list(zip(local_grads, global_vars)) 49 | return global_estimator.optimizer.apply_gradients(local_global_grads_and_vars, 50 | global_step=tf.contrib.framework.get_global_step()) 51 | 52 | 53 | class Worker(object): 54 | """ 55 | An A3C worker thread. Runs episodes locally and updates global shared value and policy nets. 
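    Note (editor's addition, hedged): the run loop below alternates three steps,
    glued together by the two helpers defined above:

        sess.run(self.copy_params_op)                       # 1. sync: global -> local weights
        transitions, _, _ = self.run_n_steps(t_max, sess)   # 2. collect experience locally
        self.update(transitions, sess)                      # 3. apply local gradients to the
                                                            #    global nets via make_train_op
                                                            #    (clipped at global norm 5.0)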
56 | 57 | Args: 58 | name: A unique name for this worker 59 | env: The Gym environment used by this worker 60 | policy_net: Instance of the globally shared policy net 61 | value_net: Instance of the globally shared value net 62 | global_counter: Iterator that holds the global step 63 | discount_factor: Reward discount factor 64 | summary_writer: A tf.train.SummaryWriter for Tensorboard summaries 65 | max_global_steps: If set, stop coordinator when global_counter > max_global_steps 66 | """ 67 | def __init__(self, name, env, policy_net, value_net, global_counter, discount_factor=0.99, summary_writer=None, max_global_steps=None): 68 | self.name = name 69 | self.discount_factor = discount_factor 70 | self.max_global_steps = max_global_steps 71 | self.global_step = tf.contrib.framework.get_global_step() 72 | self.global_policy_net = policy_net 73 | self.global_value_net = value_net 74 | self.global_counter = global_counter 75 | self.local_counter = itertools.count() 76 | self.sp = StateProcessor() 77 | self.summary_writer = summary_writer 78 | self.env = env 79 | 80 | # Create local policy/value nets that are not updated asynchronously 81 | with tf.variable_scope(name): 82 | self.policy_net = PolicyEstimator(policy_net.num_outputs) 83 | self.value_net = ValueEstimator(reuse=True) 84 | 85 | # Op to copy params from global policy/valuenets 86 | self.copy_params_op = make_copy_params_op( 87 | tf.contrib.slim.get_variables(scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES), 88 | tf.contrib.slim.get_variables(scope=self.name+'/', collection=tf.GraphKeys.TRAINABLE_VARIABLES)) 89 | 90 | self.vnet_train_op = make_train_op(self.value_net, self.global_value_net) 91 | self.pnet_train_op = make_train_op(self.policy_net, self.global_policy_net) 92 | 93 | self.state = None 94 | 95 | def run(self, sess, coord, t_max): 96 | with sess.as_default(), sess.graph.as_default(): 97 | # Initial state 98 | self.state = atari_helpers.atari_make_initial_state(self.sp.process(self.env.reset())) 99 | try: 100 | while not coord.should_stop(): 101 | # Copy Parameters from the global networks 102 | sess.run(self.copy_params_op) 103 | 104 | # Collect some experience 105 | transitions, local_t, global_t = self.run_n_steps(t_max, sess) 106 | 107 | if self.max_global_steps is not None and global_t >= self.max_global_steps: 108 | tf.logging.info("Reached global step {}. 
Stopping.".format(global_t)) 109 | coord.request_stop() 110 | return 111 | 112 | # Update the global networks 113 | self.update(transitions, sess) 114 | 115 | except tf.errors.CancelledError: 116 | return 117 | 118 | def _policy_net_predict(self, state, sess): 119 | feed_dict = { self.policy_net.states: [state] } 120 | preds = sess.run(self.policy_net.predictions, feed_dict) 121 | return preds["probs"][0] 122 | 123 | def _value_net_predict(self, state, sess): 124 | feed_dict = { self.value_net.states: [state] } 125 | preds = sess.run(self.value_net.predictions, feed_dict) 126 | return preds["logits"][0] 127 | 128 | def run_n_steps(self, n, sess): 129 | transitions = [] 130 | for _ in range(n): 131 | # Take a step 132 | action_probs = self._policy_net_predict(self.state, sess) 133 | action = np.random.choice(np.arange(len(action_probs)), p=action_probs) 134 | next_state, reward, done, _ = self.env.step(action) 135 | next_state = atari_helpers.atari_make_next_state(self.state, self.sp.process(next_state)) 136 | 137 | # Store transition 138 | transitions.append(Transition( 139 | state=self.state, action=action, reward=reward, next_state=next_state, done=done)) 140 | 141 | # Increase local and global counters 142 | local_t = next(self.local_counter) 143 | global_t = next(self.global_counter) 144 | 145 | if local_t % 100 == 0: 146 | tf.logging.info("{}: local Step {}, global step {}".format(self.name, local_t, global_t)) 147 | 148 | if done: 149 | self.state = atari_helpers.atari_make_initial_state(self.sp.process(self.env.reset())) 150 | break 151 | else: 152 | self.state = next_state 153 | return transitions, local_t, global_t 154 | 155 | def update(self, transitions, sess): 156 | """ 157 | Updates global policy and value networks based on collected experience 158 | 159 | Args: 160 | transitions: A list of experience transitions 161 | sess: A Tensorflow session 162 | """ 163 | 164 | # If we episode was not done we bootstrap the value from the last state 165 | reward = 0.0 166 | if not transitions[-1].done: 167 | reward = self._value_net_predict(transitions[-1].next_state, sess) 168 | 169 | # Accumulate minibatch exmaples 170 | states = [] 171 | policy_targets = [] 172 | value_targets = [] 173 | actions = [] 174 | 175 | for transition in transitions[::-1]: 176 | reward = transition.reward + self.discount_factor * reward 177 | policy_target = (reward - self._value_net_predict(transition.state, sess)) 178 | # Accumulate updates 179 | states.append(transition.state) 180 | actions.append(transition.action) 181 | policy_targets.append(policy_target) 182 | value_targets.append(reward) 183 | 184 | feed_dict = { 185 | self.policy_net.states: np.array(states), 186 | self.policy_net.targets: policy_targets, 187 | self.policy_net.actions: actions, 188 | self.value_net.states: np.array(states), 189 | self.value_net.targets: value_targets, 190 | } 191 | 192 | # Train the global estimators using local gradients 193 | global_step, pnet_loss, vnet_loss, _, _, pnet_summaries, vnet_summaries = sess.run([ 194 | self.global_step, 195 | self.policy_net.loss, 196 | self.value_net.loss, 197 | self.pnet_train_op, 198 | self.vnet_train_op, 199 | self.policy_net.summaries, 200 | self.value_net.summaries 201 | ], feed_dict) 202 | 203 | # Write summaries 204 | if self.summary_writer is not None: 205 | self.summary_writer.add_summary(pnet_summaries, global_step) 206 | self.summary_writer.add_summary(vnet_summaries, global_step) 207 | self.summary_writer.flush() 208 | 209 | return pnet_loss, vnet_loss, pnet_summaries, 
vnet_summaries 210 | -------------------------------------------------------------------------------- /MC/Blackjack Playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import sys\n", 11 | "if \"../\" not in sys.path:\n", 12 | " sys.path.append(\"../\") \n", 13 | "from lib.envs.blackjack import BlackjackEnv" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "env = BlackjackEnv()" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 3, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "name": "stdout", 32 | "output_type": "stream", 33 | "text": [ 34 | "Player Score: 19 (Usable Ace: False), Dealer Score: 5\n", 35 | "Taking action: Hit\n", 36 | "Player Score: 27 (Usable Ace: False), Dealer Score: 5\n", 37 | "Game end. Reward: -1.0\n", 38 | "\n", 39 | "Player Score: 21 (Usable Ace: True), Dealer Score: 10\n", 40 | "Taking action: Stick\n", 41 | "Player Score: 21 (Usable Ace: True), Dealer Score: 10\n", 42 | "Game end. Reward: 0.0\n", 43 | "\n", 44 | "Player Score: 21 (Usable Ace: True), Dealer Score: 10\n", 45 | "Taking action: Stick\n", 46 | "Player Score: 21 (Usable Ace: True), Dealer Score: 10\n", 47 | "Game end. Reward: 1.0\n", 48 | "\n", 49 | "Player Score: 14 (Usable Ace: True), Dealer Score: 10\n", 50 | "Taking action: Hit\n", 51 | "Player Score: 19 (Usable Ace: True), Dealer Score: 10\n", 52 | "Taking action: Hit\n", 53 | "Player Score: 15 (Usable Ace: False), Dealer Score: 10\n", 54 | "Taking action: Hit\n", 55 | "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", 56 | "Taking action: Stick\n", 57 | "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", 58 | "Game end. Reward: 1.0\n", 59 | "\n", 60 | "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", 61 | "Taking action: Stick\n", 62 | "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", 63 | "Game end. Reward: 1.0\n", 64 | "\n", 65 | "Player Score: 18 (Usable Ace: False), Dealer Score: 6\n", 66 | "Taking action: Hit\n", 67 | "Player Score: 27 (Usable Ace: False), Dealer Score: 6\n", 68 | "Game end. Reward: -1.0\n", 69 | "\n", 70 | "Player Score: 16 (Usable Ace: False), Dealer Score: 3\n", 71 | "Taking action: Hit\n", 72 | "Player Score: 18 (Usable Ace: False), Dealer Score: 3\n", 73 | "Taking action: Hit\n", 74 | "Player Score: 23 (Usable Ace: False), Dealer Score: 3\n", 75 | "Game end. Reward: -1.0\n", 76 | "\n", 77 | "Player Score: 19 (Usable Ace: False), Dealer Score: 10\n", 78 | "Taking action: Hit\n", 79 | "Player Score: 23 (Usable Ace: False), Dealer Score: 10\n", 80 | "Game end. Reward: -1.0\n", 81 | "\n", 82 | "Player Score: 19 (Usable Ace: False), Dealer Score: 4\n", 83 | "Taking action: Hit\n", 84 | "Player Score: 21 (Usable Ace: False), Dealer Score: 4\n", 85 | "Taking action: Stick\n", 86 | "Player Score: 21 (Usable Ace: False), Dealer Score: 4\n", 87 | "Game end. Reward: 1.0\n", 88 | "\n", 89 | "Player Score: 21 (Usable Ace: True), Dealer Score: 4\n", 90 | "Taking action: Stick\n", 91 | "Player Score: 21 (Usable Ace: True), Dealer Score: 4\n", 92 | "Game end. 
Reward: 1.0\n", 93 | "\n", 94 | "Player Score: 16 (Usable Ace: True), Dealer Score: 10\n", 95 | "Taking action: Hit\n", 96 | "Player Score: 16 (Usable Ace: False), Dealer Score: 10\n", 97 | "Taking action: Hit\n", 98 | "Player Score: 26 (Usable Ace: False), Dealer Score: 10\n", 99 | "Game end. Reward: -1.0\n", 100 | "\n", 101 | "Player Score: 14 (Usable Ace: False), Dealer Score: 10\n", 102 | "Taking action: Hit\n", 103 | "Player Score: 23 (Usable Ace: False), Dealer Score: 10\n", 104 | "Game end. Reward: -1.0\n", 105 | "\n", 106 | "Player Score: 12 (Usable Ace: False), Dealer Score: 10\n", 107 | "Taking action: Hit\n", 108 | "Player Score: 15 (Usable Ace: False), Dealer Score: 10\n", 109 | "Taking action: Hit\n", 110 | "Player Score: 16 (Usable Ace: False), Dealer Score: 10\n", 111 | "Taking action: Hit\n", 112 | "Player Score: 26 (Usable Ace: False), Dealer Score: 10\n", 113 | "Game end. Reward: -1.0\n", 114 | "\n", 115 | "Player Score: 16 (Usable Ace: True), Dealer Score: 8\n", 116 | "Taking action: Hit\n", 117 | "Player Score: 18 (Usable Ace: True), Dealer Score: 8\n", 118 | "Taking action: Hit\n", 119 | "Player Score: 18 (Usable Ace: False), Dealer Score: 8\n", 120 | "Taking action: Hit\n", 121 | "Player Score: 20 (Usable Ace: False), Dealer Score: 8\n", 122 | "Taking action: Stick\n", 123 | "Player Score: 20 (Usable Ace: False), Dealer Score: 8\n", 124 | "Game end. Reward: 1.0\n", 125 | "\n", 126 | "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", 127 | "Taking action: Stick\n", 128 | "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", 129 | "Game end. Reward: -1.0\n", 130 | "\n", 131 | "Player Score: 15 (Usable Ace: False), Dealer Score: 10\n", 132 | "Taking action: Hit\n", 133 | "Player Score: 16 (Usable Ace: False), Dealer Score: 10\n", 134 | "Taking action: Hit\n", 135 | "Player Score: 23 (Usable Ace: False), Dealer Score: 10\n", 136 | "Game end. Reward: -1.0\n", 137 | "\n", 138 | "Player Score: 12 (Usable Ace: False), Dealer Score: 4\n", 139 | "Taking action: Hit\n", 140 | "Player Score: 16 (Usable Ace: False), Dealer Score: 4\n", 141 | "Taking action: Hit\n", 142 | "Player Score: 24 (Usable Ace: False), Dealer Score: 4\n", 143 | "Game end. Reward: -1.0\n", 144 | "\n", 145 | "Player Score: 20 (Usable Ace: False), Dealer Score: 7\n", 146 | "Taking action: Stick\n", 147 | "Player Score: 20 (Usable Ace: False), Dealer Score: 7\n", 148 | "Game end. Reward: 1.0\n", 149 | "\n", 150 | "Player Score: 15 (Usable Ace: False), Dealer Score: 7\n", 151 | "Taking action: Hit\n", 152 | "Player Score: 21 (Usable Ace: False), Dealer Score: 7\n", 153 | "Taking action: Stick\n", 154 | "Player Score: 21 (Usable Ace: False), Dealer Score: 7\n", 155 | "Game end. Reward: 1.0\n", 156 | "\n", 157 | "Player Score: 15 (Usable Ace: False), Dealer Score: 8\n", 158 | "Taking action: Hit\n", 159 | "Player Score: 23 (Usable Ace: False), Dealer Score: 8\n", 160 | "Game end. 
Reward: -1.0\n", 161 | "\n" 162 | ] 163 | } 164 | ], 165 | "source": [ 166 | "def print_observation(observation):\n", 167 | " score, dealer_score, usable_ace = observation\n", 168 | " print(\"Player Score: {} (Usable Ace: {}), Dealer Score: {}\".format(\n", 169 | " score, usable_ace, dealer_score))\n", 170 | "\n", 171 | "def strategy(observation):\n", 172 | " score, dealer_score, usable_ace = observation\n", 173 | " # Stick (action 0) if the score is > 20, hit (action 1) otherwise\n", 174 | " return 0 if score >= 20 else 1\n", 175 | "\n", 176 | "for i_episode in range(20):\n", 177 | " observation = env.reset()\n", 178 | " for t in range(100):\n", 179 | " print_observation(observation)\n", 180 | " action = strategy(observation)\n", 181 | " print(\"Taking action: {}\".format( [\"Stick\", \"Hit\"][action]))\n", 182 | " observation, reward, done, _ = env.step(action)\n", 183 | " if done:\n", 184 | " print_observation(observation)\n", 185 | " print(\"Game end. Reward: {}\\n\".format(float(reward)))\n", 186 | " break" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [] 195 | } 196 | ], 197 | "metadata": { 198 | "kernelspec": { 199 | "display_name": "Python 3", 200 | "language": "python", 201 | "name": "python3" 202 | }, 203 | "language_info": { 204 | "codemirror_mode": { 205 | "name": "ipython", 206 | "version": 3 207 | }, 208 | "file_extension": ".py", 209 | "mimetype": "text/x-python", 210 | "name": "python", 211 | "nbconvert_exporter": "python", 212 | "pygments_lexer": "ipython3", 213 | "version": "3.6.4" 214 | } 215 | }, 216 | "nbformat": 4, 217 | "nbformat_minor": 1 218 | } 219 | -------------------------------------------------------------------------------- /DP/Policy Iteration Solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pprint\n", 11 | "import sys\n", 12 | "if \"../\" not in sys.path:\n", 13 | " sys.path.append(\"../\") \n", 14 | "from lib.envs.gridworld import GridworldEnv" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "pp = pprint.PrettyPrinter(indent=2)\n", 24 | "env = GridworldEnv()" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "# Taken from Policy Evaluation Exercise!\n", 34 | "\n", 35 | "def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):\n", 36 | " \"\"\"\n", 37 | " Evaluate a policy given an environment and a full description of the environment's dynamics.\n", 38 | " \n", 39 | " Args:\n", 40 | " policy: [S, A] shaped matrix representing the policy.\n", 41 | " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", 42 | " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", 43 | " env.nS is a number of states in the environment. 
\n", 44 | " env.nA is a number of actions in the environment.\n", 45 | " theta: We stop evaluation once our value function change is less than theta for all states.\n", 46 | " discount_factor: Gamma discount factor.\n", 47 | " \n", 48 | " Returns:\n", 49 | " Vector of length env.nS representing the value function.\n", 50 | " \"\"\"\n", 51 | " # Start with a random (all 0) value function\n", 52 | " V = np.zeros(env.nS)\n", 53 | " while True:\n", 54 | " delta = 0\n", 55 | " # For each state, perform a \"full backup\"\n", 56 | " for s in range(env.nS):\n", 57 | " v = 0\n", 58 | " # Look at the possible next actions\n", 59 | " for a, action_prob in enumerate(policy[s]):\n", 60 | " # For each action, look at the possible next states...\n", 61 | " for prob, next_state, reward, done in env.P[s][a]:\n", 62 | " # Calculate the expected value\n", 63 | " v += action_prob * prob * (reward + discount_factor * V[next_state])\n", 64 | " # How much our value function changed (across any states)\n", 65 | " delta = max(delta, np.abs(v - V[s]))\n", 66 | " V[s] = v\n", 67 | " # Stop evaluating once our value function change is below a threshold\n", 68 | " if delta < theta:\n", 69 | " break\n", 70 | " return np.array(V)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 4, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "def policy_improvement(env, policy_eval_fn=policy_eval, discount_factor=1.0):\n", 80 | " \"\"\"\n", 81 | " Policy Improvement Algorithm. Iteratively evaluates and improves a policy\n", 82 | " until an optimal policy is found.\n", 83 | " \n", 84 | " Args:\n", 85 | " env: The OpenAI environment.\n", 86 | " policy_eval_fn: Policy Evaluation function that takes 3 arguments:\n", 87 | " policy, env, discount_factor.\n", 88 | " discount_factor: gamma discount factor.\n", 89 | " \n", 90 | " Returns:\n", 91 | " A tuple (policy, V). 
\n", 92 | " policy is the optimal policy, a matrix of shape [S, A] where each state s\n", 93 | " contains a valid probability distribution over actions.\n", 94 | " V is the value function for the optimal policy.\n", 95 | " \n", 96 | " \"\"\"\n", 97 | "\n", 98 | " def one_step_lookahead(state, V):\n", 99 | " \"\"\"\n", 100 | " Helper function to calculate the value for all action in a given state.\n", 101 | " \n", 102 | " Args:\n", 103 | " state: The state to consider (int)\n", 104 | " V: The value to use as an estimator, Vector of length env.nS\n", 105 | " \n", 106 | " Returns:\n", 107 | " A vector of length env.nA containing the expected value of each action.\n", 108 | " \"\"\"\n", 109 | " A = np.zeros(env.nA)\n", 110 | " for a in range(env.nA):\n", 111 | " for prob, next_state, reward, done in env.P[state][a]:\n", 112 | " A[a] += prob * (reward + discount_factor * V[next_state])\n", 113 | " return A\n", 114 | " \n", 115 | " # Start with a random policy\n", 116 | " policy = np.ones([env.nS, env.nA]) / env.nA\n", 117 | " \n", 118 | " while True:\n", 119 | " # Evaluate the current policy\n", 120 | " V = policy_eval_fn(policy, env, discount_factor)\n", 121 | " \n", 122 | " # Will be set to false if we make any changes to the policy\n", 123 | " policy_stable = True\n", 124 | " \n", 125 | " # For each state...\n", 126 | " for s in range(env.nS):\n", 127 | " # The best action we would take under the current policy\n", 128 | " chosen_a = np.argmax(policy[s])\n", 129 | " \n", 130 | " # Find the best action by one-step lookahead\n", 131 | " # Ties are resolved arbitarily\n", 132 | " action_values = one_step_lookahead(s, V)\n", 133 | " best_a = np.argmax(action_values)\n", 134 | " \n", 135 | " # Greedily update the policy\n", 136 | " if chosen_a != best_a:\n", 137 | " policy_stable = False\n", 138 | " policy[s] = np.eye(env.nA)[best_a]\n", 139 | " \n", 140 | " # If the policy is stable we've found an optimal policy. Return it\n", 141 | " if policy_stable:\n", 142 | " return policy, V" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 5, 148 | "metadata": {}, 149 | "outputs": [ 150 | { 151 | "name": "stdout", 152 | "output_type": "stream", 153 | "text": [ 154 | "Policy Probability Distribution:\n", 155 | "[[1. 0. 0. 0.]\n", 156 | " [0. 0. 0. 1.]\n", 157 | " [0. 0. 0. 1.]\n", 158 | " [0. 0. 1. 0.]\n", 159 | " [1. 0. 0. 0.]\n", 160 | " [1. 0. 0. 0.]\n", 161 | " [1. 0. 0. 0.]\n", 162 | " [0. 0. 1. 0.]\n", 163 | " [1. 0. 0. 0.]\n", 164 | " [1. 0. 0. 0.]\n", 165 | " [0. 1. 0. 0.]\n", 166 | " [0. 0. 1. 0.]\n", 167 | " [1. 0. 0. 0.]\n", 168 | " [0. 1. 0. 0.]\n", 169 | " [0. 1. 0. 0.]\n", 170 | " [1. 0. 0. 0.]]\n", 171 | "\n", 172 | "Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):\n", 173 | "[[0 3 3 2]\n", 174 | " [0 0 0 2]\n", 175 | " [0 0 1 2]\n", 176 | " [0 1 1 0]]\n", 177 | "\n", 178 | "Value Function:\n", 179 | "[ 0. -1. -2. -3. -1. -2. -3. -2. -2. -3. -2. -1. -3. -2. -1. 0.]\n", 180 | "\n", 181 | "Reshaped Grid Value Function:\n", 182 | "[[ 0. -1. -2. -3.]\n", 183 | " [-1. -2. -3. -2.]\n", 184 | " [-2. -3. -2. -1.]\n", 185 | " [-3. -2. -1. 
0.]]\n", 186 | "\n" 187 | ] 188 | } 189 | ], 190 | "source": [ 191 | "policy, v = policy_improvement(env)\n", 192 | "print(\"Policy Probability Distribution:\")\n", 193 | "print(policy)\n", 194 | "print(\"\")\n", 195 | "\n", 196 | "print(\"Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):\")\n", 197 | "print(np.reshape(np.argmax(policy, axis=1), env.shape))\n", 198 | "print(\"\")\n", 199 | "\n", 200 | "print(\"Value Function:\")\n", 201 | "print(v)\n", 202 | "print(\"\")\n", 203 | "\n", 204 | "print(\"Reshaped Grid Value Function:\")\n", 205 | "print(v.reshape(env.shape))\n", 206 | "print(\"\")\n", 207 | "\n" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 6, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "# Test the value function\n", 217 | "expected_v = np.array([ 0, -1, -2, -3, -1, -2, -3, -2, -2, -3, -2, -1, -3, -2, -1, 0])\n", 218 | "np.testing.assert_array_almost_equal(v, expected_v, decimal=2)" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [] 227 | } 228 | ], 229 | "metadata": { 230 | "kernelspec": { 231 | "display_name": "Python 3", 232 | "language": "python", 233 | "name": "python3" 234 | }, 235 | "language_info": { 236 | "codemirror_mode": { 237 | "name": "ipython", 238 | "version": 3 239 | }, 240 | "file_extension": ".py", 241 | "mimetype": "text/x-python", 242 | "name": "python", 243 | "nbconvert_exporter": "python", 244 | "pygments_lexer": "ipython3", 245 | "version": "3.6.4" 246 | } 247 | }, 248 | "nbformat": 4, 249 | "nbformat_minor": 1 250 | } 251 | -------------------------------------------------------------------------------- /DP/Value Iteration.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pprint\n", 13 | "import sys\n", 14 | "if \"../\" not in sys.path:\n", 15 | " sys.path.append(\"../\") \n", 16 | "from lib.envs.gridworld import GridworldEnv" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 4, 22 | "metadata": { 23 | "collapsed": true 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "pp = pprint.PrettyPrinter(indent=2)\n", 28 | "env = GridworldEnv()" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 5, 34 | "metadata": { 35 | "collapsed": true 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "def value_iteration(env, theta=0.0001, discount_factor=1.0):\n", 40 | " \"\"\"\n", 41 | " Value Iteration Algorithm.\n", 42 | " \n", 43 | " Args:\n", 44 | " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", 45 | " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", 46 | " env.nS is a number of states in the environment. \n", 47 | " env.nA is a number of actions in the environment.\n", 48 | " theta: We stop evaluation once our value function change is less than theta for all states.\n", 49 | " discount_factor: Gamma discount factor.\n", 50 | " \n", 51 | " Returns:\n", 52 | " A tuple (policy, V) of the optimal policy and the optimal value function. 
\n", 53 | " \"\"\"\n", 54 | " \n", 55 | "\n", 56 | " V = np.zeros(env.nS)\n", 57 | " policy = np.zeros([env.nS, env.nA])\n", 58 | " \n", 59 | " # Implement!\n", 60 | " return policy, V" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 6, 66 | "metadata": {}, 67 | "outputs": [ 68 | { 69 | "name": "stdout", 70 | "output_type": "stream", 71 | "text": [ 72 | "Policy Probability Distribution:\n", 73 | "[[ 0. 0. 0. 0.]\n", 74 | " [ 0. 0. 0. 0.]\n", 75 | " [ 0. 0. 0. 0.]\n", 76 | " [ 0. 0. 0. 0.]\n", 77 | " [ 0. 0. 0. 0.]\n", 78 | " [ 0. 0. 0. 0.]\n", 79 | " [ 0. 0. 0. 0.]\n", 80 | " [ 0. 0. 0. 0.]\n", 81 | " [ 0. 0. 0. 0.]\n", 82 | " [ 0. 0. 0. 0.]\n", 83 | " [ 0. 0. 0. 0.]\n", 84 | " [ 0. 0. 0. 0.]\n", 85 | " [ 0. 0. 0. 0.]\n", 86 | " [ 0. 0. 0. 0.]\n", 87 | " [ 0. 0. 0. 0.]\n", 88 | " [ 0. 0. 0. 0.]]\n", 89 | "\n", 90 | "Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):\n", 91 | "[[0 0 0 0]\n", 92 | " [0 0 0 0]\n", 93 | " [0 0 0 0]\n", 94 | " [0 0 0 0]]\n", 95 | "\n", 96 | "Value Function:\n", 97 | "[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", 98 | "\n", 99 | "Reshaped Grid Value Function:\n", 100 | "[[ 0. 0. 0. 0.]\n", 101 | " [ 0. 0. 0. 0.]\n", 102 | " [ 0. 0. 0. 0.]\n", 103 | " [ 0. 0. 0. 0.]]\n", 104 | "\n" 105 | ] 106 | } 107 | ], 108 | "source": [ 109 | "policy, v = value_iteration(env)\n", 110 | "\n", 111 | "print(\"Policy Probability Distribution:\")\n", 112 | "print(policy)\n", 113 | "print(\"\")\n", 114 | "\n", 115 | "print(\"Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):\")\n", 116 | "print(np.reshape(np.argmax(policy, axis=1), env.shape))\n", 117 | "print(\"\")\n", 118 | "\n", 119 | "print(\"Value Function:\")\n", 120 | "print(v)\n", 121 | "print(\"\")\n", 122 | "\n", 123 | "print(\"Reshaped Grid Value Function:\")\n", 124 | "print(v.reshape(env.shape))\n", 125 | "print(\"\")" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 7, 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "ename": "AssertionError", 135 | "evalue": "\nArrays are not almost equal to 2 decimals\n\n(mismatch 87.5%)\n x: array([ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n 0., 0., 0.])\n y: array([ 0, -1, -2, -3, -1, -2, -3, -2, -2, -3, -2, -1, -3, -2, -1, 0])", 136 | "output_type": "error", 137 | "traceback": [ 138 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 139 | "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", 140 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Test the value function\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mexpected_v\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtesting\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0massert_array_almost_equal\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexpected_v\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdecimal\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 141 | "\u001b[0;32m/Users/dennybritz/venvs/tf/lib/python3.5/site-packages/numpy/testing/utils.py\u001b[0m in \u001b[0;36massert_array_almost_equal\u001b[0;34m(x, y, decimal, err_msg, verbose)\u001b[0m\n\u001b[1;32m 914\u001b[0m assert_array_compare(compare, x, y, err_msg=err_msg, verbose=verbose,\n\u001b[1;32m 915\u001b[0m \u001b[0mheader\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Arrays are not almost equal to %d decimals'\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mdecimal\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 916\u001b[0;31m precision=decimal)\n\u001b[0m\u001b[1;32m 917\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 918\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 142 | "\u001b[0;32m/Users/dennybritz/venvs/tf/lib/python3.5/site-packages/numpy/testing/utils.py\u001b[0m in \u001b[0;36massert_array_compare\u001b[0;34m(comparison, x, y, err_msg, verbose, header, precision)\u001b[0m\n\u001b[1;32m 735\u001b[0m names=('x', 'y'), precision=precision)\n\u001b[1;32m 736\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mcond\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 737\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mAssertionError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 738\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 739\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtraceback\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 143 | "\u001b[0;31mAssertionError\u001b[0m: \nArrays are not almost equal to 2 decimals\n\n(mismatch 87.5%)\n x: array([ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n 0., 0., 0.])\n y: array([ 0, -1, -2, -3, -1, -2, -3, -2, -2, -3, -2, -1, -3, -2, -1, 0])" 144 | ] 145 | } 146 | ], 147 | "source": [ 148 | "# Test the value function\n", 149 | "expected_v = np.array([ 0, -1, -2, -3, -1, -2, -3, -2, -2, -3, -2, -1, -3, -2, -1, 0])\n", 150 | "np.testing.assert_array_almost_equal(v, expected_v, decimal=2)" 151 | ] 152 | } 153 | ], 154 | "metadata": { 155 | "kernelspec": { 156 | "display_name": "Python 3", 157 | "language": "python", 158 | "name": "python3" 159 | }, 160 | "language_info": { 161 | "codemirror_mode": { 162 | "name": "ipython", 163 | "version": 3 164 | }, 165 | "file_extension": ".py", 166 | "mimetype": "text/x-python", 167 | "name": "python", 168 | "nbconvert_exporter": "python", 169 | "pygments_lexer": "ipython3", 170 | "version": "3.5.2" 171 | } 172 | }, 173 | "nbformat": 4, 174 | "nbformat_minor": 1 175 | } 176 | 
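A short editor's sketch (not part of the original notebooks): once the value-iteration loop above has converged, the deterministic optimal policy is read off with one more one-step lookahead per state, exactly as the solution notebook does. The snippet assumes the `env`, `V`, `np`, and `discount_factor` names from the exercise cells above.

```python
# Recover a greedy policy from a converged value function V.
policy = np.zeros([env.nS, env.nA])
for s in range(env.nS):
    # Expected value of each action under V
    q = np.zeros(env.nA)
    for a in range(env.nA):
        for prob, next_state, reward, done in env.P[s][a]:
            q[a] += prob * (reward + discount_factor * V[next_state])
    # Put all probability mass on the best action
    policy[s, np.argmax(q)] = 1.0
```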
-------------------------------------------------------------------------------- /DP/Policy Iteration.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pprint\n", 13 | "import sys\n", 14 | "if \"../\" not in sys.path:\n", 15 | " sys.path.append(\"../\") \n", 16 | "from lib.envs.gridworld import GridworldEnv" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 6, 22 | "metadata": { 23 | "collapsed": true 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "pp = pprint.PrettyPrinter(indent=2)\n", 28 | "env = GridworldEnv()" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 7, 34 | "metadata": { 35 | "collapsed": true 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "# Taken from Policy Evaluation Exercise!\n", 40 | "\n", 41 | "def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):\n", 42 | " \"\"\"\n", 43 | " Evaluate a policy given an environment and a full description of the environment's dynamics.\n", 44 | " \n", 45 | " Args:\n", 46 | " policy: [S, A] shaped matrix representing the policy.\n", 47 | " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", 48 | " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", 49 | " env.nS is a number of states in the environment. \n", 50 | " env.nA is a number of actions in the environment.\n", 51 | " theta: We stop evaluation once our value function change is less than theta for all states.\n", 52 | " discount_factor: Gamma discount factor.\n", 53 | " \n", 54 | " Returns:\n", 55 | " Vector of length env.nS representing the value function.\n", 56 | " \"\"\"\n", 57 | " # Start with a random (all 0) value function\n", 58 | " V = np.zeros(env.nS)\n", 59 | " while True:\n", 60 | " delta = 0\n", 61 | " # For each state, perform a \"full backup\"\n", 62 | " for s in range(env.nS):\n", 63 | " v = 0\n", 64 | " # Look at the possible next actions\n", 65 | " for a, action_prob in enumerate(policy[s]):\n", 66 | " # For each action, look at the possible next states...\n", 67 | " for prob, next_state, reward, done in env.P[s][a]:\n", 68 | " # Calculate the expected value\n", 69 | " v += action_prob * prob * (reward + discount_factor * V[next_state])\n", 70 | " # How much our value function changed (across any states)\n", 71 | " delta = max(delta, np.abs(v - V[s]))\n", 72 | " V[s] = v\n", 73 | " # Stop evaluating once our value function change is below a threshold\n", 74 | " if delta < theta:\n", 75 | " break\n", 76 | " return np.array(V)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 13, 82 | "metadata": { 83 | "collapsed": true 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "def policy_improvement(env, policy_eval_fn=policy_eval, discount_factor=1.0):\n", 88 | " \"\"\"\n", 89 | " Policy Improvement Algorithm. Iteratively evaluates and improves a policy\n", 90 | " until an optimal policy is found.\n", 91 | " \n", 92 | " Args:\n", 93 | " env: The OpenAI envrionment.\n", 94 | " policy_eval_fn: Policy Evaluation function that takes 3 arguments:\n", 95 | " policy, env, discount_factor.\n", 96 | " discount_factor: gamma discount factor.\n", 97 | " \n", 98 | " Returns:\n", 99 | " A tuple (policy, V). 
\n", 100 | " policy is the optimal policy, a matrix of shape [S, A] where each state s\n", 101 | " contains a valid probability distribution over actions.\n", 102 | " V is the value function for the optimal policy.\n", 103 | " \n", 104 | " \"\"\"\n", 105 | " # Start with a random policy\n", 106 | " policy = np.ones([env.nS, env.nA]) / env.nA\n", 107 | " \n", 108 | " while True:\n", 109 | " # Implement this!\n", 110 | " break\n", 111 | " \n", 112 | " return policy, np.zeros(env.nS)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 14, 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "name": "stdout", 122 | "output_type": "stream", 123 | "text": [ 124 | "Policy Probability Distribution:\n", 125 | "[[ 0.25 0.25 0.25 0.25]\n", 126 | " [ 0.25 0.25 0.25 0.25]\n", 127 | " [ 0.25 0.25 0.25 0.25]\n", 128 | " [ 0.25 0.25 0.25 0.25]\n", 129 | " [ 0.25 0.25 0.25 0.25]\n", 130 | " [ 0.25 0.25 0.25 0.25]\n", 131 | " [ 0.25 0.25 0.25 0.25]\n", 132 | " [ 0.25 0.25 0.25 0.25]\n", 133 | " [ 0.25 0.25 0.25 0.25]\n", 134 | " [ 0.25 0.25 0.25 0.25]\n", 135 | " [ 0.25 0.25 0.25 0.25]\n", 136 | " [ 0.25 0.25 0.25 0.25]\n", 137 | " [ 0.25 0.25 0.25 0.25]\n", 138 | " [ 0.25 0.25 0.25 0.25]\n", 139 | " [ 0.25 0.25 0.25 0.25]\n", 140 | " [ 0.25 0.25 0.25 0.25]]\n", 141 | "\n", 142 | "Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):\n", 143 | "[[0 0 0 0]\n", 144 | " [0 0 0 0]\n", 145 | " [0 0 0 0]\n", 146 | " [0 0 0 0]]\n", 147 | "\n", 148 | "Value Function:\n", 149 | "[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", 150 | "\n", 151 | "Reshaped Grid Value Function:\n", 152 | "[[ 0. 0. 0. 0.]\n", 153 | " [ 0. 0. 0. 0.]\n", 154 | " [ 0. 0. 0. 0.]\n", 155 | " [ 0. 0. 0. 0.]]\n", 156 | "\n" 157 | ] 158 | } 159 | ], 160 | "source": [ 161 | "policy, v = policy_improvement(env)\n", 162 | "print(\"Policy Probability Distribution:\")\n", 163 | "print(policy)\n", 164 | "print(\"\")\n", 165 | "\n", 166 | "print(\"Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):\")\n", 167 | "print(np.reshape(np.argmax(policy, axis=1), env.shape))\n", 168 | "print(\"\")\n", 169 | "\n", 170 | "print(\"Value Function:\")\n", 171 | "print(v)\n", 172 | "print(\"\")\n", 173 | "\n", 174 | "print(\"Reshaped Grid Value Function:\")\n", 175 | "print(v.reshape(env.shape))\n", 176 | "print(\"\")\n", 177 | "\n" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 15, 183 | "metadata": {}, 184 | "outputs": [ 185 | { 186 | "ename": "AssertionError", 187 | "evalue": "\nArrays are not almost equal to 2 decimals\n\n(mismatch 87.5%)\n x: array([ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n 0., 0., 0.])\n y: array([ 0, -1, -2, -3, -1, -2, -3, -2, -2, -3, -2, -1, -3, -2, -1, 0])", 188 | "output_type": "error", 189 | "traceback": [ 190 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 191 | "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", 192 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Test the value function\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mexpected_v\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0;34m-\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtesting\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0massert_array_almost_equal\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexpected_v\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdecimal\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 193 | "\u001b[0;32m/Users/dennybritz/venvs/tf/lib/python3.5/site-packages/numpy/testing/utils.py\u001b[0m in \u001b[0;36massert_array_almost_equal\u001b[0;34m(x, y, decimal, err_msg, verbose)\u001b[0m\n\u001b[1;32m 914\u001b[0m assert_array_compare(compare, x, y, err_msg=err_msg, verbose=verbose,\n\u001b[1;32m 915\u001b[0m \u001b[0mheader\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Arrays are not almost equal to %d decimals'\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mdecimal\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 916\u001b[0;31m precision=decimal)\n\u001b[0m\u001b[1;32m 917\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 918\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 194 | "\u001b[0;32m/Users/dennybritz/venvs/tf/lib/python3.5/site-packages/numpy/testing/utils.py\u001b[0m in \u001b[0;36massert_array_compare\u001b[0;34m(comparison, x, y, err_msg, verbose, header, precision)\u001b[0m\n\u001b[1;32m 735\u001b[0m names=('x', 'y'), precision=precision)\n\u001b[1;32m 736\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mcond\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 737\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mAssertionError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 738\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 739\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtraceback\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 195 | "\u001b[0;31mAssertionError\u001b[0m: \nArrays are not almost equal to 2 decimals\n\n(mismatch 87.5%)\n x: array([ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n 0., 0., 0.])\n y: array([ 0, -1, -2, -3, -1, -2, -3, -2, -2, -3, -2, -1, -3, -2, -1, 0])" 196 | ] 197 | } 198 | ], 199 | "source": [ 200 | "# Test the value function\n", 201 | "expected_v = np.array([ 0, -1, -2, -3, -1, -2, -3, -2, -2, -3, -2, -1, -3, -2, -1, 0])\n", 202 | "np.testing.assert_array_almost_equal(v, expected_v, decimal=2)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 
null, 208 | "metadata": { 209 | "collapsed": true 210 | }, 211 | "outputs": [], 212 | "source": [] 213 | } 214 | ], 215 | "metadata": { 216 | "kernelspec": { 217 | "display_name": "Python 3", 218 | "language": "python", 219 | "name": "python3" 220 | }, 221 | "language_info": { 222 | "codemirror_mode": { 223 | "name": "ipython", 224 | "version": 3 225 | }, 226 | "file_extension": ".py", 227 | "mimetype": "text/x-python", 228 | "name": "python", 229 | "nbconvert_exporter": "python", 230 | "pygments_lexer": "ipython3", 231 | "version": "3.5.2" 232 | } 233 | }, 234 | "nbformat": 4, 235 | "nbformat_minor": 1 236 | } 237 | -------------------------------------------------------------------------------- /PolicyGradient/Continuous MountainCar Actor Critic Solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline\n", 12 | "\n", 13 | "import gym\n", 14 | "import itertools\n", 15 | "import matplotlib\n", 16 | "import numpy as np\n", 17 | "import sys\n", 18 | "import tensorflow as tf\n", 19 | "import collections\n", 20 | "\n", 21 | "import sklearn.pipeline\n", 22 | "import sklearn.preprocessing\n", 23 | "\n", 24 | "if \"../\" not in sys.path:\n", 25 | " sys.path.append(\"../\") \n", 26 | "from lib.envs.cliff_walking import CliffWalkingEnv\n", 27 | "from lib import plotting\n", 28 | "\n", 29 | "from sklearn.kernel_approximation import RBFSampler\n", 30 | "\n", 31 | "matplotlib.style.use('ggplot')" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stderr", 41 | "output_type": "stream", 42 | "text": [ 43 | "[2017-06-16 13:11:05,265] Making new env: MountainCarContinuous-v0\n" 44 | ] 45 | }, 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "array([-0.21213569, 0.03012651])" 50 | ] 51 | }, 52 | "execution_count": 3, 53 | "metadata": {}, 54 | "output_type": "execute_result" 55 | } 56 | ], 57 | "source": [ 58 | "env = gym.envs.make(\"MountainCarContinuous-v0\")\n", 59 | "env.observation_space.sample()" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 4, 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "data": { 69 | "text/plain": [ 70 | "FeatureUnion(n_jobs=1,\n", 71 | " transformer_list=[('rbf1', RBFSampler(gamma=5.0, n_components=100, random_state=None)), ('rbf2', RBFSampler(gamma=2.0, n_components=100, random_state=None)), ('rbf3', RBFSampler(gamma=1.0, n_components=100, random_state=None)), ('rbf4', RBFSampler(gamma=0.5, n_components=100, random_state=None))],\n", 72 | " transformer_weights=None)" 73 | ] 74 | }, 75 | "execution_count": 4, 76 | "metadata": {}, 77 | "output_type": "execute_result" 78 | } 79 | ], 80 | "source": [ 81 | "# Feature Preprocessing: Normalize to zero mean and unit variance\n", 82 | "# We use a few samples from the observation space to do this\n", 83 | "observation_examples = np.array([env.observation_space.sample() for x in range(10000)])\n", 84 | "scaler = sklearn.preprocessing.StandardScaler()\n", 85 | "scaler.fit(observation_examples)\n", 86 | "\n", 87 | "# Used to converte a state to a featurizes represenation.\n", 88 | "# We use RBF kernels with different variances to cover different parts of the space\n", 89 | "featurizer = sklearn.pipeline.FeatureUnion([\n", 90 | " (\"rbf1\", RBFSampler(gamma=5.0, n_components=100)),\n", 91 | " (\"rbf2\", 
RBFSampler(gamma=2.0, n_components=100)),\n", 92 | " (\"rbf3\", RBFSampler(gamma=1.0, n_components=100)),\n", 93 | " (\"rbf4\", RBFSampler(gamma=0.5, n_components=100))\n", 94 | " ])\n", 95 | "featurizer.fit(scaler.transform(observation_examples))" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 5, 101 | "metadata": { 102 | "collapsed": true 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "def featurize_state(state):\n", 107 | " \"\"\"\n", 108 | " Returns the featurized representation for a state.\n", 109 | " \"\"\"\n", 110 | " scaled = scaler.transform([state])\n", 111 | " featurized = featurizer.transform(scaled)\n", 112 | " return featurized[0]" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 6, 118 | "metadata": { 119 | "collapsed": true 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "class PolicyEstimator():\n", 124 | " \"\"\"\n", 125 | " Policy Function approximator. \n", 126 | " \"\"\"\n", 127 | " \n", 128 | " def __init__(self, learning_rate=0.01, scope=\"policy_estimator\"):\n", 129 | " with tf.variable_scope(scope):\n", 130 | " self.state = tf.placeholder(tf.float32, [400], \"state\")\n", 131 | " self.target = tf.placeholder(dtype=tf.float32, name=\"target\")\n", 132 | "\n", 133 | " # This is just a linear function approximator\n", 134 | " self.mu = tf.contrib.layers.fully_connected(\n", 135 | " inputs=tf.expand_dims(self.state, 0),\n", 136 | " num_outputs=1,\n", 137 | " activation_fn=None,\n", 138 | " weights_initializer=tf.zeros_initializer)\n", 139 | " self.mu = tf.squeeze(self.mu)\n", 140 | " \n", 141 | " self.sigma = tf.contrib.layers.fully_connected(\n", 142 | " inputs=tf.expand_dims(self.state, 0),\n", 143 | " num_outputs=1,\n", 144 | " activation_fn=None,\n", 145 | " weights_initializer=tf.zeros_initializer)\n", 146 | " \n", 147 | " self.sigma = tf.squeeze(self.sigma)\n", 148 | " self.sigma = tf.nn.softplus(self.sigma) + 1e-5\n", 149 | " self.normal_dist = tf.contrib.distributions.Normal(self.mu, self.sigma)\n", 150 | " self.action = self.normal_dist._sample_n(1)\n", 151 | " self.action = tf.clip_by_value(self.action, env.action_space.low[0], env.action_space.high[0])\n", 152 | "\n", 153 | " # Loss and train op\n", 154 | " self.loss = -self.normal_dist.log_prob(self.action) * self.target\n", 155 | " # Subtract an entropy bonus from the loss to encourage exploration\n", 156 | " self.loss -= 1e-1 * self.normal_dist.entropy()\n", 157 | " \n", 158 | " self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)\n", 159 | " self.train_op = self.optimizer.minimize(\n", 160 | " self.loss, global_step=tf.contrib.framework.get_global_step())\n", 161 | " \n", 162 | " def predict(self, state, sess=None):\n", 163 | " sess = sess or tf.get_default_session()\n", 164 | " state = featurize_state(state)\n", 165 | " return sess.run(self.action, { self.state: state })\n", 166 | "\n", 167 | " def update(self, state, target, action, sess=None):\n", 168 | " sess = sess or tf.get_default_session()\n", 169 | " state = featurize_state(state)\n", 170 | " feed_dict = { self.state: state, self.target: target, self.action: action }\n", 171 | " _, loss = sess.run([self.train_op, self.loss], feed_dict)\n", 172 | " return loss" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 7, 178 | "metadata": { 179 | "collapsed": true 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "class ValueEstimator():\n", 184 | " \"\"\"\n", 185 | " Value Function approximator. 
\n", 186 | " \"\"\"\n", 187 | " \n", 188 | " def __init__(self, learning_rate=0.1, scope=\"value_estimator\"):\n", 189 | " with tf.variable_scope(scope):\n", 190 | " self.state = tf.placeholder(tf.float32, [400], \"state\")\n", 191 | " self.target = tf.placeholder(dtype=tf.float32, name=\"target\")\n", 192 | "\n", 193 | " # This is just linear classifier\n", 194 | " self.output_layer = tf.contrib.layers.fully_connected(\n", 195 | " inputs=tf.expand_dims(self.state, 0),\n", 196 | " num_outputs=1,\n", 197 | " activation_fn=None,\n", 198 | " weights_initializer=tf.zeros_initializer)\n", 199 | "\n", 200 | " self.value_estimate = tf.squeeze(self.output_layer)\n", 201 | " self.loss = tf.squared_difference(self.value_estimate, self.target)\n", 202 | "\n", 203 | " self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)\n", 204 | " self.train_op = self.optimizer.minimize(\n", 205 | " self.loss, global_step=tf.contrib.framework.get_global_step()) \n", 206 | " \n", 207 | " def predict(self, state, sess=None):\n", 208 | " sess = sess or tf.get_default_session()\n", 209 | " state = featurize_state(state)\n", 210 | " return sess.run(self.value_estimate, { self.state: state })\n", 211 | "\n", 212 | " def update(self, state, target, sess=None):\n", 213 | " sess = sess or tf.get_default_session()\n", 214 | " state = featurize_state(state)\n", 215 | " feed_dict = { self.state: state, self.target: target }\n", 216 | " _, loss = sess.run([self.train_op, self.loss], feed_dict)\n", 217 | " return loss" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 15, 223 | "metadata": { 224 | "collapsed": true 225 | }, 226 | "outputs": [], 227 | "source": [ 228 | "def actor_critic(env, estimator_policy, estimator_value, num_episodes, discount_factor=1.0):\n", 229 | " \"\"\"\n", 230 | " Actor Critic Algorithm. 
Optimizes the policy \n", 231 | " function approximator using policy gradient.\n", 232 | " \n", 233 | " Args:\n", 234 | " env: OpenAI environment.\n", 235 | " estimator_policy: Policy Function to be optimized \n", 236 | " estimator_value: Value function approximator, used as a critic\n", 237 | " num_episodes: Number of episodes to run for\n", 238 | " discount_factor: Time-discount factor\n", 239 | " \n", 240 | " Returns:\n", 241 | " An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.\n", 242 | " \"\"\"\n", 243 | "\n", 244 | " # Keeps track of useful statistics\n", 245 | " stats = plotting.EpisodeStats(\n", 246 | " episode_lengths=np.zeros(num_episodes),\n", 247 | " episode_rewards=np.zeros(num_episodes)) \n", 248 | " \n", 249 | " Transition = collections.namedtuple(\"Transition\", [\"state\", \"action\", \"reward\", \"next_state\", \"done\"])\n", 250 | " \n", 251 | " for i_episode in range(num_episodes):\n", 252 | " # Reset the environment and pick the first action\n", 253 | " state = env.reset()\n", 254 | " \n", 255 | " episode = []\n", 256 | " \n", 257 | " # One step in the environment\n", 258 | " for t in itertools.count():\n", 259 | " \n", 260 | " # env.render()\n", 261 | " \n", 262 | " # Take a step\n", 263 | " action = estimator_policy.predict(state)\n", 264 | " next_state, reward, done, _ = env.step(action)\n", 265 | " \n", 266 | " # Keep track of the transition\n", 267 | " episode.append(Transition(\n", 268 | " state=state, action=action, reward=reward, next_state=next_state, done=done))\n", 269 | " \n", 270 | " # Update statistics\n", 271 | " stats.episode_rewards[i_episode] += reward\n", 272 | " stats.episode_lengths[i_episode] = t\n", 273 | " \n", 274 | " # Calculate TD Target\n", 275 | " value_next = estimator_value.predict(next_state)\n", 276 | " td_target = reward + discount_factor * value_next\n", 277 | " td_error = td_target - estimator_value.predict(state)\n", 278 | " \n", 279 | " # Update the value estimator\n", 280 | " estimator_value.update(state, td_target)\n", 281 | " \n", 282 | " # Update the policy estimator\n", 283 | " # using the td error as our advantage estimate\n", 284 | " estimator_policy.update(state, td_error, action)\n", 285 | " \n", 286 | " # Print out which step we're on, useful for debugging.\n", 287 | " print(\"\\rStep {} @ Episode {}/{} ({})\".format(\n", 288 | " t, i_episode + 1, num_episodes, stats.episode_rewards[i_episode - 1]), end=\"\")\n", 289 | "\n", 290 | " if done:\n", 291 | " break\n", 292 | " \n", 293 | " state = next_state\n", 294 | " \n", 295 | " return stats" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 19, 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "name": "stdout", 305 | "output_type": "stream", 306 | "text": [ 307 | "WARNING:tensorflow:From /Users/dennybritz/venv/py3/lib/python3.6/site-packages/tensorflow/python/util/tf_should_use.py:170: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.\n", 308 | "Instructions for updating:\n", 309 | "Use `tf.global_variables_initializer` instead.\n" 310 | ] 311 | }, 312 | { 313 | "name": "stderr", 314 | "output_type": "stream", 315 | "text": [ 316 | "[2017-06-16 13:31:05,772] From /Users/dennybritz/venv/py3/lib/python3.6/site-packages/tensorflow/python/util/tf_should_use.py:170: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.\n", 317 | "Instructions for updating:\n", 318 | "Use 
`tf.global_variables_initializer` instead.\n" 319 | ] 320 | }, 321 | { 322 | "name": "stdout", 323 | "output_type": "stream", 324 | "text": [ 325 | "Step 662 @ Episode 50/50 (65.13252566564918))" 326 | ] 327 | } 328 | ], 329 | "source": [ 330 | "tf.reset_default_graph()\n", 331 | "\n", 332 | "global_step = tf.Variable(0, name=\"global_step\", trainable=False)\n", 333 | "policy_estimator = PolicyEstimator(learning_rate=0.001)\n", 334 | "value_estimator = ValueEstimator(learning_rate=0.1)\n", 335 | "\n", 336 | "with tf.Session() as sess:\n", 337 | " sess.run(tf.initialize_all_variables())\n", 338 | " # Note, due to randomness in the policy the number of episodes you need varies\n", 339 | " # TODO: Sometimes the algorithm gets stuck, I'm not sure what exactly is happening there.\n", 340 | " stats = actor_critic(env, policy_estimator, value_estimator, 50, discount_factor=0.95)" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "metadata": { 347 | "collapsed": true 348 | }, 349 | "outputs": [], 350 | "source": [ 351 | "plotting.plot_episode_stats(stats, smoothing_window=10)" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": { 358 | "collapsed": true 359 | }, 360 | "outputs": [], 361 | "source": [] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "metadata": { 367 | "collapsed": true 368 | }, 369 | "outputs": [], 370 | "source": [] 371 | } 372 | ], 373 | "metadata": { 374 | "kernelspec": { 375 | "display_name": "Python 3", 376 | "language": "python", 377 | "name": "python3" 378 | }, 379 | "language_info": { 380 | "codemirror_mode": { 381 | "name": "ipython", 382 | "version": 3 383 | }, 384 | "file_extension": ".py", 385 | "mimetype": "text/x-python", 386 | "name": "python", 387 | "nbconvert_exporter": "python", 388 | "pygments_lexer": "ipython3", 389 | "version": "3.5.2" 390 | } 391 | }, 392 | "nbformat": 4, 393 | "nbformat_minor": 1 394 | } 395 | -------------------------------------------------------------------------------- /DQN/dqn.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym.wrappers import Monitor 3 | import itertools 4 | import numpy as np 5 | import os 6 | import random 7 | import sys 8 | import tensorflow as tf 9 | 10 | if "../" not in sys.path: 11 | sys.path.append("../") 12 | 13 | from lib import plotting 14 | from collections import deque, namedtuple 15 | 16 | env = gym.envs.make("Breakout-v0") 17 | 18 | # Atari Actions: 0 (noop), 1 (fire), 2 (left) and 3 (right) are valid actions 19 | VALID_ACTIONS = [0, 1, 2, 3] 20 | 21 | class StateProcessor(): 22 | """ 23 | Processes a raw Atari image. Resizes it and converts it to grayscale. 24 | """ 25 | def __init__(self): 26 | # Build the Tensorflow graph 27 | with tf.variable_scope("state_processor"): 28 | self.input_state = tf.placeholder(shape=[210, 160, 3], dtype=tf.uint8) 29 | self.output = tf.image.rgb_to_grayscale(self.input_state) 30 | self.output = tf.image.crop_to_bounding_box(self.output, 34, 0, 160, 160) 31 | self.output = tf.image.resize_images( 32 | self.output, [84, 84], method=tf.image.ResizeMethod.NEAREST_NEIGHBOR) 33 | self.output = tf.squeeze(self.output) 34 | 35 | def process(self, sess, state): 36 | """ 37 | Args: 38 | sess: A Tensorflow session object 39 | state: A [210, 160, 3] Atari RGB State 40 | 41 | Returns: 42 | A processed [84, 84] state representing grayscale values. 
43 | """ 44 | return sess.run(self.output, { self.input_state: state }) 45 | 46 | class Estimator(): 47 | """Q-Value Estimator neural network. 48 | 49 | This network is used for both the Q-Network and the Target Network. 50 | """ 51 | 52 | def __init__(self, scope="estimator", summaries_dir=None): 53 | self.scope = scope 54 | # Writes Tensorboard summaries to disk 55 | self.summary_writer = None 56 | with tf.variable_scope(scope): 57 | # Build the graph 58 | self._build_model() 59 | if summaries_dir: 60 | summary_dir = os.path.join(summaries_dir, "summaries_{}".format(scope)) 61 | if not os.path.exists(summary_dir): 62 | os.makedirs(summary_dir) 63 | self.summary_writer = tf.summary.FileWriter(summary_dir) 64 | 65 | def _build_model(self): 66 | """ 67 | Builds the Tensorflow graph. 68 | """ 69 | 70 | # Placeholders for our input 71 | # Our input is a stack of 4 grayscale frames of shape 84, 84 each 72 | self.X_pl = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name="X") 73 | # The TD target value 74 | self.y_pl = tf.placeholder(shape=[None], dtype=tf.float32, name="y") 75 | # Integer id of which action was selected 76 | self.actions_pl = tf.placeholder(shape=[None], dtype=tf.int32, name="actions") 77 | 78 | X = tf.to_float(self.X_pl) / 255.0 79 | batch_size = tf.shape(self.X_pl)[0] 80 | 81 | # Three convolutional layers 82 | conv1 = tf.contrib.layers.conv2d( 83 | X, 32, 8, 4, activation_fn=tf.nn.relu) 84 | conv2 = tf.contrib.layers.conv2d( 85 | conv1, 64, 4, 2, activation_fn=tf.nn.relu) 86 | conv3 = tf.contrib.layers.conv2d( 87 | conv2, 64, 3, 1, activation_fn=tf.nn.relu) 88 | 89 | # Fully connected layers 90 | flattened = tf.contrib.layers.flatten(conv3) 91 | fc1 = tf.contrib.layers.fully_connected(flattened, 512) 92 | self.predictions = tf.contrib.layers.fully_connected(fc1, len(VALID_ACTIONS)) 93 | 94 | # Get the predictions for the chosen actions only 95 | gather_indices = tf.range(batch_size) * tf.shape(self.predictions)[1] + self.actions_pl 96 | self.action_predictions = tf.gather(tf.reshape(self.predictions, [-1]), gather_indices) 97 | 98 | # Calculate the loss 99 | self.losses = tf.squared_difference(self.y_pl, self.action_predictions) 100 | self.loss = tf.reduce_mean(self.losses) 101 | 102 | # Optimizer Parameters from original paper 103 | self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6) 104 | self.train_op = self.optimizer.minimize(self.loss, global_step=tf.contrib.framework.get_global_step()) 105 | 106 | # Summaries for Tensorboard 107 | self.summaries = tf.summary.merge([ 108 | tf.summary.scalar("loss", self.loss), 109 | tf.summary.histogram("loss_hist", self.losses), 110 | tf.summary.histogram("q_values_hist", self.predictions), 111 | tf.summary.scalar("max_q_value", tf.reduce_max(self.predictions)) 112 | ]) 113 | 114 | 115 | def predict(self, sess, s): 116 | """ 117 | Predicts action values. 118 | 119 | Args: 120 | sess: Tensorflow session 121 | s: State input of shape [batch_size, 84, 84, 4] 122 | 123 | Returns: 124 | Tensor of shape [batch_size, NUM_VALID_ACTIONS] containing the estimated 125 | action values. 126 | """ 127 | return sess.run(self.predictions, { self.X_pl: s }) 128 | 129 | def update(self, sess, s, a, y): 130 | """ 131 | Updates the estimator towards the given targets. 132 | 133 | Args: 134 | sess: Tensorflow session object 135 | s: State input of shape [batch_size, 84, 84, 4] 136 | a: Chosen actions of shape [batch_size] 137 | y: Targets of shape [batch_size] 138 | 139 | Returns: 140 | The calculated loss on the batch. 
141 | """ 142 | feed_dict = { self.X_pl: s, self.y_pl: y, self.actions_pl: a } 143 | summaries, global_step, _, loss = sess.run( 144 | [self.summaries, tf.contrib.framework.get_global_step(), self.train_op, self.loss], 145 | feed_dict) 146 | if self.summary_writer: 147 | self.summary_writer.add_summary(summaries, global_step) 148 | return loss 149 | 150 | def copy_model_parameters(sess, estimator1, estimator2): 151 | """ 152 | Copies the model parameters of one estimator to another. 153 | 154 | Args: 155 | sess: Tensorflow session instance 156 | estimator1: Estimator to copy the parameters from 157 | estimator2: Estimator to copy the parameters to 158 | """ 159 | e1_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator1.scope)] 160 | e1_params = sorted(e1_params, key=lambda v: v.name) 161 | e2_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator2.scope)] 162 | e2_params = sorted(e2_params, key=lambda v: v.name) 163 | 164 | update_ops = [] 165 | for e1_v, e2_v in zip(e1_params, e2_params): 166 | op = e2_v.assign(e1_v) 167 | update_ops.append(op) 168 | 169 | sess.run(update_ops) 170 | 171 | 172 | def make_epsilon_greedy_policy(estimator, nA): 173 | """ 174 | Creates an epsilon-greedy policy based on a given Q-function approximator and epsilon. 175 | 176 | Args: 177 | estimator: An estimator that returns q values for a given state 178 | nA: Number of actions in the environment. 179 | 180 | Returns: 181 | A function that takes the (sess, observation, epsilon) as an argument and returns 182 | the probabilities for each action in the form of a numpy array of length nA. 183 | 184 | """ 185 | def policy_fn(sess, observation, epsilon): 186 | A = np.ones(nA, dtype=float) * epsilon / nA 187 | q_values = estimator.predict(sess, np.expand_dims(observation, 0))[0] 188 | best_action = np.argmax(q_values) 189 | A[best_action] += (1.0 - epsilon) 190 | return A 191 | return policy_fn 192 | 193 | 194 | def deep_q_learning(sess, 195 | env, 196 | q_estimator, 197 | target_estimator, 198 | state_processor, 199 | num_episodes, 200 | experiment_dir, 201 | replay_memory_size=500000, 202 | replay_memory_init_size=50000, 203 | update_target_estimator_every=10000, 204 | discount_factor=0.99, 205 | epsilon_start=1.0, 206 | epsilon_end=0.1, 207 | epsilon_decay_steps=500000, 208 | batch_size=32, 209 | record_video_every=50): 210 | """ 211 | Q-Learning algorithm for off-policy TD control using Function Approximation. 212 | Finds the optimal greedy policy while following an epsilon-greedy policy. 213 | 214 | Args: 215 | sess: Tensorflow Session object 216 | env: OpenAI environment 217 | q_estimator: Estimator object used for the q values 218 | target_estimator: Estimator object used for the targets 219 | state_processor: A StateProcessor object 220 | num_episodes: Number of episodes to run for 221 | experiment_dir: Directory to save Tensorflow summaries in 222 | replay_memory_size: Size of the replay memory 223 | replay_memory_init_size: Number of random experiences to sample when initializing 224 | the replay memory. 225 | update_target_estimator_every: Copy parameters from the Q estimator to the 226 | target estimator every N steps 227 | discount_factor: Gamma discount factor 228 | epsilon_start: Chance to sample a random action when taking an action. 
229 | Epsilon is decayed over time and this is the start value 230 | epsilon_end: The final minimum value of epsilon after decaying is done 231 | epsilon_decay_steps: Number of steps to decay epsilon over 232 | batch_size: Size of batches to sample from the replay memory 233 | record_video_every: Record a video every N episodes 234 | 235 | Returns: 236 | An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards. 237 | """ 238 | 239 | Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"]) 240 | 241 | # The replay memory 242 | replay_memory = [] 243 | 244 | # Keeps track of useful statistics 245 | stats = plotting.EpisodeStats( 246 | episode_lengths=np.zeros(num_episodes), 247 | episode_rewards=np.zeros(num_episodes)) 248 | 249 | # Create directories for checkpoints and summaries 250 | checkpoint_dir = os.path.join(experiment_dir, "checkpoints") 251 | checkpoint_path = os.path.join(checkpoint_dir, "model") 252 | monitor_path = os.path.join(experiment_dir, "monitor") 253 | 254 | if not os.path.exists(checkpoint_dir): 255 | os.makedirs(checkpoint_dir) 256 | if not os.path.exists(monitor_path): 257 | os.makedirs(monitor_path) 258 | 259 | saver = tf.train.Saver() 260 | # Load a previous checkpoint if we find one 261 | latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir) 262 | if latest_checkpoint: 263 | print("Loading model checkpoint {}...\n".format(latest_checkpoint)) 264 | saver.restore(sess, latest_checkpoint) 265 | 266 | total_t = sess.run(tf.contrib.framework.get_global_step()) 267 | 268 | # The epsilon decay schedule 269 | epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps) 270 | 271 | # The policy we're following 272 | policy = make_epsilon_greedy_policy( 273 | q_estimator, 274 | len(VALID_ACTIONS)) 275 | 276 | # Populate the replay memory with initial experience 277 | print("Populating replay memory...") 278 | state = env.reset() 279 | state = state_processor.process(sess, state) 280 | state = np.stack([state] * 4, axis=2) 281 | for i in range(replay_memory_init_size): 282 | action_probs = policy(sess, state, epsilons[min(total_t, epsilon_decay_steps-1)]) 283 | action = np.random.choice(np.arange(len(action_probs)), p=action_probs) 284 | next_state, reward, done, _ = env.step(VALID_ACTIONS[action]) 285 | next_state = state_processor.process(sess, next_state) 286 | next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2) 287 | replay_memory.append(Transition(state, action, reward, next_state, done)) 288 | if done: 289 | state = env.reset() 290 | state = state_processor.process(sess, state) 291 | state = np.stack([state] * 4, axis=2) 292 | else: 293 | state = next_state 294 | 295 | # Record videos 296 | # Use the gym env Monitor wrapper 297 | env = Monitor(env, 298 | directory=monitor_path, 299 | resume=True, 300 | video_callable=lambda count: count % record_video_every ==0) 301 | 302 | for i_episode in range(num_episodes): 303 | 304 | # Save the current checkpoint 305 | saver.save(tf.get_default_session(), checkpoint_path) 306 | 307 | # Reset the environment 308 | state = env.reset() 309 | state = state_processor.process(sess, state) 310 | state = np.stack([state] * 4, axis=2) 311 | loss = None 312 | 313 | # One step in the environment 314 | for t in itertools.count(): 315 | 316 | # Epsilon for this time step 317 | epsilon = epsilons[min(total_t, epsilon_decay_steps-1)] 318 | 319 | # Add epsilon to Tensorboard 320 | episode_summary = tf.Summary() 321 | 
episode_summary.value.add(simple_value=epsilon, tag="epsilon") 322 | q_estimator.summary_writer.add_summary(episode_summary, total_t) 323 | 324 | # Maybe update the target estimator 325 | if total_t % update_target_estimator_every == 0: 326 | copy_model_parameters(sess, q_estimator, target_estimator) 327 | print("\nCopied model parameters to target network.") 328 | 329 | # Print out which step we're on, useful for debugging. 330 | print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format( 331 | t, total_t, i_episode + 1, num_episodes, loss), end="") 332 | sys.stdout.flush() 333 | 334 | # Take a step 335 | action_probs = policy(sess, state, epsilon) 336 | action = np.random.choice(np.arange(len(action_probs)), p=action_probs) 337 | next_state, reward, done, _ = env.step(VALID_ACTIONS[action]) 338 | next_state = state_processor.process(sess, next_state) 339 | next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2) 340 | 341 | # If our replay memory is full, pop the first element 342 | if len(replay_memory) == replay_memory_size: 343 | replay_memory.pop(0) 344 | 345 | # Save transition to replay memory 346 | replay_memory.append(Transition(state, action, reward, next_state, done)) 347 | 348 | # Update statistics 349 | stats.episode_rewards[i_episode] += reward 350 | stats.episode_lengths[i_episode] = t 351 | 352 | # Sample a minibatch from the replay memory 353 | samples = random.sample(replay_memory, batch_size) 354 | states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(np.array, zip(*samples)) 355 | 356 | # Calculate q values and targets (Double DQN) 357 | q_values_next = q_estimator.predict(sess, next_states_batch) 358 | best_actions = np.argmax(q_values_next, axis=1) 359 | q_values_next_target = target_estimator.predict(sess, next_states_batch) 360 | targets_batch = reward_batch + np.invert(done_batch).astype(np.float32) * \ 361 | discount_factor * q_values_next_target[np.arange(batch_size), best_actions] 362 | 363 | # Perform gradient descent update 364 | states_batch = np.array(states_batch) 365 | loss = q_estimator.update(sess, states_batch, action_batch, targets_batch) 366 | 367 | if done: 368 | break 369 | 370 | state = next_state 371 | total_t += 1 372 | 373 | # Add summaries to tensorboard 374 | episode_summary = tf.Summary() 375 | episode_summary.value.add(simple_value=stats.episode_rewards[i_episode], node_name="episode_reward", tag="episode_reward") 376 | episode_summary.value.add(simple_value=stats.episode_lengths[i_episode], node_name="episode_length", tag="episode_length") 377 | q_estimator.summary_writer.add_summary(episode_summary, total_t) 378 | q_estimator.summary_writer.flush() 379 | 380 | yield total_t, plotting.EpisodeStats( 381 | episode_lengths=stats.episode_lengths[:i_episode+1], 382 | episode_rewards=stats.episode_rewards[:i_episode+1]) 383 | 384 | env.monitor.close() 385 | return stats 386 | 387 | 388 | tf.reset_default_graph() 389 | 390 | # Where we save our checkpoints and graphs 391 | experiment_dir = os.path.abspath("./experiments/{}".format(env.spec.id)) 392 | 393 | # Create a global step variable 394 | global_step = tf.Variable(0, name='global_step', trainable=False) 395 | 396 | # Create estimators 397 | q_estimator = Estimator(scope="q", summaries_dir=experiment_dir) 398 | target_estimator = Estimator(scope="target_q") 399 | 400 | # State processor 401 | state_processor = StateProcessor() 402 | 403 | with tf.Session() as sess: 404 | sess.run(tf.global_variables_initializer()) 405 | for t, stats in 
deep_q_learning(sess, 406 | env, 407 | q_estimator=q_estimator, 408 | target_estimator=target_estimator, 409 | state_processor=state_processor, 410 | experiment_dir=experiment_dir, 411 | num_episodes=10000, 412 | replay_memory_size=500000, 413 | replay_memory_init_size=50000, 414 | update_target_estimator_every=10000, 415 | epsilon_start=1.0, 416 | epsilon_end=0.1, 417 | epsilon_decay_steps=500000, 418 | discount_factor=0.99, 419 | batch_size=32): 420 | 421 | print("\nEpisode Reward: {}".format(stats.episode_rewards[-1])) 422 | 423 | --------------------------------------------------------------------------------
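Editor's note: the target computation in `dqn.py` above follows the Double DQN rule: the online network (`q_estimator`) selects the greedy action for each next state, the lagged target network (`target_estimator`) evaluates that action, and terminal transitions drop the bootstrap term. The short NumPy sketch below reproduces just that calculation in isolation; the batch values are made-up illustrative numbers, not output of the repository code.

```
import numpy as np

# Hypothetical mini-batch of 3 transitions with 4 actions (illustrative values only).
reward_batch = np.array([1.0, 0.0, -1.0], dtype=np.float32)
done_batch = np.array([False, False, True])
discount_factor = 0.99

# Q-values for the next states: one matrix from the online network,
# one from the (lagged) target network.
q_values_next = np.array([[0.1, 0.5, 0.2, 0.0],
                          [0.3, 0.1, 0.4, 0.2],
                          [0.0, 0.0, 0.0, 0.0]], dtype=np.float32)
q_values_next_target = np.array([[0.2, 0.6, 0.1, 0.0],
                                 [0.2, 0.2, 0.5, 0.1],
                                 [0.1, 0.1, 0.1, 0.1]], dtype=np.float32)

# Double DQN: the online network picks the action, the target network evaluates it.
best_actions = np.argmax(q_values_next, axis=1)
# Zero out the bootstrap term for terminal transitions.
not_done = np.invert(done_batch).astype(np.float32)
targets_batch = reward_batch + not_done * discount_factor * \
    q_values_next_target[np.arange(len(reward_batch)), best_actions]

print(targets_batch)  # approximately [1.594, 0.495, -1.0]
```

Plain DQN would instead bootstrap from `np.max(q_values_next_target, axis=1)`; decoupling action selection (online network) from action evaluation (target network) is what reduces the over-estimation bias.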